In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier


In [None]:
# Read the processed exploratory-analysis workbook into the working frame.
# The location is relative, so it is resolved against the kernel's current
# working directory.
dataset_path = '../data/processed/exploratory_analysis_final.xlsx'
dataset_sheet = 'Sheet1'
df = pd.read_excel(dataset_path, sheet_name=dataset_sheet)


FileNotFoundError: [Errno 2] No such file or directory: 'exploratory_analysis_final.xlsx'

In [None]:
# Name the shared input columns once so each derived feature reads as a formula.
gpa = df['undergrad_gpa']
gre_avg = df['gre_avg']
acceptance = df['acceptance_rate']

# Pairwise interaction terms: GPA crossed with GRE average and with the
# programme's acceptance rate.
df['gpa_gre_interaction'] = gpa * gre_avg
df['gpa_acceptance_interaction'] = gpa * acceptance

# Composite profile strength: GRE average times GPA percentile, weighted by
# (1 - acceptance rate).
# NOTE(review): the (1 - rate) term presumably expects acceptance_rate as a
# fraction in [0, 1], not a percentage — confirm against the data.
df['profile_strength'] = gre_avg * df['gpa_percentile'] * (1 - acceptance)


In [None]:

# Rank bucketing
def rank_bucket(score):
    """Map a QS rank score to a coarse ranking-tier label.

    NOTE(review): this assumes ``qs_rank_score`` stores the negated QS rank
    (rank 7 -> -7), which is what the negative cut-offs suggest — confirm
    against the step that builds that column.

    The cut-offs are checked from the best tier downwards with ``>=``. The
    earlier version compared with ``<=`` starting at -10, so any score at or
    below -10 was labelled 'Top 10' and the 'Top 50' / 'Top 100' branches
    could never be reached.

    Parameters
    ----------
    score : float
        Rank score for a single row; may be missing (NaN / None).

    Returns
    -------
    str
        'Unknown' when the score is missing, otherwise one of
        'Top 10', 'Top 50', 'Top 100' or 'Other'.
    """
    if pd.isna(score):
        return 'Unknown'
    if score >= -10:
        return 'Top 10'
    if score >= -50:
        return 'Top 50'
    if score >= -100:
        return 'Top 100'
    return 'Other'


In [None]:
# Label every row with its ranking tier (string labels from rank_bucket).
df['qs_rank_bucket'] = df['qs_rank_score'].apply(rank_bucket)

# Columns that are not created in this notebook's feature-engineering cells.
source_features = [
    'undergrad_gpa',
    'gre_quantitative_reasoning',
    'gre_verbal_reasoning',
    'analytical_writing',
    'acceptance_rate',
    'qs_rank_score',
    'qs_tier',
    'gpa_percentile',
    'gre_avg',
    'gpa_x_acceptancerate',
]

# Columns derived in the cells above (interactions, composite, rank bucket).
engineered_features = [
    'gpa_gre_interaction',
    'gpa_acceptance_interaction',
    'profile_strength',
    'qs_rank_bucket',
]

# Full model input list, in the same order as before.
features = source_features + engineered_features


In [None]:
# Create binary target: 1 if admitted, 0 otherwise.
# Vectorised comparison instead of a per-row lambda: rows whose
# 'decision_grouped' equals 1 become 1, everything else (including NaN)
# becomes 0, and the column is int64 even when the frame is empty.
# NOTE(review): relies on code 1 in 'decision_grouped' meaning "admitted" —
# confirm against the step that builds that column.
df['binary_decision'] = df['decision_grouped'].eq(1).astype('int64')


In [None]:
# Model inputs: every row, restricted to the selected feature columns.
X = df.loc[:, features]
# Model target: the 0/1 decision column.
y = df.loc[:, 'binary_decision']


In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Route columns to a preprocessing branch by dtype. Only float64/int64 columns
# count as numeric and only object columns as categorical; a column of any
# other dtype ends up in neither list.
numeric_features = list(X.select_dtypes(include=['float64', 'int64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)


In [None]:

# Numeric branch: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column list through its matching branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Gradient-boosted classifier with a fixed seed and log-loss as eval metric.
xgb_classifier = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)

# Full model: preprocessing followed by the classifier. The step names are
# what the 'classifier__...' keys of the hyperparameter grid refer to.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_classifier),
])


In [None]:
# Hyperparameter tuning: candidate values per XGBoost setting.
xgb_search_space = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
}

# Prefix every key with the pipeline step name so the grid search routes each
# setting to the 'classifier' step.
param_grid = {
    f'classifier__{setting}': candidates
    for setting, candidates in xgb_search_space.items()
}


In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by
# accuracy and run on all available cores.
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
# Fit on the training split only; the test split stays untouched until the
# final evaluation.
grid_search.fit(X_train, y_train)


In [None]:
# Best model evaluation: take the best estimator found by the grid search and
# predict labels for the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


In [None]:
# Score the held-out predictions: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)


In [None]:
# Summarise the search: the winning hyperparameter combination, the test-set
# accuracy to four decimals, and the text classification report.
print("Best Parameters:", grid_search.best_params_)
print(f'Accuracy: {accuracy:.4f}')
print(report)
