In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd

In [23]:
# Load the processed exploratory-analysis workbook into the working frame.
processed_xlsx = "../data/processed/exploratory_analysis_final.xlsx"
df = pd.read_excel(processed_xlsx)

In [24]:
# Rows with no QS rank receive a placeholder rank of 1200 (treated as low prestige).
# The score column is the negated rank, so a larger score means a smaller rank number.
qs_rank_filled = df['qs_rank'].fillna(1200)
df['qs_rank'] = qs_rank_filled
df['qs_rank_score'] = -qs_rank_filled

In [25]:
# === Final Feature List ===
# Columns are grouped by theme for readability only; flattening the groups
# yields one flat list whose order is the order the groups are written in.
feature_cols = [
    column
    for group in (
        # acceptance rate and undergraduate GPA
        ('acceptance_rate', 'undergrad_gpa'),
        # GRE section scores and total
        (
            'gre_quantitative_reasoning',
            'gre_verbal_reasoning',
            'analytical_writing',
            'gre_total',
        ),
        # derived GPA / GRE / application columns
        (
            'gpa_percentile',
            'gre_avg',
            'gpa_x_acceptancerate',
            'application_strength',
        ),
        # QS rank and tier columns
        ('qs_rank', 'qs_rank_score', 'qs_tier', 'tier_score'),
        # program, degree type and institution columns
        ('program', 'degree_type', 'institution'),
    )
    for column in group
]


In [26]:
# Train and evaluate an XGBoost classifier on the selected feature columns.
# Flow: hold out 20% of rows for testing, oversample only the training split
# with SMOTE, fit the classifier, then report metrics on the untouched test split.
X = df[feature_cols]
y = df['decision_grouped']

# Split before resampling so synthetic rows never end up in the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): SMOTE synthesises rows by interpolating between neighbours.
# If columns such as 'program', 'degree_type', 'institution' or 'qs_tier' are
# integer-encoded categories, the interpolated values are not real categories
# and SMOTENC may be the better fit — confirm how these columns are encoded.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# `use_label_encoder` is intentionally not passed: the installed xgboost ignores
# it and emits a "Parameters: { "use_label_encoder" } are not used." warning.
xgb_clf = XGBClassifier(random_state=42, eval_metric='mlogloss')
xgb_clf.fit(X_train_sm, y_train_sm)

# Evaluate on the original (non-resampled) hold-out rows.
y_pred = xgb_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.5337639751552795
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.59      0.59      7440
           1       0.53      0.56      0.54      7095
           2       0.46      0.42      0.44      5590

    accuracy                           0.53     20125
   macro avg       0.53      0.52      0.53     20125
weighted avg       0.53      0.53      0.53     20125

Confusion Matrix:
 [[4387 1740 1313]
 [1698 4000 1397]
 [1368 1867 2355]]
