In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd

In [2]:
df = pd.read_excel("../data/processed/exploratory_analysis_final.xlsx")

In [3]:
# Replace missing QS rank with a low prestige value (e.g., rank 1200)
df['qs_rank'] = df['qs_rank'].fillna(1200)
df['qs_rank_score'] = -df['qs_rank']

In [4]:
# === Final Feature List ===
feature_cols = [
    'acceptance_rate',
    'undergrad_gpa',
    'gre_quantitative_reasoning',
    'gre_verbal_reasoning',
    'analytical_writing',
    'gre_total',
    'gpa_percentile',
    'gre_avg',
    'gpa_x_acceptancerate',
    'application_strength',
    'qs_rank',
    'qs_rank_score',
    'qs_tier',
    'tier_score',
    'program',
    'degree_type',
    'institution'
]


In [5]:
df['decision_grouped'].value_counts()

decision_grouped
0    37245
1    35986
2    27390
Name: count, dtype: int64

In [6]:
X = df[feature_cols]
y = df['decision_grouped']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

In [9]:
clf = RandomForestClassifier(random_state=42)

In [10]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [11]:
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_sm, y_train_sm)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




In [12]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [13]:
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.5194534161490684
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.57      0.58      7440
           1       0.52      0.54      0.53      7095
           2       0.44      0.42      0.43      5590

    accuracy                           0.52     20125
   macro avg       0.51      0.51      0.51     20125
weighted avg       0.52      0.52      0.52     20125

Confusion Matrix:
 [[4250 1695 1495]
 [1680 3840 1575]
 [1411 1815 2364]]
