In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imb_make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns


In [22]:
df = pd.read_excel("../data/processed/exploratory_analysis_final.xlsx")
df['qs_rank'] = df['qs_rank'].fillna(1200)
df['qs_rank_score'] = -df['qs_rank']


In [23]:
# Define a better feature set based on feature selection
# This is just a preliminary set - adjust based on results from feature selection
optimal_features = [
    'undergrad_gpa', 
    'gre_quantitative_reasoning',
    'gre_verbal_reasoning',
    'analytical_writing',
    'acceptance_rate', 
    'application_strength',  # This seems to be an important engineered feature
    'qs_rank_score',         # Using the transformed version
    'program',               # Categorical feature
    'degree_type',           # Categorical feature
    'institution'            # Categorical feature
]

In [24]:
# Split data into features and target
X = df[optimal_features]
y = df['decision_grouped']


In [25]:
# Identify numerical and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [26]:
# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


In [27]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [28]:
# Function to train and evaluate a model
def train_evaluate_model(name, model, X_train, y_train, X_test, y_test, use_smote=True):
    if use_smote:
        pipeline = imb_make_pipeline(
            preprocessor,
            SMOTE(random_state=42),
            model
        )
    else:
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Print results
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{class_report}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Reject', 'Waitlist', 'Accept'],
                yticklabels=['Reject', 'Waitlist', 'Accept'])
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()
    
    return pipeline, accuracy

In [29]:
# Define models to evaluate
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=200, random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    "SVM": SVC(probability=True, random_state=42)
}

In [30]:
# Train and evaluate each model
results = {}
trained_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    trained_model, accuracy = train_evaluate_model(name, model, X_train, y_train, X_test, y_test)
    results[name] = accuracy
    trained_models[name] = trained_model


Training Random Forest...


KeyboardInterrupt: 

In [None]:
# Find best model
best_model_name = max(results, key=results.get)
print(f"\nBest model: {best_model_name} with accuracy {results[best_model_name]:.4f}")


Best model: XGBoost with accuracy 0.5443


In [None]:
# Hyperparameter tuning for best model
print("\nPerforming hyperparameter tuning on best model...")


Performing hyperparameter tuning on best model...


In [None]:
if best_model_name == "Random Forest":
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }
elif best_model_name == "Gradient Boosting":
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
elif best_model_name == "XGBoost":
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__subsample': [0.8, 1.0]
    }
else:  # SVM
    param_grid = {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': ['scale', 'auto', 0.1, 0.01],
        'classifier__kernel': ['rbf', 'poly', 'sigmoid']
    }


In [31]:
# Create pipeline for grid search
best_model = models[best_model_name]
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', best_model)
])

In [33]:
# Features and target
X = df[optimal_features]
y = df['decision_grouped']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


ValueError: 
All the 270 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 386, in _fit
    self._validate_steps()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 256, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=42)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't


In [None]:
# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

In [None]:
# Evaluate best model on test set
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
print(f"\nBest Model After Tuning:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{class_report}")
print(f"Confusion Matrix:\n{conf_matrix}")

In [None]:
# Plot final confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Reject', 'Waitlist', 'Accept'],
            yticklabels=['Reject', 'Waitlist', 'Accept'])
plt.title('Confusion Matrix - Best Tuned Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

In [None]:
# Evaluate feature importance for the final model
if hasattr(best_pipeline[-1], 'feature_importances_'):
    # Get feature names after preprocessing
    # This is a bit tricky due to OneHotEncoder
    feature_names = numeric_features.copy()
    # Add categorical feature names (with one-hot encoding)
    for cat_feat in categorical_features:
        categories = X[cat_feat].unique()
        for category in categories:
            feature_names.append(f"{cat_feat}_{category}")
    
    # Get feature importances
    importances = best_pipeline[-1].feature_importances_
    # Adjust if importances length doesn't match feature_names length
    if len(importances) != len(feature_names):
        print("Note: Feature names may not match exactly due to preprocessing transformations.")
        feature_names = [f"Feature {i}" for i in range(len(importances))]
    
    # Sort features by importance
    indices = np.argsort(importances)[::-1]
    
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    plt.title("Feature Importances")
    plt.bar(range(len(importances)), importances[indices], align="center")
    plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.show()
    
    # Print top 10 features
    print("\nTop 10 Important Features:")
    for i in range(min(10, len(importances))):
        print(f"{i+1}. {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")