# $Baseline$ $Model$ $Training$

In [19]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

In [2]:
try:
    df = pd.read_csv('../data/Processed_CollegePlacement.csv')
except FileNotFoundError:
    print('File Not Found! Please check file path and try again!')

In [3]:
x = df.drop(columns=['Placement','College_ID']).copy()
y = df['Placement'].copy()

In [4]:
y.value_counts()

Placement
0    8341
1    1659
Name: count, dtype: int64

$Insight$ ~ `Class samples are imbalanced`

In [5]:
# train - test split
x_train,x_test,y_train,y_test = train_test_split(
    x, y, test_size=0.3, stratify=y
)

x_train.to_csv('../data/x_train.csv',index=False)
x_test.to_csv('../data/x_test.csv',index=False)
y_train.to_csv('../data/y_train.csv',index=False)
y_test.to_csv('../data/y_test.csv',index=False)

In [25]:
# preprocessing and feature scaling
preprocessor = ColumnTransformer(transformers=[
    ('scaler',StandardScaler(),x_train.columns),
    ('poly',PolynomialFeatures(include_bias=False),x_train.columns)
],verbose_feature_names_out=False)

In [None]:
# baseline models
name_models = {
    'Logistic_Regression' : LogisticRegression(penalty='l1',class_weight='balanced',random_state=42,max_iter=1000,solver='saga'),
    'Decision_Tree' : DecisionTreeClassifier(random_state=537,class_weight='balanced'),
    'Random_Forest' : RandomForestClassifier(random_state=537,class_weight='balanced'),
    'XGBoost' : XGBClassifier(objective='binary:logistic',random_state=42)
}

cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=537)

results = {}
for name, models in name_models.items():
    print(f'Training {name}.This May Take A While...')
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('classifier',models)
    ])

    cv_score = cross_val_score(pipe, x_train, y_train, cv = cv, scoring='accuracy', n_jobs=-1,verbose=1)

    results[name] = {
        'CV accuracy across folds' : cv_score,
        'CV mean score' : cv_score.mean(),
        'CV std score' : cv_score.std()
    }
    print('-'*50)


for name, result in results.items():
    print(f'Model : {name}')
    print('cv across folds : ',result['CV accuracy across folds'])
    print('cv mean : ',result['CV mean score'])
    print('cv std : ',result['CV std score'])
    print('-'*50)

Training Logistic_Regression.This May Take A While...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


--------------------------------------------------
Training Decision_Tree.This May Take A While...
--------------------------------------------------
Training Random_Forest.This May Take A While...


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.0s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


--------------------------------------------------
Training XGBoost.This May Take A While...
--------------------------------------------------
Model : Logistic_Regression
cv across folds :  [0.85       0.86       0.84928571 0.84714286 0.85857143]
cv mean :  0.853
cv std :  0.005237229365663794
--------------------------------------------------
Model : Decision_Tree
cv across folds :  [1.         1.         1.         0.99928571 1.        ]
cv mean :  0.9998571428571429
cv std :  0.0002857142857142669
--------------------------------------------------
Model : Random_Forest
cv across folds :  [0.99928571 1.         0.99928571 0.99785714 0.99928571]
cv mean :  0.9991428571428571
cv std :  0.0006998542122237599
--------------------------------------------------
Model : XGBoost
cv across folds :  [1. 1. 1. 1. 1.]
cv mean :  1.0
cv std :  0.0
--------------------------------------------------


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [17]:
# best model
best_model = max(results,key=lambda name : results[name]['CV mean score'])
print(best_model)

XGBoost


In [26]:
# Tuning the best model (XGBoost)
xgb_pipe = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',XGBClassifier(objective='binary:logistic', random_state = 537))
])

negative_class = (y_train == 0).sum()
positive_class = (y_train == 1).sum()
scale_pos_weight  = negative_class / positive_class


xgb_params_grid = {
    'classifier__n_estimators' : [80,100,120],
    'classifier__max_depth' : [2,4,6],
    'preprocessor__poly__degree' : [1,2],
    'classifier__learning_rate' : [0.01,0.05,0.1],
    'classifier__scale_pos_weight' : [scale_pos_weight * 0.8,
                                    scale_pos_weight,
                                    scale_pos_weight * 1.2]
}

model = GridSearchCV(estimator=xgb_pipe, param_grid=xgb_params_grid, scoring= 'accuracy', refit= True, n_jobs = -1,cv=cv)
model.fit(x_train,y_train)

print('Best Score : ',model.best_score_)
print('Best Params : ',model.best_params_)

Best Score :  0.9998571428571429
Best Params :  {'classifier__learning_rate': 0.05, 'classifier__max_depth': 4, 'classifier__n_estimators': 100, 'classifier__scale_pos_weight': np.float64(4.023428079242033), 'preprocessor__poly__degree': 2}


In [33]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,precision_score,recall_score,confusion_matrix,accuracy_score

In [30]:
best_model_ = model.best_estimator_

In [None]:
# prediction
y_probs = best_model_.predict_proba(x_test)
threshold = 0.5
y_pred = (y_probs >= threshold).astype(int)

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], shape=(3000, 2))

In [None]:
y_pred = best_model.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)


<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x208b8e26b10>