In [17]:
#importing data 
import pandas as pd
import numpy as np

#preprocessing 
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, StandardScaler 

#modeling 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix 
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, plot_roc_curve, roc_auc_score, auc
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve
from pandas.plotting import scatter_matrix

#pipeline 
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV

In [18]:
x_train_data = pd.read_csv('/content/X_train.csv')
y_train_data = pd.read_csv('/content/y_train.csv')

join = (x_train_data, y_train_data['target'])
df = pd.concat(join, axis = True)

df.rename(columns= {"city": "City", "city_development_index":"City_development_index", "enrollee_id": "Enrollee_id", "gender": "Gender", "relevent_experience" : "Relevent_experience", "enrolled_university":"University_enrollment", "major_discipline": "Discipline", "education_level": "Education_level", "experience": "Experience", "company_size": "Company_size", "company_type":"Company_type", "last_new_job":"Last_job", "training_hours":"Training_hours"}, inplace=True)
df = df.set_index('Enrollee_id')

In [3]:
df = df.drop(['Company_size', 'Company_type', 'City','City_development_index' ], axis = 1)
df.drop_duplicates(keep = False, inplace = True)
df["Gender"] = df["Gender"].fillna("Unknown")
df["Education_level"]= df["Education_level"].fillna("Unknown")
df["Discipline"].fillna(value="Unknown", inplace=True)
df["University_enrollment"] = df["University_enrollment"].fillna('Unknown')
df["Last_job"] = df["Last_job"].fillna('Unknown') 
df["Experience"] = df["Experience"].fillna(0)

cat_cols = df.select_dtypes(['object']).columns

for i in cat_cols:
    temp = pd.get_dummies(df[i], drop_first=True)
    df = pd.concat([df, temp], axis=1)
    df.drop(i, inplace=True, axis=1)

In [4]:
X_= df.drop('target', axis = 1)
X = np.array(X_)
y = df['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8864, 47)
(8864,)
(2217, 47)
(2217,)


In [31]:
pipe_lr = Pipeline([
                          ('smote', SMOTE()),
                          ('standardscaler', StandardScaler()),
                          ('pca', PCA()),
                          ('logisticregression', LogisticRegression(max_iter=3000))
])

param_grid_lr = {
    "smote__k_neighbors": [10, 12, 14],
    "pca__svd_solver": ['full'],
    "logisticregression__C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 
    "logisticregression__random_state": [42, 54, 100]
}

In [9]:
pipe_knn = Pipeline([
                          ('smote', SMOTE()),
                          ('standardscaler', StandardScaler()),
                          ('pca', PCA()),
                          ('knn', KNeighborsClassifier())
])

param_grid_knn = {'knn__n_neighbors': [5, 25], 
                  'knn__weights': ['uniform', 'distance'],
                  'knn__metric':["euclidean","manhattan"], 
                  "smote__k_neighbors": [10, 12, 14],
                  "pca__svd_solver": ['full'],
}

In [19]:
pipe = pipe_knn
param_grid = param_grid_knn

inner_cv_folds = 5
outer_cv_folds = 4

inner_cv = StratifiedKFold(n_splits = inner_cv_folds)
outer_cv = StratifiedKFold(n_splits = outer_cv_folds)

gscv = HalvingGridSearchCV(
    pipe,
    param_grid = param_grid,
    cv = inner_cv,
    scoring = 'accuracy'
)

In [20]:
cvs = cross_val_score(
    gscv,
    X = X,
    y = y,
    scoring = 'accuracy',
    cv = outer_cv
)

print(cvs)
print(cvs.mean())
print(cvs.std())

[0.70371707 0.70794224 0.70180505 0.67400722]
0.6968678955713312
0.013384116185096395


In [21]:
gscv.fit(X, y)
print(gscv.best_estimator_)
print(gscv.best_params_)

Pipeline(steps=[('smote', SMOTE(k_neighbors=12)),
                ('standardscaler', StandardScaler()),
                ('pca', PCA(svd_solver='full')),
                ('knn',
                 KNeighborsClassifier(metric='manhattan', n_neighbors=25))])
{'knn__metric': 'manhattan', 'knn__n_neighbors': 25, 'knn__weights': 'uniform', 'pca__svd_solver': 'full', 'smote__k_neighbors': 12}


In [22]:
pred = gscv.predict_proba(X_test)
print(pred)

[[0.76 0.24]
 [0.4  0.6 ]
 [0.48 0.52]
 ...
 [0.64 0.36]
 [0.72 0.28]
 [0.56 0.44]]
