In [26]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC 
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [28]:
# Absolute local path to the HR analytics CSV; adjust it to wherever your copy lives.
hr_csv_path = r"C:\Training\Academy\Statistics (Python)\Cases\human-resources-analytics\HR_comma_sep.csv"
hr = pd.read_csv(hr_csv_path)
# Preview the first three rows.
hr.head(3)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low


In [30]:
# Target is the 'left' column; every remaining column is used as a feature.
y = hr['left']
X = hr.drop(columns='left')

# 70/30 hold-out split, stratified on the target, with a fixed seed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [32]:
# One-hot encoder for the object-dtype columns: drops the first level of each
# feature and returns a dense pandas DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform='pandas')

# Two scalers are created here; scaler_std is not referenced by the pipelines
# visible in this part of the notebook.
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

# Route columns by dtype: non-object columns pass through unchanged,
# object columns go through the one-hot encoder.
non_object_cols = make_column_selector(dtype_exclude=object)
object_cols = make_column_selector(dtype_include=object)
ct = make_column_transformer(
    ('passthrough', non_object_cols),
    (ohe, object_cols),
    verbose_feature_names_out=False,
)
ct = ct.set_output(transform='pandas')

### Linear Kernel

In [35]:
# Baseline model: column transformer -> min-max scaling -> linear-kernel SVM.
# probability=True is set so the fitted pipeline exposes predict_proba.
svm = SVC(kernel='linear', probability=True, random_state=24)
steps = [('CT', ct), ('SCL', scaler_mm), ('SVM', svm)]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)

# Accuracy on the hold-out set.
y_pred = pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.7743943098466326


In [39]:
# Take the second column of predict_proba, i.e. the class listed second in
# pipe.classes_ (presumably left == 1 -- confirm), and score it with ROC AUC.
proba = pipe.predict_proba(X_test)
y_pred_prob = proba[:, 1]
print(roc_auc_score(y_test, y_pred_prob))

0.799910875626528


### Grid Search CV (Linear Kernel)

In [47]:
# Tune C for the linear-kernel SVM using 5-fold stratified CV scored by ROC AUC.
# NOTE(review): probability=True adds an internal calibration step to every fit;
# the 'roc_auc' scorer may not need it -- confirm before removing, because
# gcv.best_estimator_ would then lose predict_proba.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
# Three evenly spaced candidate values for C between 0.001 and 5.
params = {'SVM__C': np.linspace(0.001, 5, 3)}

svm = SVC(kernel='linear', probability=True, random_state=24)
pipe = Pipeline(steps=[('CT', ct), ('SCL', scaler_mm), ('SVM', svm)])
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=3,
)
# The search is run on the full X, y rather than on the earlier train split.
gcv.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ......................SVM__C=0.001;, score=0.803 total time=  11.7s
[CV 2/5] END ......................SVM__C=0.001;, score=0.793 total time=  11.5s
[CV 3/5] END ......................SVM__C=0.001;, score=0.802 total time=  12.0s
[CV 4/5] END ......................SVM__C=0.001;, score=0.796 total time=  11.8s
[CV 5/5] END ......................SVM__C=0.001;, score=0.813 total time=  11.5s
[CV 1/5] END .........SVM__C=2.5004999999999997;, score=0.807 total time=  13.4s
[CV 2/5] END .........SVM__C=2.5004999999999997;, score=0.801 total time=  13.3s
[CV 3/5] END .........SVM__C=2.5004999999999997;, score=0.808 total time=  14.1s
[CV 4/5] END .........SVM__C=2.5004999999999997;, score=0.802 total time=  13.3s
[CV 5/5] END .........SVM__C=2.5004999999999997;, score=0.818 total time=  13.3s
[CV 1/5] END ........................SVM__C=5.0;, score=0.808 total time=  14.8s
[CV 2/5] END ........................SVM__C=5.0;,

In [48]:
# Best hyper-parameters, then their mean cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'SVM__C': 5.0}
0.8074614358403881


### Polynomial Kernel

In [51]:
# Tune C and the polynomial degree for a poly-kernel SVM using 5-fold
# stratified CV scored by ROC AUC.
# NOTE(review): probability=True adds an internal calibration step to every fit;
# the 'roc_auc' scorer may not need it -- confirm before removing, because
# gcv.best_estimator_ would then lose predict_proba.
svm = SVC(kernel='poly', probability=True, random_state=24)
pipe = Pipeline([('CT', ct), ('SCL', scaler_mm), ('SVM', svm)])

# Grid: three evenly spaced C values in [0.001, 5] crossed with degree 2 and 3.
params = {'SVM__C': np.linspace(0.001, 5, 3), 'SVM__degree': [2, 3]}

# Built once (the cell previously created this identical splitter twice).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='roc_auc', verbose=3)
# The search is run on the full X, y rather than on the earlier train split.
gcv.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END .......SVM__C=0.001, SVM__degree=2;, score=0.838 total time=  13.5s
[CV 2/5] END .......SVM__C=0.001, SVM__degree=2;, score=0.826 total time=  12.4s
[CV 3/5] END .......SVM__C=0.001, SVM__degree=2;, score=0.828 total time=  12.3s
[CV 4/5] END .......SVM__C=0.001, SVM__degree=2;, score=0.814 total time=  12.5s
[CV 5/5] END .......SVM__C=0.001, SVM__degree=2;, score=0.838 total time=  12.3s
[CV 1/5] END .......SVM__C=0.001, SVM__degree=3;, score=0.833 total time=  12.9s
[CV 2/5] END .......SVM__C=0.001, SVM__degree=3;, score=0.820 total time=  12.6s
[CV 3/5] END .......SVM__C=0.001, SVM__degree=3;, score=0.815 total time=  12.6s
[CV 4/5] END .......SVM__C=0.001, SVM__degree=3;, score=0.806 total time=  12.9s
[CV 5/5] END .......SVM__C=0.001, SVM__degree=3;, score=0.824 total time=  12.8s
[CV 1/5] END SVM__C=2.5004999999999997, SVM__degree=2;, score=0.950 total time=   7.9s
[CV 2/5] END SVM__C=2.5004999999999997, SVM

In [52]:
# Report the winning parameter combination followed by its mean CV score.
for result in (gcv.best_params_, gcv.best_score_):
    print(result)

{'SVM__C': 5.0, 'SVM__degree': 3}
0.9675694324894775


### Radial Kernel

In [55]:
# Tune C and gamma for an RBF-kernel SVM using 5-fold stratified CV scored by
# ROC AUC.
# NOTE(review): probability=True adds an internal calibration step to every fit;
# the 'roc_auc' scorer may not need it -- confirm before removing, because
# gcv.best_estimator_ would then lose predict_proba.
svm = SVC(kernel='rbf', probability=True, random_state=24)
pipe = Pipeline([('CT', ct), ('SCL', scaler_mm), ('SVM', svm)])

# Grid: three evenly spaced values in [0.001, 5] for each of C and gamma.
params = {'SVM__C': np.linspace(0.001, 5, 3), 'SVM__gamma': np.linspace(0.001, 5, 3)}

# Built once (the cell previously created this identical splitter twice).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='roc_auc', verbose=3)
# The search is run on the full X, y rather than on the earlier train split.
gcv.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.767 total time=  16.7s
[CV 2/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.771 total time=  16.5s
[CV 3/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.762 total time=  16.8s
[CV 4/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.761 total time=  16.4s
[CV 5/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.792 total time=  16.5s
[CV 1/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.931 total time=  18.1s
[CV 2/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.931 total time=  18.2s
[CV 3/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.941 total time=  18.0s
[CV 4/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.928 total time=  18.2s
[CV 5/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.935 total time=  18.0s
[CV 1/5] END ......SVM__C=0.001, SVM__gamma=5.0;, score=0.936 total time=  19.2s
[CV 

In [59]:
# Show the selected hyper-parameters and the corresponding mean CV score.
best_params = gcv.best_params_
best_score = gcv.best_score_
print(best_params)
print(best_score)

{'SVM__C': 5.0, 'SVM__gamma': 5.0}
0.9814093756912985
