In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# scikit-learn: data splitting, SVM classifier, grid search and metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# Load the PCA-reduced features and the labels from the .npz archive.
# allow_pickle=True is passed at load time because the label array is stored
# with dtype=object and cannot be read otherwise. Only enable this for
# archives you trust: unpickling can execute arbitrary code.
data = np.load('data/data_pca_50_target.npz', allow_pickle=True)
data.files

['arr_0', 'arr_1']

In [3]:
# Allow pickled (object-dtype) arrays to be read from the already-opened archive.
# NOTE(review): this flips the flag on the loaded object after the fact;
# np.load(..., allow_pickle=True) is the documented way to request this.
data.allow_pickle = True

In [4]:
# Unpack the archive in one step:
#   X -> PCA data with 50 components, y -> target (dependent variable)
X, y = data['arr_0'], data['arr_1']

In [5]:
# Sanity check: the feature matrix and the label vector should have the same number of rows.
X.shape,y.shape

((6540, 50), (6540,))

In [6]:
# Peek at the feature matrix (NumPy elides the middle of large arrays in the repr).
X

array([[ 1.57580079, -0.50402487,  0.11164958, ..., -0.75585663,
         0.99301705,  0.1133236 ],
       [-0.0702743 , -1.93651129,  0.65026782, ..., -1.28672628,
         0.12173058, -0.2628243 ],
       [-0.25288082, -0.94052767, -0.84793568, ..., -0.44701399,
         0.09866262, -0.16176726],
       ...,
       [ 0.15987077,  0.27575875, -1.97267044, ...,  1.05880457,
         0.5206037 ,  0.35181657],
       [-1.62380426,  0.92051555, -2.85796138, ...,  0.72989619,
         2.02006675,  0.70342068],
       [ 0.93899253,  0.85546166, -0.96344381, ...,  1.38846891,
        -0.86489166, -0.25963655]])

In [7]:
# Peek at the label vector.
y

array(['female', 'female', 'female', ..., 'male', 'male', 'male'],
      dtype=object)

### Split the data into train and test sets

In [8]:
# Hold out 20% of the samples for testing. stratify=y keeps the class
# proportions the same in both splits; the fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(5232, 50) (1308, 50) (5232,) (1308,)


### Training the machine learning model

In [9]:
# Base estimator. probability=True enables predict_proba(); scikit-learn fits
# those probabilities with an internal, randomised cross-validation, so
# random_state is fixed to keep the probability estimates reproducible.
model_svc = SVC(probability=True, random_state=42)

# Hyper-parameter search space: 6 (C) * 2 (kernel) * 6 (gamma) * 2 (coef0)
# = 144 candidate combinations.
# NOTE(review): coef0 only influences the 'poly' kernel in this grid, so each
# 'rbf' candidate is fitted twice with identical settings. Splitting the grid
# into one dict per kernel would remove that redundant work, but it would also
# change the shape of cv_results_ used by the plots further down.
param_grid = {'C':[0.5,1,10,20,30,50],
             'kernel':['rbf','poly'],
             'gamma':[0.1,0.05,0.01,0.001,0.002,0.005],
             'coef0':[0,1]}

In [10]:
# Exhaustive search over param_grid, scored by accuracy with 3-fold
# cross-validation. The candidate fits are independent of each other, so
# n_jobs=-1 spreads them over all available CPU cores instead of running
# them one after another. verbose=2 logs one line per completed fit.
model_grid = GridSearchCV(model_svc,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=3,
                          verbose=2,
                          n_jobs=-1)

In [None]:
# Render estimators as plain text rather than as an HTML diagram, so the
# repr of the fitted search object stays compact in the cell output.
from sklearn import set_config
set_config(display='text')

# Run the grid search on the training split only; the test split stays untouched.
model_grid.fit(x_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   1.5s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   1.5s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   1.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.2s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.2s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.1s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   1.3s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   1.3s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   1.3s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.0s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   0.9s
[CV] END ............C=0.5, coef0=0, gamma=0.0

In [None]:
# Hyper-parameter combination selected by the grid search.
model_grid.best_params_

In [None]:
# Keep the best estimator found by the search as the final model.
model_final = model_grid.best_estimator_

In [None]:
# Full parameter set of the final model (searched values plus untouched defaults).
model_final.get_params()

### Model Evaluation
- Classification Report
    - Precision, Recall, F1-Score
- Kappa Score
    - Negative (worst model)
    - 0 to 0.5 (bad model)
    - 0.5 to 0.7 (Good Model)
    - 0.7 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)
- AUC
    - Less than 0.5 (Worst Model)
    - 0.5 to 0.6 (Bad Model)
    - 0.6 to 0.8 (Good Model)
    - 0.8 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)

In [None]:
# Predict class labels for the held-out test set.
y_pred = model_final.predict(x_test) # predicted values

In [None]:
# Inspect the predicted labels.
y_pred

**Classification Report**

In [None]:
# Build the precision / recall / F1 report as a dict, then show it as a table
# with one row per report entry.
cr = metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    output_dict=True,
)
pd.DataFrame(cr).transpose()

**Kappa Score**

In [None]:
# Cohen's kappa between the true and predicted test labels.
metrics.cohen_kappa_score(y_test,y_pred)

**Area Under Curve (AUC)**

In [None]:
# ROC AUC has to be computed from a continuous score, not from hard class
# labels: with 0/1 predictions the ROC curve collapses to a single operating
# point and the result no longer measures ranking quality.
# Use the predicted probability of the 'male' class (treated as positive),
# looking up its column through classes_ instead of assuming an order.
male_idx = list(model_final.classes_).index('male')
male_proba = model_final.predict_proba(x_test)[:, male_idx]

metrics.roc_auc_score(np.where(y_test == "male", 1, 0), male_proba)

#### Save Face Recognition Model

In [None]:
import pickle

In [None]:
# Persist the tuned classifier to disk.
# - Create the target directory first, so a missing 'model/' folder does not
#   throw away the result of the long grid search.
# - Write through a context manager so the file handle is flushed and closed
#   even if pickling raises.
model_path = Path('model/model_svm.pickle')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open(mode='wb') as model_file:
    pickle.dump(model_final, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Tabulate the grid-search cross-validation results as a DataFrame.
cv_results = pd.DataFrame(model_grid.cv_results_)

In [None]:
# Columns holding the searched hyper-parameter values.
param = ['param_C','param_coef0','param_gamma','param_kernel']

# Keep only the parameter columns and the mean CV score, and cast the
# parameters to strings so they can be used as categorical hues when plotting.
# Select and cast in one chained expression: this always yields a fresh frame
# and avoids assigning into a column subset, which triggers pandas'
# SettingWithCopyWarning.
cv_results = (
    cv_results[param + ['mean_test_score']]
    .astype({col: str for col in param})
)

In [None]:
# 2x2 grid with one panel per hyper-parameter: the distribution of the mean
# cross-validation score, coloured by that parameter's values.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, par in zip(axes.ravel(), param):
    sns.kdeplot(data=cv_results, x='mean_test_score', hue=par, ax=ax)

In [None]:
# Show the selected hyper-parameters again, for comparison with the plots above.
model_grid.best_params_