In [17]:
# Import các thư viện cần thiết
import numpy as np
import cv2
import pandas as pd
import os
import pickle
from google.colab import drive
from google.colab.patches import cv2_imshow
from sklearn.model_selection import train_test_split

In [18]:
# Mount Google Drive into the Colab filesystem
drive.mount('/content/drive')
# Base folder on Drive that this notebook reads its data from and saves the model to
path = '/content/drive/MyDrive/test_opencv/'
# List every file and folder in that directory (displayed as the cell output)
os.listdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['32_training_data.ipynb',
 'image_2260.jpg',
 'image_2400.jpg',
 'People.jpg',
 'models',
 '27.ipynb',
 '30.ipynb',
 'FaceImage',
 'data_face_features.pickle',
 '31_data_preprocessing.ipynb',
 '29.ipynb']

In [19]:
# 1. DATA
# Load the pickled dataset; later cells read its 'data' and 'label' entries.
# NOTE: pickle.load can execute arbitrary code, so only load files you created or trust.
# The `with` block guarantees the file handle is closed once loading finishes.
with open(path + "data_face_features.pickle", mode="rb") as pickle_file:
    data = pickle.load(pickle_file)

In [20]:
# Convert the stored samples and labels into NumPy arrays (samples first, then labels)
x, y = (np.array(data[key]) for key in ('data', 'label'))

# Show both arrays, one after the other
print(x, y, sep="\n")

[[[ 0.08169983  0.22247793  0.08332013 ... -0.01693342  0.17324772
   -0.00759792]]

 [[ 0.08038288  0.11583587  0.01226715 ...  0.01396423  0.0439516
   -0.01147527]]

 [[ 0.05002554 -0.02660235 -0.03025214 ...  0.16576183  0.03893316
    0.05657152]]

 ...

 [[ 0.10969666  0.07481515 -0.02033143 ... -0.04903653  0.09105934
    0.09910018]]

 [[ 0.07593021  0.0680669  -0.0709245  ...  0.01875338  0.05717097
    0.03867698]]

 [[ 0.1254019   0.1268812  -0.13584706 ... -0.07767591  0.0461805
   -0.04827639]]]
['1721031620_PhanQuocHuy' '1721031620_PhanQuocHuy'
 '1721031620_PhanQuocHuy' ... '162000309_LeHoangQuan'
 '162000309_LeHoangQuan' '162000309_LeHoangQuan']


In [21]:
# Show the dimensions of the sample array and of the label array
print(x.shape, y.shape, sep="\n")

(4706, 1, 128)
(4706,)


In [22]:
# Flatten the samples into a 2-D array with 128 columns (one row per sample)
x = np.reshape(x, (-1, 128))
print(x.shape)

(4706, 128)


In [23]:
# Split the data: 80% for training, 20% for testing (fixed seed so the split is repeatable)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1)
# Report the shape of every split, one per line
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep="\n")

(3764, 128)
(942, 128)
(3764,)
(942,)


In [24]:
# 2. Train machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [25]:
def get_report(model, x_train, y_train, x_test, y_test):
  """Print accuracy and macro-averaged F1 of a fitted model on both splits.

  Args:
    model: fitted estimator exposing ``predict``.
    x_train, y_train: training samples and their true labels.
    x_test, y_test: test samples and their true labels.

  Returns:
    None. The four metrics are printed with two decimals, in the order
    accuracy (train, test) followed by F1 (train, test).
  """
  train_pred, test_pred = model.predict(x_train), model.predict(x_test)

  # Every metric is evaluated before anything is printed.
  summary = (
      ('Accurency Train = %0.2f', accuracy_score(y_train, train_pred)),
      ('Accurency Test = %0.2f', accuracy_score(y_test, test_pred)),
      ('F1 Score Train = %0.2f', f1_score(y_train, train_pred, average="macro")),
      ('F1 Score Test = %0.2f', f1_score(y_test, test_pred, average="macro")),
  )

  for template, value in summary:
    print(template % value)

In [26]:
# Logistic regression: train on the training split (fit returns the estimator itself),
# then print the metrics for both splits
model_logistic = LogisticRegression().fit(x_train, y_train)
get_report(model_logistic, x_train, y_train, x_test, y_test)

Accurency Train = 0.72
Accurency Test = 0.70
F1 Score Train = 0.51
F1 Score Test = 0.49


In [27]:
# Support Vector Machine with probability estimates enabled:
# train on the training split, then print the metrics for both splits
model_svc = SVC(probability=True).fit(x_train, y_train)
get_report(model_svc, x_train, y_train, x_test, y_test)

Accurency Train = 0.91
Accurency Test = 0.83
F1 Score Train = 0.83
F1 Score Test = 0.72


In [28]:
# Random forest with 20 trees.
# random_state is fixed (same seed as the train/test split) so the forest, and
# therefore the reported metrics, are identical on every Restart & Run All.
model_rf = RandomForestClassifier(n_estimators=20, random_state=1)
model_rf.fit(x_train, y_train)
get_report(model_rf, x_train, y_train, x_test, y_test)

Accurency Train = 1.00
Accurency Test = 0.75
F1 Score Train = 1.00
F1 Score Test = 0.64


In [29]:
# Voting Classifier: soft-voting ensemble of the three model types,
# weighted 2 : 3 : 1 (logistic : svm : rf).
# The estimator names ('logistic', 'svm', 'rf') are the prefixes used for
# parameter tuning (e.g. 'svm__C'), so they must not be renamed.
# SVC's probability estimates and the random forest are both stochastic, so
# they are seeded to keep the ensemble reproducible between runs.
model_voting = VotingClassifier(estimators=[
    ('logistic', LogisticRegression()),
    ('svm', SVC(probability=True, random_state=1)),
    ('rf', RandomForestClassifier(n_estimators=20, random_state=1)),
], voting='soft', weights=[2,3,1])
model_voting.fit(x_train, y_train)
get_report(model_voting, x_train, y_train, x_test, y_test)

Accurency Train = 0.98
Accurency Test = 0.82
F1 Score Train = 0.97
F1 Score Test = 0.71


In [30]:
# 3. Parameter Tuning
from sklearn.model_selection import GridSearchCV

# Search space over the ensemble's sub-estimators and its voting mode.
# 4 * 3 * 3 * 3 * 2 = 216 candidates, each evaluated on 3 folds = 648 fits.
param_grid = {
    'svm__C': [3, 5, 7, 10],
    'svm__gamma': [0.1, 0.3, 0.5],
    'rf__n_estimators': [5, 10, 20],
    'rf__max_depth': [3, 5, 7],
    'voting': ['soft', 'hard'],
}

# The fits are independent of each other, so run them on every available core
# (n_jobs=-1) instead of one at a time; the selected parameters are unaffected.
model_grid = GridSearchCV(
    model_voting,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
model_grid.fit(x_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=soft; total time=   6.0s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=soft; total time=   5.3s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=soft; total time=   5.2s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=hard; total time=   6.4s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=hard; total time=   4.4s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=hard; total time=   5.0s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.3, voting=soft; total time=   5.9s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.3, voting=soft; total time=   4.7s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.3, voting=soft; total time=   5.6s
[

In [31]:
# Keep the best estimator found by the grid search; this is the model that gets saved
model_best_estimator = model_grid.best_estimator_
# Best score reported by the grid search (displayed as the cell output)
model_grid.best_score_

0.8445808472648482

In [32]:
# 4. Save Model
# Build the target path with os.path.join: `path` already ends with '/', so
# concatenating "/ml_face..." would produce a double slash.
model_path = os.path.join(path, "ml_face_person_identity.pkl")
# The `with` block closes the file, guaranteeing the pickle is fully flushed to Drive.
with open(model_path, mode='wb') as model_file:
    pickle.dump(model_best_estimator, model_file)