<a href="https://colab.research.google.com/github/Geerford/data-science-ipynb/blob/master/Classification/DSP_Labs_Antispoofing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Init

Подключим необходимые библиотеки

In [None]:
import glob
import os
import pickle
from multiprocessing import Pool

import cv2
import numpy as np
from skimage.feature import local_binary_pattern as LBP
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

#Extract feature

Для работы с изображениями найдём гистограммы их LBP-признаков (локальных бинарных шаблонов) по каналам HSV и YCrCb — характеристики распределения локальной текстуры изображения

In [None]:
# Google Drive file ids and local archive names of the dataset tarballs.
TRAIN_FILE_ID = '1xmALNsYW-zm-Id9T6Ku2H4_cBWoqn23s'
TRAIN_FILE_NAME = 'IDRND_FASDB_train.tar.gz'
TEST_FILE_ID = '1teaLCo-F-5RCQM8Puxqbk13bI0evqIzg'
TEST_FILE_NAME = 'test.tar.gz'

# Directories that are globbed for the extracted PNG images.
TRAIN_PATH = 'IDRND_FASDB_train'
TEST_PATH = 'test'

# Size (in pixels) every cropped image is resized to before feature extraction.
NEW_WIDTH = 480
NEW_HEIGHT = 480

# Stage switches: each guards one optional, expensive part of the notebook.
DOWNLOAD = False
EXTRACT = False
TUNING = False

# Seed shared by the train/test split, the CV splitter and seeded estimators.
STATE = 42

In [None]:
def extract_feature_vector(image, p=18, r=2):
    """Build an LBP-histogram descriptor for a BGR image.

    The image is converted to HSV and to YCrCb, every channel of both
    conversions is passed through a "uniform" local binary pattern operator,
    and the normalised per-channel histograms are concatenated.

    Args:
        image: BGR image array (the layout ``cv2.cvtColor`` is asked to
            convert from).
        p: Number of LBP neighbour points; the histogram uses ``p + 1`` bins.
        r: LBP radius.

    Returns:
        1-D ``numpy`` array holding the concatenated channel histograms
        (``p + 1`` values per channel).
    """
    hsv_channels = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV))
    ycrcb_channels = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2YCrCb))
    channels = list(hsv_channels) + list(ycrcb_channels)

    lbp_features = [LBP(ch, p, r, method="uniform") for ch in channels]

    # ``density=True`` is the supported spelling of the ``normed=True`` keyword,
    # which NumPy deprecated and later removed. With the equal-width bins used
    # here both produce the same values.
    hist_features = [np.histogram(lf, bins=p + 1, density=True)[0] for lf in lbp_features]
    return np.hstack([hf.ravel() for hf in hist_features])


def process_single(file):
    """Load one image, crop its central region and return its feature vector.

    A quarter of the width/height is trimmed from each side, the remaining
    centre is resized to ``(NEW_WIDTH, NEW_HEIGHT)`` and handed to
    ``extract_feature_vector``.

    Args:
        file: Path of the image to read.

    Returns:
        The feature vector produced by ``extract_feature_vector``.

    Raises:
        ValueError: If ``cv2.imread`` cannot read ``file``.
    """
    image = cv2.imread(file)
    if image is None:
        # cv2.imread reports a missing/corrupt file by returning None. Fail with
        # the offending path instead of an AttributeError on ``.shape``; raising
        # (rather than skipping) keeps file lists and feature lists aligned.
        raise ValueError('Cannot read image: {}'.format(file))

    h, w = image.shape[:2]
    dw, dh = w // 4, h // 4
    center = image[dh:h - dh, dw:w - dw, :]
    return extract_feature_vector(cv2.resize(center, (NEW_WIDTH, NEW_HEIGHT)))


def extract_features(filelist):
    """Extract feature vectors for many image files in parallel.

    Args:
        filelist: Sequence of image paths; each one is passed to
            ``process_single`` in a worker process.

    Returns:
        List of feature vectors in the order of ``filelist``, with any
        ``None`` results removed.
    """
    # The context manager shuts the worker processes down once all results have
    # been collected; previously every call left 32 workers alive.
    with Pool(32) as pool:
        feature_list = list(tqdm_notebook(pool.imap(process_single, filelist), total=len(filelist)))
    return [x for x in feature_list if x is not None]

Скачиваем и распаковываем тренировочный и тестовый датасеты

In [None]:
if DOWNLOAD:
  # Training archive. The inner wget requests the download page (saving its
  # cookies) and sed pulls the `confirm=` token out of the response; the outer
  # wget then repeats the request with that token and the saved cookies and
  # writes the result to $TRAIN_FILE_NAME. The cookie file is removed afterwards.
  !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=$TRAIN_FILE_ID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$TRAIN_FILE_ID" -O $TRAIN_FILE_NAME && rm -rf /tmp/cookies.txt
  # NOTE(review): the archive name is hard-coded here and must match TRAIN_FILE_NAME.
  !tar -xvzf 'IDRND_FASDB_train.tar.gz'
  
  # Test archive: same two-step download, using $TEST_FILE_ID / $TEST_FILE_NAME.
  !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=$TEST_FILE_ID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$TEST_FILE_ID" -O $TEST_FILE_NAME && rm -rf /tmp/cookies.txt
  # NOTE(review): the archive name is hard-coded here and must match TEST_FILE_NAME.
  !tar -xvzf 'test.tar.gz'

Получаем гистограммы изображений

In [None]:
if EXTRACT:
  # glob returns paths in arbitrary, filesystem-dependent order. Sorting makes
  # the sample order deterministic, so the seeded train/test split selects the
  # same samples on every machine.
  train_real_filelist = sorted(glob.glob(os.path.join(TRAIN_PATH, 'real/*.png')))
  train_spoof_filelist = sorted(glob.glob(os.path.join(TRAIN_PATH, 'spoof/*.png')))
  test_filelist = sorted(glob.glob(os.path.join(TEST_PATH, '*.png')))

  train_real_features = extract_features(train_real_filelist)
  train_spoof_features = extract_features(train_spoof_filelist)
  test_features = extract_features(test_filelist)

  # Label convention: 0 = real, 1 = spoof. Labels are sized from the extracted
  # feature lists so they always match train_data in length.
  train_data = train_real_features + train_spoof_features
  train_labels = [0] * len(train_real_features) + [1] * len(train_spoof_features)


#Pickle

Сериализуем преобразованные объекты или загружаем сохранённые объекты

In [None]:
if EXTRACT:
  # Cache the freshly extracted features so later runs can skip extraction.
  with open('train_data.pkl', 'wb') as f:
    pickle.dump(train_data, f, pickle.HIGHEST_PROTOCOL)
  with open('train_labels.pkl', 'wb') as f:
    pickle.dump(train_labels, f, pickle.HIGHEST_PROTOCOL)

  with open('test_data.pkl', 'wb') as f:
    pickle.dump(test_features, f, pickle.HIGHEST_PROTOCOL)
  with open('test_filelist.pkl', 'wb') as f:
    pickle.dump(test_filelist, f, pickle.HIGHEST_PROTOCOL)

  # The prediction cell reads `test_data`; bind it here as well so both
  # branches leave the same set of names defined.
  test_data = test_features
else:
  # NOTE: pickle.load executes code embedded in the file; only load caches
  # that were produced by this notebook.
  with open('train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
  with open('train_labels.pkl', 'rb') as f:
    train_labels = pickle.load(f)

  with open('test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)
  with open('test_filelist.pkl', 'rb') as f:
    test_filelist = pickle.load(f)


#Dataset split

Разделение выборки на тренировочную и тестовую

In [None]:
# Hold out 25% of the labelled samples for evaluation; the split is seeded by STATE.
trainData, testData, trainLabels, testLabels = train_test_split(
    np.array(train_data),
    train_labels,
    test_size=0.25,
    random_state=STATE,
)

In [None]:
# Cross-validation splitter for the tuning cells: 5 stratified folds, repeated
# 3 times, seeded by STATE.
cv_method = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=STATE,
)

#Tuning

In [None]:
def score_model(model, data, target):
  """Print the predictions, classification report and macro F1 of a fitted model.

  Args:
    model: Fitted estimator exposing ``predict``.
    data: Feature matrix to predict on.
    target: True labels for ``data``.

  Returns:
    None; all results are printed.
  """
  # The former ``predict_proba`` call was dropped: its result was never used, it
  # doubled the inference cost and it failed for estimators without that method.
  predictions = model.predict(data)
  clf_report = classification_report(target, predictions)
  f_score = f1_score(target, predictions, average='macro')

  print('Predictions: {}'.format(predictions))
  print(clf_report)
  print('F1-score: {}'.format(f_score))

Подбор параметров для SVC

In [None]:
if TUNING:
  # Exhaustive grid over SVC hyper-parameters. probability is fixed to True so
  # the refitted best estimator is built with probability estimates enabled.
  parameters = {
      'kernel': ('linear', 'rbf'),
      'C': [1, 10, 100, 1000],
      'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'auto'],
      'decision_function_shape': ('ovo', 'ovr'),
      'shrinking': (True, False),
      'probability': [True],
  }

  grid_svc = GridSearchCV(SVC(), parameters, refit=True, verbose=3)
  grid_svc.fit(trainData, trainLabels)

  # Keep the winning configuration and the refitted estimator for inspection.
  best_svc_params = grid_svc.best_params_
  best_svc_estimator = grid_svc.best_estimator_
  for found in (best_svc_params, best_svc_estimator):
    print(found)


Подбор параметров для LogisticRegression

In [None]:
if TUNING:
  # Two sub-grids: liblinear is searched with both l1 and l2 penalties, lbfgs
  # only with l2.
  parameters = [
      {
          'penalty': ['l1', 'l2'],
          'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'solver': ['liblinear'],
      },
      {
          'penalty': ['l2'],
          'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'solver': ['lbfgs'],
      },
  ]
  grid_lr = GridSearchCV(LogisticRegression(max_iter=10000), parameters)
  grid_lr.fit(trainData, trainLabels)

  best_lr_params = grid_lr.best_params_
  best_lr_estimator = grid_lr.best_estimator_
  for found in (best_lr_params, best_lr_estimator):
    print(found)

Подбор параметров для KNeighborsClassifier

In [None]:
if TUNING:
  # Grid over the neighbour count and the Minkowski power parameter `p`.
  parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7],
                'p': [1, 2]}
  # param_grid must be the dict defined above; `params_KNN` was never defined
  # and made this cell fail with a NameError.
  grid_knn = GridSearchCV(estimator=KNeighborsClassifier(),
                          param_grid=parameters,
                          cv=cv_method,
                          verbose=3,
                          scoring='accuracy',
                          return_train_score=True)
  grid_knn.fit(trainData, trainLabels)

  best_knn_params = grid_knn.best_params_
  best_knn_estimator = grid_knn.best_estimator_
  print(best_knn_params)
  print(best_knn_estimator)

Подбор параметров для GaussianNB

In [None]:
if TUNING:
  # 500 log-spaced candidates for var_smoothing, from 1 down to 1e-9.
  parameters = {'var_smoothing': np.logspace(0, -9, num=500)}

  # param_grid must be the dict defined above; `params_NB` was never defined
  # and made this cell fail with a NameError.
  grid_nb = GridSearchCV(estimator=GaussianNB(),
                        param_grid=parameters,
                        cv=cv_method,
                        verbose=5,
                        scoring='accuracy')
  grid_nb.fit(trainData, trainLabels)

  best_nb_params = grid_nb.best_params_
  best_nb_estimator = grid_nb.best_estimator_
  print(best_nb_params)
  print(best_nb_estimator)

Подбор параметров для DecisionTreeClassifier

In [None]:
if TUNING:
  # Exhaustive grid over split criterion, splitter strategy, leaf/split sizes
  # and tree depth; the tree itself is seeded with STATE.
  parameters = {
      'criterion': ['gini', 'entropy'],
      'splitter': ['best', 'random'],
      'min_samples_leaf': range(1, 30, 5),
      'min_samples_split': range(2, 30, 5),
      'max_depth': range(1, 30),
  }

  grid_dt = GridSearchCV(
      estimator=DecisionTreeClassifier(random_state=STATE),
      param_grid=parameters,
      cv=cv_method,
      verbose=5,
      scoring='accuracy',
  )
  grid_dt.fit(trainData, trainLabels)

  best_dt_params = grid_dt.best_params_
  best_dt_estimator = grid_dt.best_estimator_
  for found in (best_dt_params, best_dt_estimator):
    print(found)

Подбор параметров для RandomForestClassifier

In [None]:
if TUNING:
  # Depth candidates 10, 20, ..., 110 plus None (unlimited depth).
  max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
  max_depth.append(None)
  # 'auto' was dropped from max_features: for classifiers it was only an alias
  # of 'sqrt' and recent scikit-learn releases reject it.
  parameters = {'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
                'max_features': ['sqrt'],
                'max_depth': max_depth,
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'bootstrap': [True, False]}
  # Randomised search: 100 sampled configurations, seeded with the shared STATE
  # (same value as the former literal 42).
  grid_rf = RandomizedSearchCV(estimator=RandomForestClassifier(),
                              param_distributions=parameters,
                              n_iter=100,
                              cv=cv_method,
                              verbose=2,
                              random_state=STATE,
                              n_jobs=14)
  grid_rf.fit(trainData, trainLabels)
  best_rf_params = grid_rf.best_params_
  best_rf_estimator = grid_rf.best_estimator_
  print(best_rf_params)
  print(best_rf_estimator)

#Models

## SVC

In [None]:
# RBF-kernel SVC with fixed hyper-parameters (presumably the grid-search
# winner - confirm); probability=True is needed for predict_proba.
svc_model = SVC(
    kernel='rbf',
    C=1000,
    gamma=1,
    decision_function_shape='ovo',
    shrinking=True,
    probability=True,
)
svc_model.fit(X=trainData, y=trainLabels)
# Accuracy on the held-out split (shown as the cell output).
svc_model.score(X=testData, y=testLabels)

0.9884337349397591

In [None]:
# Detailed report for the SVC on the held-out split.
score_model(model=svc_model, data=testData, target=testLabels)

Predictions: [1 1 1 ... 1 1 1]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       314
           1       0.99      1.00      0.99      1761

    accuracy                           0.99      2075
   macro avg       0.98      0.97      0.98      2075
weighted avg       0.99      0.99      0.99      2075

F1-score: 0.9772457278625606


## Logistic Regression

In [None]:
# L1-regularised logistic regression solved with liblinear.
lr_model = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    C=100,
    max_iter=10000,
)
lr_model.fit(X=trainData, y=trainLabels)
# Accuracy on the held-out split (shown as the cell output).
lr_model.score(X=testData, y=testLabels)

0.971566265060241

In [None]:
# Detailed report for logistic regression on the held-out split.
score_model(model=lr_model, data=testData, target=testLabels)

Predictions: [1 1 1 ... 1 1 1]
              precision    recall  f1-score   support

           0       0.91      0.90      0.91       314
           1       0.98      0.98      0.98      1761

    accuracy                           0.97      2075
   macro avg       0.95      0.94      0.94      2075
weighted avg       0.97      0.97      0.97      2075

F1-score: 0.944284426353728


## KNN

In [None]:
# 1-nearest-neighbour classifier using the p=1 (Manhattan) Minkowski metric.
knn_model = KNeighborsClassifier(
    n_neighbors=1,
    p=1,
)
knn_model.fit(X=trainData, y=trainLabels)
# Accuracy on the held-out split (shown as the cell output).
knn_model.score(X=testData, y=testLabels)

0.984578313253012

In [None]:
# Detailed report for KNN on the held-out split.
score_model(model=knn_model, data=testData, target=testLabels)

Predictions: [1 1 1 ... 1 1 1]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       314
           1       0.99      0.99      0.99      1761

    accuracy                           0.98      2075
   macro avg       0.97      0.98      0.97      2075
weighted avg       0.98      0.98      0.98      2075

F1-score: 0.9702895515870033


## Decision Tree

In [None]:
# Entropy-based decision tree limited to depth 15, seeded with STATE.
dt_model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=STATE,
)
dt_model.fit(X=trainData, y=trainLabels)
# Accuracy on the held-out split (shown as the cell output).
dt_model.score(X=testData, y=testLabels)

0.9224096385542169

In [None]:
# Detailed report for the decision tree on the held-out split.
score_model(model=dt_model, data=testData, target=testLabels)

Predictions: [1 0 1 ... 1 1 1]
              precision    recall  f1-score   support

           0       0.75      0.73      0.74       314
           1       0.95      0.96      0.95      1761

    accuracy                           0.92      2075
   macro avg       0.85      0.84      0.85      2075
weighted avg       0.92      0.92      0.92      2075

F1-score: 0.8467448153804302


## Random Forest

In [None]:
# Random forest of 400 unpruned trees built without bootstrapping.
# random_state=STATE makes the fit repeatable, consistent with the seeded
# decision tree and train/test split; without it every run gave a different
# forest and score.
rf_model = RandomForestClassifier(
    n_estimators=400,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=None,
    bootstrap=False,
    random_state=STATE,
)
rf_model.fit(trainData, trainLabels)
# Accuracy on the held-out split (shown as the cell output).
rf_model.score(testData, testLabels)

0.9730120481927711

In [None]:
# Detailed report for the random forest on the held-out split.
score_model(model=rf_model, data=testData, target=testLabels)

Predictions: [1 1 1 ... 1 1 1]
              precision    recall  f1-score   support

           0       0.99      0.83      0.90       314
           1       0.97      1.00      0.98      1761

    accuracy                           0.97      2075
   macro avg       0.98      0.91      0.94      2075
weighted avg       0.97      0.97      0.97      2075

F1-score: 0.9437183476249336


#Ensemble

## Stacking

In [None]:
# Base learners of the stack: the five models configured above, each paired
# with a display name.
estimators = [
    ('Support Vector Classification', svc_model),
    ('Logistic Regression', lr_model),
    ('KNN', knn_model),
    ('Decision Tree', dt_model),
    ('Random Forest', rf_model),
]
# A gradient-boosting meta-learner combines the base learners' outputs.
classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=GradientBoostingClassifier(),
)
classifier.fit(X=trainData, y=trainLabels)
# Accuracy on the held-out split (shown as the cell output).
classifier.score(X=testData, y=testLabels)

0.9918072289156626

In [None]:
# Detailed report for the stacked ensemble on the held-out split.
score_model(model=classifier, data=testData, target=testLabels)

Predictions: [1 1 1 ... 1 1 1]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       314
           1       1.00      0.99      1.00      1761

    accuracy                           0.99      2075
   macro avg       0.98      0.98      0.98      2075
weighted avg       0.99      0.99      0.99      2075

F1-score: 0.9840723995793578


#Test

In [None]:
# Class probabilities for the unlabelled test images.
probs = classifier.predict_proba(X=test_data)
# Column 0 belongs to label 0, which the training labels use for "real".
real_scores = probs[:, 0]

In [None]:
# Write one "<file name>,<probability of the real class>" line per test image.
# The loop must read `real_scores` (computed from predict_proba); the former
# `scores` name was never defined and raised a NameError.
with open('results.txt', 'w') as f:
    for filepath, score in zip(test_filelist, real_scores):
        f.write('{},{}\n'.format(filepath.split('/')[-1], score))