# **Load Data + Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import loadtxt
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score,confusion_matrix
import pickle
from tqdm import tqdm
import os
random_state = 42

In [2]:
# Load the pre-extracted feature arrays for the two classes ("fight" and
# "nofight"). Change these paths to point at your own dataset location.
# NOTE(review): the directory name suggests 3D-ResNeXt-101 features over
# 2-second clips -- confirm against the feature-extraction step.
fidf = np.load(".././3Dresnext101_2s_data_feature/fight.npy")
nofidf = np.load(".././3Dresnext101_2s_data_feature/nofight.npy") 

In [3]:
# Sanity check: display the shapes of the fight / no-fight feature arrays.
fidf.shape, nofidf.shape

((150, 4, 2048), (150, 4, 2048))

In [4]:
# Collapse every sample's trailing feature axes into one flat vector so each
# sample becomes a single row: (n_samples, d1, d2, ...) -> (n_samples, d1*d2*...).
# Each array is reshaped on its own, so no sample is silently dropped (or an
# IndexError raised) when the two classes hold different numbers of samples,
# which the previous shared loop over range(fidf.shape[0]) could do.
# reshape uses C order, matching ndarray.flatten().
fidf_array = fidf.reshape(fidf.shape[0], int(np.prod(fidf.shape[1:])))
nofidf_array = nofidf.reshape(nofidf.shape[0], int(np.prod(nofidf.shape[1:])))

In [5]:
# Split both classes in a single call so they share the same shuffled row
# indices (train_test_split requires the two arrays to have equal length).
fi_train, fi_test, nofi_train, nofi_test = train_test_split(
    fidf_array,
    nofidf_array,
    test_size=0.13,
    random_state=random_state,
)

# Fight rows come first, no-fight rows second; labels are fight = 1, no-fight = 0.
x_train = np.concatenate((fi_train, nofi_train), axis=0)
y_train = np.concatenate(
    (np.ones(len(fi_train), dtype=int), np.zeros(len(nofi_train), dtype=int))
)
x_test = np.concatenate((fi_test, nofi_test), axis=0)
y_test = np.concatenate(
    (np.ones(len(fi_test), dtype=int), np.zeros(len(nofi_test), dtype=int))
)

In [6]:
# Shapes of the stacked train/test feature matrices and their label vectors.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((260, 8192), (40, 8192), (260,), (40,))

In [7]:
# Shape of the test feature matrix (one row per test sample).
x_test.shape

(40, 8192)

# **Running Model**

In [8]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def print_score(clf, X_train, y_train, X_test, y_test):
    '''
    Print accuracy, classification report and confusion matrix of a classifier.

    Three sections are printed in order: metrics on the training set, the mean
    and standard deviation of 10-fold cross-validated accuracy on the training
    set, and metrics on the test set.

    Parameters
    ----------
    clf : estimator
        Already-fitted classifier exposing ``predict``; it is also handed to
        ``cross_val_score`` for the cross-validation section.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels.

    Returns
    -------
    None
        Everything is written to stdout.
    '''
    # --- training performance ---
    # Predict once and reuse: the three metrics below share the same predictions.
    train_pred = clf.predict(X_train)
    print("Train Result:\n")
    print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, train_pred)))

    print("Classification Report: \n {}\n".format(classification_report(y_train, train_pred)))
    print("Train Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, train_pred)))

    res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
    print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

    # --- test performance ---
    test_pred = clf.predict(X_test)
    print("Test Result:\n")
    print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, test_pred)))
    print("Classification Report: \n {}\n".format(classification_report(y_test, test_pred)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, test_pred)))

In [9]:
from sklearn.metrics import classification_report, confusion_matrix
def get_score(clf, X_test, y_test):
    """Return per-class precision, recall, F1 and the confusion matrix.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict``.
    X_test, y_test : array-like
        Evaluation features and their true labels (0 = no-fight, 1 = fight).

    Returns
    -------
    precision, recall, f1 : np.ndarray of shape (2,)
        Scores ordered as ``[class 0, class 1]``.
    cfmat : np.ndarray of shape (2, 2)
        Confusion matrix with rows/columns ordered as ``[0, 1]``.
    """
    # Pin the label set so the report always has '0' and '1' entries and the
    # confusion matrix is always 2x2, even when one class is absent from both
    # y_test and the predictions (otherwise report['1'] raises KeyError).
    class_labels = [0, 1]
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, labels=class_labels, output_dict=True)
    per_class = [report[str(label)] for label in class_labels]
    precision = np.array([entry['precision'] for entry in per_class])
    recall = np.array([entry['recall'] for entry in per_class])
    # Named f1 (not f1_score) so the sklearn.metrics.f1_score import is not shadowed.
    f1 = np.array([entry['f1-score'] for entry in per_class])
    cfmat = confusion_matrix(y_test, y_pred, labels=class_labels)
    return precision, recall, f1, cfmat

In [10]:
# Names of the class-balancing strategies. model_run_data_imbalance prefixes
# cached model file names with the strategy name when it is one of these.
pre_data_types = ['smote','bootstrap']

In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
def get_imbalance_data(fi_train_,nofi_train_,rate,balance_type=None):
    """Build a training set in which only a fraction of the fight samples is kept.

    Parameters
    ----------
    fi_train_ : np.ndarray
        Fight-class training rows.
    nofi_train_ : np.ndarray
        No-fight-class training rows (always used in full).
    rate : float
        Fraction of ``fi_train_`` to keep. ``1 - rate`` is passed to
        ``train_test_split`` as ``test_size``, so it must be a valid test size.
    balance_type : {None, 'bootstrap', 'smote'}, optional
        ``'bootstrap'`` resamples the kept fight rows up to the number of
        no-fight rows; ``'smote'`` runs SMOTE on the stacked data; any other
        value leaves the data imbalanced.

    Returns
    -------
    x_train, y_train : np.ndarray
        Stacked features (fight rows first) and labels (fight = 1, no-fight = 0).
    """
    # Keep the "train" side of the split as the retained fight subset.
    temp_train, _ = train_test_split(fi_train_,test_size=1-rate,random_state=random_state)
    if balance_type == 'bootstrap':
        # Size the bootstrap from the no-fight rows that were actually passed in,
        # not from the notebook-level `nofi_train` global.
        temp_train = resample(temp_train,n_samples=len(nofi_train_),random_state=random_state)
    x_train = np.vstack([temp_train,nofi_train_])
    y_train = np.hstack([np.full(len(temp_train),1),np.full(len(nofi_train_),0)])
    if balance_type == 'smote':
        # Use the shared seed rather than a second hard-coded 42.
        smote = SMOTE(random_state=random_state)
        x_train, y_train = smote.fit_resample(x_train,y_train)
    return x_train, y_train

In [12]:
def model_run_data_imbalance(model,from_rate, to_rate = 1,sep = 0.11,save_folder='./',balance_type=None):
    """Train (or load cached) models at several fight-class rates and score them.

    For every rate in ``np.arange(from_rate, to_rate, sep)`` (rounded to two
    decimals) a training set is built, every estimator in ``model`` is fitted
    on it and pickled, and the estimator is scored on the notebook-level
    ``x_test`` / ``y_test``. If a pickle for that (balance type, rate, model)
    already exists it is loaded instead of refitting, so changing an
    estimator's hyper-parameters requires deleting the stale ``.sav`` file.

    Parameters
    ----------
    model : dict[str, estimator]
        Maps a model name to an unfitted estimator.
    from_rate, to_rate, sep : float
        Start, exclusive stop and step of the rate sweep. A rate close to 1
        uses the full ``x_train`` / ``y_train``; a rate above 1 stops the sweep.
    save_folder : str
        Prefix that the pickle file name is appended to (keep the trailing
        path separator when it is a folder). Missing directories are created.
    balance_type : {None, 'smote', 'bootstrap'}, optional
        Forwarded to ``get_imbalance_data``.

    Returns
    -------
    df_p, df_r, df_f1 : pd.DataFrame
        Precision, recall and F1 tables with columns
        ``['no-fight', 'fight', 'rate', 'model']``, one row per (rate, model).
    cf_list : list of np.ndarray
        Confusion matrices in the same order as the table rows.
    """
    assert from_rate is not None, 'from_rate must have value'
    columns = ['no-fight', 'fight', 'rate', 'model']
    # Collect plain records and build each DataFrame once at the end:
    # DataFrame.append is quadratic in a loop and was removed in pandas 2.0.
    precision_rows, recall_rows, f1_rows = [], [], []
    cf_list = []
    for rate in np.arange(from_rate, to_rate, sep):
        rate = np.round(rate, 2)
        if np.isclose(rate, 1):
            x_tr, y_tr = x_train, y_train
        elif rate < 1:
            x_tr, y_tr = get_imbalance_data(fi_train, nofi_train, rate, balance_type)
        else:
            print('Rate không được lớn hơn 1')
            break
        x_te, y_te = x_test, y_test
        for key, estimator in model.items():
            filename = str(rate).replace('.', '') + '_' + key + '.sav'
            if balance_type in pre_data_types:
                filename = balance_type + '_' + filename
            path = save_folder + filename
            if os.path.isfile(path):
                # NOTE(security): pickle.load can execute arbitrary code; only
                # load .sav files that this notebook wrote itself.
                with open(path, 'rb') as cache_file:
                    clf = pickle.load(cache_file)
            else:
                clf = estimator
                clf.fit(x_tr, y_tr)
                parent_dir = os.path.dirname(path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
                with open(path, 'wb') as cache_file:
                    pickle.dump(clf, cache_file)
            p, r, f1, cfmat = get_score(clf, x_te, y_te)
            cf_list.append(cfmat)
            for rows, scores in ((precision_rows, p), (recall_rows, r), (f1_rows, f1)):
                scores = np.round(scores, 3)
                rows.append({
                    columns[0]: scores[0],
                    columns[1]: scores[1],
                    columns[2]: rate,
                    columns[3]: key,
                })
    df_p = pd.DataFrame(precision_rows, columns=columns)
    df_r = pd.DataFrame(recall_rows, columns=columns)
    df_f1 = pd.DataFrame(f1_rows, columns=columns)
    return df_p, df_r, df_f1, cf_list

In [13]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Add or delete model you want.
# Each key is used as the model name in the score tables and as part of the
# cached pickle file name, so keep keys unique and file-system friendly.
model = {
    'bernoulliNB' : BernoulliNB(binarize=0.0),
    'svm_poly': SVC(kernel='poly',random_state=random_state),
    'decision_tree': DecisionTreeClassifier(criterion = "gini", max_depth = 10,random_state=random_state),
    'sgd_05': SGDClassifier(alpha = 0.5,random_state=random_state),
    'randomforest': RandomForestClassifier(n_estimators=200,max_depth=10,criterion='gini',random_state=random_state,n_jobs=-1),
    'balanced_bag': BalancedBaggingClassifier(random_state=random_state),
    'bag_id3': BaggingClassifier(DecisionTreeClassifier(random_state=random_state), max_samples=0.8, max_features=0.8,random_state=random_state),
    'adaboost': AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state), n_estimators=10, learning_rate=1, random_state=random_state),
    # Seeded with the shared random_state like every other stochastic estimator above.
    'xgboost': XGBClassifier(max_depth=20, n_estimators=1000, learning_rate=0.3, n_jobs=-1, random_state=random_state),
    'knn_n1': KNeighborsClassifier(n_neighbors=1),
    'knn_n17': KNeighborsClassifier(n_neighbors=17),
}

***

In [14]:
# Full sweep with SMOTE balancing: rates 0.1, 0.2, ... up to (but excluding) 1.01,
# with fitted models cached under ./model_with_smote/.
df_precision, df_recall, df_f1s, cfmatrix_list = model_run_data_imbalance(
    model,
    from_rate=0.1,
    to_rate=1.01,
    sep=0.1,
    save_folder='./model_with_smote/',
    balance_type='smote',
)

In [15]:
def save_scores(folder,dic_scores):
    """Write every score table to ``<folder><key>.csv``.

    Parameters
    ----------
    folder : str
        Path prefix the file name is appended to verbatim. It can be a
        directory with a trailing separator or a directory plus a file-name
        prefix such as ``'./scores/smote_'``.
    dic_scores : dict[str, pd.DataFrame]
        Maps a score name (used as the file-name stem) to its table.

    Returns
    -------
    None
    """
    for key, frame in dic_scores.items():
        target = folder + key + '.csv'
        # Create the destination directory on demand so a fresh checkout
        # does not fail with a missing-directory error.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        frame.to_csv(target)

In [16]:
# Score tables keyed by the file-name stem that save_scores appends to its prefix.
dic_scores = dict(
    zip(
        ('precision_score', 'recall_score', 'f1-score'),
        (df_precision, df_recall, df_f1s),
    )
)

In [17]:
# Writes one CSV per entry of dic_scores, named ./scores/smote_<key>.csv.
save_scores('./scores/smote_',dic_scores)

In [18]:
# Take the names straight from the dict keys (insertion order is preserved).
# Building an array from model.items() made numpy try to coerce the estimator
# objects as well, which is fragile and yields an object array at best.
model_names = np.array(list(model.keys()))

In [19]:
# Persist the model names to model_names.npy in the current working directory.
np.save('model_names.npy',model_names)

***

In [17]:
# Smoke test: one model (KNN, k=17) at a single rate (0.1) with SMOTE.
# NOTE: this rebinds df_precision / df_recall / df_f1s / cfmatrix_list, so the
# results of the full sweep are replaced in the kernel once this cell runs.
obj = {'knn_n17': KNeighborsClassifier(n_neighbors=17)}
df_precision, df_recall, df_f1s, cfmatrix_list = model_run_data_imbalance(
    obj,
    from_rate=0.1,
    to_rate=0.2,
    sep=0.1,
    save_folder='./model_with_smote/',
    balance_type='smote',
)

In [18]:
# Confusion matrices from the most recent run, one per (rate, model) pair.
cfmatrix_list

[array([[10, 10],
        [ 0, 20]], dtype=int64)]

***

## Finding a suitable K for KNN (Tìm K phù hợp cho KNN)

In [24]:
# Imbalanced training set that keeps 10% of the fight samples, with no re-balancing.
# NOTE(review): the K sweep in this section fits on x_train / y_train, not on
# xt_train / yt_train -- confirm which training set was intended.
xt_train, yt_train = get_imbalance_data(
    fi_train,
    nofi_train,
    rate=0.1,
    balance_type=None,
)

In [43]:
# Sweep k = 1..19 for KNN, recording per-class [class 0, class 1] precision,
# recall and F1 on the test set. Entry i of each list belongs to k = i + 1.
# NOTE(review): k is being selected on the test set; a validation split or
# cross-validation would avoid tuning on held-out data.
pre, rcl, f1s = [], [], []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    cr = classification_report(y_test, knn.predict(x_test), output_dict=True)
    for store, metric in ((pre, 'precision'), (rcl, 'recall'), (f1s, 'f1-score')):
        store.append([cr['0'][metric], cr['1'][metric]])
    # Echo the scores just recorded for this k, one line per class.
    for col, tag in enumerate(('0: ', '1: ')):
        print(tag, pre[-1][col], rcl[-1][col], f1s[-1][col])

0:  0.875 0.7 0.7777777777777777
1:  0.75 0.9 0.8181818181818182
0:  0.68 0.85 0.7555555555555556
1:  0.8 0.6 0.6857142857142857
0:  0.8666666666666667 0.65 0.7428571428571429
1:  0.72 0.9 0.7999999999999999
0:  0.6956521739130435 0.8 0.7441860465116279
1:  0.7647058823529411 0.65 0.7027027027027027
0:  0.8125 0.65 0.7222222222222223
1:  0.7083333333333334 0.85 0.7727272727272727
0:  0.72 0.9 0.7999999999999999
1:  0.8666666666666667 0.65 0.7428571428571429
0:  0.8947368421052632 0.85 0.8717948717948718
1:  0.8571428571428571 0.9 0.8780487804878048
0:  0.782608695652174 0.9 0.8372093023255814
1:  0.8823529411764706 0.75 0.8108108108108107
0:  0.8888888888888888 0.8 0.8421052631578948
1:  0.8181818181818182 0.9 0.8571428571428572
0:  0.8571428571428571 0.9 0.8780487804878048
1:  0.8947368421052632 0.85 0.8717948717948718
0:  0.9444444444444444 0.85 0.8947368421052632
1:  0.8636363636363636 0.95 0.9047619047619048
0:  0.85 0.85 0.85
1:  0.85 0.85 0.85
0:  0.8947368421052632 0.85 0.871794

In [45]:
# Turn the per-k score lists into arrays (one row per k, one column per class).
pre, rcl, f1s = (np.array(scores) for scores in (pre, rcl, f1s))

In [46]:
# Row index of the best precision for class 0 and class 1.
# Indices are zero-based while the sweep starts at k = 1, so k = index + 1.
tuple(np.argmax(pre, axis=0))

(10, 17)

In [47]:
# Row index of the best recall for class 0 and class 1.
# Indices are zero-based while the sweep starts at k = 1, so k = index + 1.
tuple(np.argmax(rcl, axis=0))

(17, 10)

In [48]:
# Row index of the best F1 for class 0 and class 1.
# Indices are zero-based while the sweep starts at k = 1, so k = index + 1.
tuple(np.argmax(f1s, axis=0))

(10, 10)