In [0]:
!pip install pyreadr
!pip install PyDrive

In [0]:
import numpy as np
import os
import pandas as pd
import time
import xgboost as xgb
import pyreadr
import scipy.io as scio
from collections import OrderedDict
from google.colab import auth
from oauth2client.client import GoogleCredentials 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from scipy.io import loadmat
from scipy.spatial.distance import cdist
from sklearn import datasets
from sklearn import metrics 
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV 
from sklearn.metrics import accuracy_score,classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from time import time

# ADVANCED MODEL


# Instruction
1. Upload training data file in the google drive
2. Get shareable link of the data file
3. Get file ID (the file ID can be obstained from the link.) 
4. replace the file ID in corresponding code. (Detailed instruction also come with the code throughout the file.)

##Part 0: set up control and work directories, extract paths.

In [0]:
####Authenticate the google drive account
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
from sklearn.model_selection import train_test_split, GridSearchCV #Perforing grid search 
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

## Part 1: Import Data


In [0]:
#####get the file shareable link = https://drive.google.com/open?id=1oliwM2-sH8CD_3q3yUbLF836U9A1-Lc0
#####The file ID is the letter after "id=".

In [0]:
#####please replace the id of your file
download = drive.CreateFile({'id': '1oliwM2-sH8CD_3q3yUbLF836U9A1-Lc0'}) 
download.GetContentFile('train_set.zip')
!unzip train_set.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: __MACOSX/train_set/points/._1901.mat  
  inflating: train_set/points/1929.mat  
  inflating: __MACOSX/train_set/points/._1929.mat  
  inflating: train_set/points/0389.mat  
  inflating: __MACOSX/train_set/points/._0389.mat  
  inflating: train_set/points/1097.mat  
  inflating: __MACOSX/train_set/points/._1097.mat  
  inflating: train_set/points/1083.mat  
  inflating: __MACOSX/train_set/points/._1083.mat  
  inflating: train_set/points/0410.mat  
  inflating: __MACOSX/train_set/points/._0410.mat  
  inflating: train_set/points/2207.mat  
  inflating: __MACOSX/train_set/points/._2207.mat  
  inflating: train_set/points/1068.mat  
  inflating: __MACOSX/train_set/points/._1068.mat  
  inflating: train_set/points/0376.mat  
  inflating: __MACOSX/train_set/points/._0376.mat  
  inflating: train_set/points/0362.mat  
  inflating: __MACOSX/train_set/points/._0362.mat  
  inflating: train_set/points/2213.mat  
  inf

In [0]:
#####Run the code, and go to the URL in th output, enter the authorization code, done
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# I. Our Advanced Model

We are using PCA with Bagging-SVM as our Advacned Model

Notation on These functions:
1. extract_mat(): 

  **TAKES IN** a list returned by a loadmat function. 

  **RETURN** an array that have all the points in the mat file.

2. get_f(): 

  **TAKES IN** a direction that contains a _single_ .mat file. 

  **RETURN** an ndarray contains the pairwise euclidian distance between the coordinate contains in the .mat file.

3. feature_extraction(): 

  **TAKES IN** a direction that contains the direction that contains _all_ the .mat file for the train. 
  
  **RETURNS** a ndarray contains the train_x with features set as pairwise euclidian distance between the coordinate contains in the .mat file.

4. f_pca(): 

  **TAKES IN** a ndarray contains all the train_x.

  **RETURNS** a ndarray contains decomposed x and the decompositon model.

5. BaggingSVM_w_pca(): 

  **TAKES IN** two ndarrays as train_x(without decomposition) and train_y. 
  
  **RETURNS** the SVM-Bagging model trained with decomposed-train_x and train_y.

6. claim_possible_acc_BSVM(): 

  **TAKES IN** three arguments which is the direction that contains _all_the .mat file for x, the direction that contains a file named *label.csv* that have a column named as emotion_idx as the train_y. 
  
  **RETURNS** the possible accuracy of the Logistic-Bagging model.

In [0]:
def extract_mat(x):
    v = list(x.keys())[-1]
    return x[v]

def get_f(file_dir):
    '''Argument: 
        file_dir: The whole direction contain the exact mat file
       
       Return:
        a np.array contains the featrues of single X'''
    a = extract_mat(loadmat(file_dir))
    b = cdist(a, a)
    r = b[np.triu_indices(b.shape[1], 1)].flatten()
    return r

def f_pca(x):
    my_pca = PCA(n_components = 130)
    new_X = my_pca.fit_transform(x)
    compo = sum(my_pca.explained_variance_ratio_)
    print(f'The Decomposition take up {compo: 0.2f}% Information of original Data')
    
    return new_X, my_pca

def feature_extraction(dir_x):
    if (dir_x[-1] != '/'):
        dir_x = dir_x + '/'
    
    fea_start = time.time()
    
    filenames = list(os.listdir(dir_x))
    filenames.sort()
    X = np.array(list(map(get_f, ((dir_x + i) for i in filenames))))
    
    fea_end = time.time()
    fea_time = fea_end - fea_start
    
    print('Feature Extraction Completed!')
    print(f'Feature Extraction Cost: {fea_time: 0.2f} Seconds')
    return X

def BaggingSVM_w_pca(train_X, train_y):
    
    train_X, pca_mode = f_pca(train_X)    
    
    start_SVM = time.time()
    S_svm = SVC(C = 0.1,
                kernel = 'linear',
                shrinking = True,
                decision_function_shape = 'ovo')
    Bagg_SVM = BaggingClassifier(S_svm,
                                 n_estimators = 80,
                                 n_jobs = 5,
                                 bootstrap_features = True)
    Bagg_SVM.fit(train_X, train_y)
    end_SVM = time.time()
    
    Train_time = end_SVM - start_SVM
    print(f'The Time for train is: {Train_time: 0.2f} Seconds')
    return Bagg_SVM, pca_mode


def claim_possible_acc_BSVM(X_path, y_path, n_iter = 1):
    X = feature_extraction(X_path)
    y = pd.read_csv(y_path).emotion_idx
    
    accs = []
    for i in range(n_iter):
        trainx, testx, trainy, testy = train_test_split(X, y, test_size = .2)
        model, pca_mode= BaggingSVM_w_pca(trainx, trainy)
        new_testx = pca_mode.transform(testx)
        testy_hat = model.predict(new_testx)
        accs.append(accuracy_score(testy, testy_hat))
    return np.mean(accs)


In [0]:
# This line can output the Claimed Accuracy
# You don't really need run it
# claim_possible_acc_BSVM('train_set/points', 'train_set/label.csv', 5)

Feature Extraction Completed!
Feature Extraction Cost:  0.95 Seconds
The Decomposition take up  1.00% Information of original Data
The Time for train is:  127.39 Seconds
The Decomposition take up  1.00% Information of original Data
The Time for train is:  109.33 Seconds
The Decomposition take up  1.00% Information of original Data
The Time for train is:  115.86 Seconds
The Decomposition take up  1.00% Information of original Data
The Time for train is:  133.65 Seconds
The Decomposition take up  1.00% Information of original Data
The Time for train is:  112.68 Seconds


0.5256000000000001

Our Advanced Model would have 52.56% Accuracy.

The train time for our Advanced Model is about 110 seconds

You can use The Code below to train the model on the whole train data set


In [0]:
X = feature_extraction('train_set/points')
y = pd.read_csv('train_set/label.csv').emotion_idx
advanced_model, pca_sub_model = BaggingSVM_w_pca(X, y)

Feature Extraction Completed!
Feature Extraction Cost:  0.82 Seconds
The Decomposition take up  1.00% Information of original Data
The Time for train is:  322.94 Seconds


Then, you can use the code below to test on the test set

In [0]:
# X_test = feature_extraction('''your test set direction''')
# X_test_decomp = pca_sub_model.transform(X_test)
# y_predict = advanced_model.predict(X_test_decomp)

# II. XGBOOST Model

## Part 0: Feature Extration and Train/Test Split

In [0]:
##### Importing the fidusial points 
import scipy.io as scio
from collections import OrderedDict 
points_path = 'train_set/points'
points = [p for p in sorted(os.listdir(points_path))]
all_points = []
for p in points:
  poiFile = os.path.join(points_path, p)
  poi = scio.loadmat(poiFile)
  poi = OrderedDict(poi)
  all_points.append(poi.popitem()[1])
y = pd.read_csv('train_set/label.csv')['emotion_idx']

print('success')


success


In [0]:
##### Calculating pairwise distance 
pair_dist = []
for i in range(len(all_points)):
  pair_dist.append(metrics.pairwise_distances(all_points[i])[np.triu_indices(78)])

##### Split train_set & test_set 
points_train, points_test, y_train, y_test = train_test_split(pair_dist, y, random_state=42, test_size=0.2)
print('success')

success


In [0]:
##### Feature Extration/Calculating pairwise distance and the time for feature extration 
import time

allpoints_train, allpoints_test, y_train, y_test = train_test_split(all_points, y, random_state=42, test_size=0.2)
print('success')


train_pair_dist = []
for i in range(len(allpoints_train)):
  pair_dist.append(metrics.pairwise_distances(allpoints_train[i])[np.triu_indices(78)])

test_pair_dist = []
for i in range(len(allpoints_test)):
  pair_dist.append(metrics.pairwise_distances(allpoints_test[i])[np.triu_indices(78)])


start = time.time()
pair_dist = []
for i in range(len(all_points)):
  pair_dist.append(metrics.pairwise_distances(all_points[i])[np.triu_indices(78)])
finish = time.time()
print("Time on feature selection done in %0.3fs" % (finish-start))

start = time.time()
train_pair_dist = []
for i in range(len(allpoints_train)):
  pair_dist.append(metrics.pairwise_distances(allpoints_train[i])[np.triu_indices(78)])
finish = time.time()
print("Time on feature selection training set done in %0.3fs" % (finish-start))

start = time.time()
test_pair_dist = []
for i in range(len(allpoints_test)):
  pair_dist.append(metrics.pairwise_distances(allpoints_test[i])[np.triu_indices(78)])
finish = time.time()
print("Time on feature selection test set done in %0.3fs" % (finish-start))


success
Time on feature selection done in 1.098s
Time on feature selection training set done in 0.922s
Time on feature selection test set done in 0.277s


## Part 1: XGBoost Training 


In [0]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
import time
import numpy as np

In [0]:
def modelfit(alg, dtrain, predictors, cv_folds=10):
  #Fit the algorithm on the data
  alg.fit(dtrain, predictors)

  #Predict training set:
  dtrain_predictions = alg.predict(dtrain)
  dtrain_predprob = alg.predict_proba(dtrain)[:,1]

  #Print model report:
  print("\nModel Report")
  print("Accuracy : %.4g" % metrics.accuracy_score(predictors, dtrain_predictions))

## Part 2:XGBoost Default setting

In [0]:
####XGBOOST base model with default setting
start = time.time()
xgb_base = XGBClassifier(
 objective= 'multi:softmax',
 num_class= 22,
 seed=1000)

modelfit(xgb_base, np.array(points_train), np.array(y_train))
finish = time.time()
print("Prediction on train_set done in %0.3fs" % (finish-start))


Model Report
Accuracy : 1
Prediction on train_set done in 807.874s


In [0]:
start = time.time()
preds = xgb_base.predict(points_test)
acc_preds = metrics.accuracy_score(preds, y_test)
finish = time.time()
print("Prediction on test_set done in %0.3fs" % (finish - start))
print("Test_set accurarcy is %0.3f" %acc_preds)

Prediction on test_set done in 0.740s
Test_set accurarcy is 0.482


### Tuning Process(comment it out because the process is time comsuming)

In [0]:
####tune hyperparameter

## tune the max_depth and min_child_weight parameter
# param_test1 = { 
#  'max_depth': range(4,5,6),
#  'min_child_weight': range(4,5,6)
# } 
# gsearch1 = GridSearchCV(xgb_base, param_grid = param_test1, scoring ='accuracy', cv = 5)
# gsearch1.fit(np.array(points_train), np.array(y_train))
# best_parameters1 = gsearch1.best_estimator_.get_params()
# for param_name in sorted(param_test1.keys()):
#     print("\t%s: %r" % (param_name, best_parameters1[param_name]))

## use the best_parameter above to xgb2
# start = time.time()
# xgb2 = XGBClassifier(
#  objective= 'multi:softmax',
#  num_class= 22,
#  max_depth=4,
#  min_child_weight=4,
#  seed=1000)
# modelfit(xgb2, np.array(points_train), np.array(y_train))
# finish = time.time()
# print("Prediction on train_set done in %0.3fs" % (finish-start))
# start = time.time()
# preds = xgb2.predict(points_test)
# acc_pred = metrics.accuracy_score(preds, y_test)
# finish = time.time()
# print("Prediction on test_set done in %0.3fs" % (finish - start))
# print("Test_set accurarcy is %0.3f" %acc_pred)

## However, the accuracy is lower than that of the base model, so we keep the same parameter 
## as before, and tune other parameters.

## tune the gamma parameter
# param_test2 = { 
#  'gamma':[i/10.0 for i in range(0,5)]
# } 
# gsearch2 = GridSearchCV(xgb1, param_grid = param_test2, scoring ='accuracy', cv = 5)
# gsearch2.fit(np.array(points_train), np.array(y_train))
# best_parameters2 = gsearch2.best_estimator_.get_params()
# for param_name in sorted(param_test2.keys()):
#     print("\t%s: %r" % (param_name, best_parameters2[param_name]))

# start = time.time()
# xgb3 = XGBClassifier(
#  objective= 'multi:softmax',
#  num_class= 22,
#  gamma=0.4,
#  seed=1000)

# modelfit(xgb3, np.array(points_train), np.array(y_train))
# finish = time.time()
# print("Prediction on train_set done in %0.3fs" % (finish-start))

# start = time.time()
# preds = xgb3.predict(points_test)
# acc_pred = metrics.accuracy_score(preds, y_test)
# finish = time.time()
# print("Prediction on test_set done in %0.3fs" % (finish - start))
# print("Test_set accurarcy is %0.3f" %acc_pred)

## However, the accuracy is lower than that of the base model, so we keep the same parameter 
## as before, and tune other parameters.

## tune the subsample and colsample_bytree parameters
#param_test = {
 #'subsample':[i/10.0 for i in range(6,10)],
 #'colsample_bytree':[i/10.0 for i in range(6,10)]
#}
#gsearch = GridSearchCV(xgb_base, param_grid = param_test, scoring ='accuracy', cv = 5)
#gsearch.fit(np.array(points_train), np.array(y_train))
#best_parameters = gsearch.best_estimator_.get_params()
#for param_name in sorted(param_test.keys()):
    #print("\t%s: %r" % (param_name, best_parameters[param_name]))

# start = time.time()
# xgb4 = XGBClassifier(
#  objective = 'multi:softmax',
#  num_class = 22,
#  seed = 1000,
#  colsample_bytree=0.6,
#  subsample=0.7)

# modelfit(xgb4, np.array(points_train), np.array(y_train))
# finish = time.time()
# print("Prediction on train_set done in %0.3fs" % (finish-start)) 

# start = time.time()
# preds = xgb4.predict(points_test)
# acc_pred = metrics.accuracy_score(preds, y_test)
# finish = time.time()
# print("Prediction on test_set done in %0.3fs" % (finish - start))
# print("Test_set accurarcy is %0.3f" %acc_pred)

## We use the best parameters above because the accuracy increases and the prediction time 
## decreases. Then, we tune other parameter based on the xgb4.

##tune reg_alpha parameter
#param_test4 = {
#  'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
# }
# gsearch4 = GridSearchCV(xgb4, param_grid = param_test4, scoring ='accuracy', cv = 5)
# gsearch4.fit(np.array(points_train), np.array(y_train))
# best_parameters4 = gsearch4.best_estimator_.get_params()
# for param_name in sorted(param_test4.keys()):
#     print("\t%s: %r" % (param_name, best_parameters4[param_name]))

# start = time.time()
# xgb5 = XGBClassifier(
#  objective= 'multi:softmax',
#  num_class= 22,
#  seed=1000,
#  colsample_bytree=0.7,
#  subsample=0.6,
#  reg_alpha=1)

# modelfit(xgb5, np.array(points_train), np.array(y_train))
# finish = time.time()
# print("Prediction on train_set done in %0.3fs" % (finish-start)) 

# start = time.time()
# preds = xgb5.predict(points_test)
# acc_pred = metrics.accuracy_score(preds, y_test)
# finish = time.time()
# print("Prediction on test_set done in %0.3fs" % (finish - start))
# print("Test_set accurarcy is %0.3f" %acc_pred)

##Since the best parameter of reg_alpha=1e-05, and the accuracy is so close to that of the former 
##model, we decide to use the xgb5 as our final model.

## Part 3: The improved XGboost model after tuning the parameters


In [0]:
####XGBOOSTING improved model
start = time.time()
xgb5 = XGBClassifier(
 objective= 'multi:softmax',
 num_class= 22,
 seed=1000,
 colsample_bytree=0.6,
 subsample=0.7,
 reg_alpha=1)

modelfit(xgb5, np.array(points_train), np.array(y_train))
finish = time.time()
print("Prediction on train_set done in %0.3fs" % (finish-start)) 


Model Report
Accuracy : 0.9995
Prediction on train_set done in 483.013s


In [0]:
start = time.time()
preds = xgb5.predict(points_test)
acc_pred = metrics.accuracy_score(preds, y_test)
finish = time.time()
print("Prediction on test_set done in %0.3fs" % (finish - start))
print("Test_set accurarcy is %0.3f" %acc_pred)

Prediction on test_set done in 0.770s
Test_set accurarcy is 0.502


# III. BAGGING-LOG MODEL

Notation on These functions:
1. extract_mat(): 

  **TAKES IN** a list returned by a loadmat function. 
  
  **RETURN** an array that have all the points in the mat file.

2. get_f(): 

  **TAKES IN** a direction that contains a _single_ .mat file. 
  
  **RETURN** an ndarray contains the pairwise euclidian distance between the coordinate contains in the .mat file.

3. feature_extraction(): 
  
  **TAKES IN** a direction that contains the direction that contains _all_ the .mat file for the train. 
  
  **RETURNS** a ndarray contains the train_x with features set as pairwise euclidian distance between the coordinate contains in the .mat file.

4. f_pca(): 

  **TAKES IN** a ndarray contains all the train_x. 
  
  **RETURNS** a ndarray contains decomposed x and the decompositon model.

5. BaggingLR_w_pca(): 

  **TAKES IN** two ndarrays as train_x(without decomposition) and train_y. 
  
  **RETURNS** the Logistic-Bagging model trained with decomposed-train_x and train_y.

6. claim_possible_acc_BL(): 

  **TAKES IN** three arguments which is the direction that contains _all_the .mat file for x, the direction that contains a file named *label.csv* that have a column named as emotion_idx as the train_y. 
  
  **RETURNS** the possible accuracy of the Logistic-Bagging model.



In [0]:
def extract_mat(x):
    v = list(x.keys())[-1]
    return x[v]

def get_f(file_dir):
    '''Argument: 
        file_dir: The whole direction contain the exact mat file
       
       Return:
        a np.array contains the featrues of single X'''
    a = extract_mat(loadmat(file_dir))
    b = cdist(a, a)
    r = b[np.triu_indices(b.shape[1], 1)].flatten()
    return r

def feature_extraction(dir_x):
    if (dir_x[-1] != '/'):
        dir_x = dir_x + '/'
    
    fea_start = time.time()
    
    filenames = list(os.listdir(dir_x))
    filenames.sort()
    X = np.array(list(map(get_f, ((dir_x + i) for i in filenames))))
    
    fea_end = time.time()
    fea_time = fea_end - fea_start
    
    print('Feature Extraction Completed!')
    print(f'Feature Extraction Cost: {fea_time: 0.2f} Seconds')
    return X

def f_pca(x):
    my_pca = PCA(n_components = 130)
    new_X = my_pca.fit_transform(x)
    compo = sum(my_pca.explained_variance_ratio_)*100
    print(f'The Decomposition take up {compo: 0.2f}% Information of original Data')
    
    return new_X, my_pca

def BaggingLR_w_pca(train_X, train_y):
    
    train_X, pca_mode = f_pca(train_X)    
    
    start_lr = time.time()
    lr = LogisticRegression(C = 1,
                            penalty = 'l2',
                            fit_intercept = False)
    Bag_lr = BaggingClassifier(lr,
                               n_estimators = 70,
                               n_jobs = 5,
                               bootstrap_features = True,
                               verbose = 7)
    Bag_lr.fit(train_X, train_y)
    end_lr = time.time()
    
    Train_time = end_lr - start_lr
    print(f'The Time for train is: {Train_time: 0.2f} Seconds')
    return Bag_lr, pca_mode


def claim_possible_acc_BL(X_path, y_path, n_iter = 1):
    X = feature_extraction(X_path)
    y = pd.read_csv(y_path).emotion_idx
    
    accs = []
    for i in range(n_iter):
        trainx, testx, trainy, testy = train_test_split(X, y, test_size = .2)
        model, pca_mode= BaggingLR_w_pca(trainx, trainy)
        new_testx = pca_mode.transform(testx)
        testy_hat = model.predict(new_testx)
        accs.append(accuracy_score(testy, testy_hat))
    return np.mean(accs)

In [0]:
claim_possible_acc_BL('train_set/points', 'train_set/label.csv')

reference: https://www.cnblogs.com/wj-1314/p/10422159.html