In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('data.csv')

In [21]:
X = df.iloc[:,:-1].astype('float32').to_numpy()
y = df['emotion'].to_numpy()

In [22]:
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=.2)

In [24]:
lab_encoding = LabelEncoder()
processed_y_train = lab_encoding.fit_transform(y_train)
processed_y_test = lab_encoding.transform(y_test)

In [25]:
p_line = Pipeline([
    ('MinMax', MinMaxScaler()),
    ('PCA', PCA(n_components=400))
])

In [26]:
processed_X_train = p_line.fit_transform(X_train)
processed_X_test = p_line.transform(X_test)

# Logistic Model

In [9]:
# lr = LogisticRegression(max_iter=9999, n_jobs=-1)
# lr.fit(processed_X_train,processed_y_train)

In [10]:
# lr_pred = lr.predict(processed_X_test)

# SVM Model

In [11]:
# %%time
# svm = SVC()
# svm.fit(processed_X_train,processed_y_train)

In [12]:
# svm_pred = svm.predict(processed_X_test)

# XGBoost Model

In [27]:
%%time
boost = XGBClassifier()
boost.fit(processed_X_train,processed_y_train)

CPU times: user 11min 23s, sys: 918 ms, total: 11min 24s
Wall time: 1min 38s


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [28]:
boost_pred = boost.predict(processed_X_test)

In [None]:
bo

In [29]:
accuracy_score(boost_pred,processed_y_test)

0.8284883720930233

In [35]:
print(classification_report(processed_y_test, boost_pred))

              precision    recall  f1-score   support

           0       0.79      0.82      0.81      1022
           1       0.83      0.80      0.81      1147
           2       1.00      0.20      0.34        64
           3       0.82      0.65      0.72       673
           4       0.85      0.96      0.90      1566

    accuracy                           0.83      4472
   macro avg       0.86      0.69      0.72      4472
weighted avg       0.83      0.83      0.82      4472



In [36]:
lab_encoding = LabelEncoder()
y_ = lab_encoding.fit_transform(y)

In [51]:
label_mapping = dict(zip(lab_encoding.classes_, lab_encoding.transform(lab_encoding.classes_)))
label_mapping

{'Angry': np.int64(0),
 'Fear': np.int64(1),
 'Happy': np.int64(2),
 'Sad': np.int64(3),
 'Suprise': np.int64(4)}

In [37]:
p_line = Pipeline([
    ('MinMax', MinMaxScaler()),
    ('PCA', PCA(n_components=400))
])

In [38]:
X_ = p_line.fit_transform(X)

In [39]:
boost = XGBClassifier()
boost.fit(X_,y_)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [41]:
boost.score(X_,y_)

0.9747907877446573

In [42]:
import pickle

In [43]:
pickle.dump(boost, open('model.pkl','wb'))
pickle.dump(lab_encoding, open('labels.pkl','wb'))
pickle.dump(p_line, open('pipline.pkl','wb'))

In [None]:
p_line.transform

# GradientBoostingClassifier

In [31]:
%%time
gboost = GradientBoostingClassifier()
gboost.fit(processed_X_train,processed_y_train)

CPU times: user 24min 8s, sys: 297 ms, total: 24min 9s
Wall time: 24min 9s


0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [32]:
gboost_pred = gboost.predict(processed_X_test)

In [33]:
accuracy_score(gboost_pred,processed_y_test)

0.5894454382826476

# Meta Learner

In [15]:
# meta_model = GradientBoostingClassifier()

In [16]:
# %%time
# meta_x_train = np.concat((lr_pred.reshape(processed_X_test.shape[0],1),
#                           svm_pred.reshape(processed_X_test.shape[0],1),
#                           boost_pred.reshape(processed_X_test.shape[0],1)
#                          ),axis=1)

In [17]:
# %%time
# meta_model.fit(meta_x_train,processed_y_test)

# Stacking

In [39]:
estimators = [
    ('svc', SVC()),
    ('lgc', LogisticRegression()),
    ('xgboost', XGBClassifier())
             ]

In [40]:
%%time
clf = StackingClassifier(estimators=estimators,
                         final_estimator=GradientBoostingClassifier(),
                         stack_method='predict',
                         cv=10,
                         n_jobs=-1)

CPU times: user 60 μs, sys: 1 μs, total: 61 μs
Wall time: 76.5 μs


In [21]:
# %%time
# clf.fit(processed_X_train,processed_y_train)

CPU times: user 21.3 s, sys: 6.04 s, total: 27.4 s
Wall time: 1h 28min 39s


0,1,2
,estimators,"[('svc', ...), ('lgc', ...), ...]"
,final_estimator,GradientBoostingClassifier()
,cv,10
,stack_method,'predict'
,n_jobs,-1
,passthrough,False
,verbose,0

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [22]:
# clf.score(processed_X_train,processed_y_train)

0.9465982721382289

In [23]:
# clf.score(processed_X_test,processed_y_test)

0.6650107991360691

In [41]:
param_grid = {
    # SVC
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'svc__class_weight': [None, 'balanced'],
    # Logistic Regression
    'lgc__penalty': ['l1', 'l2'],
    'lgc__C': [0.01, 0.1, 1, 10, 100],
    'lgc__solver': ['liblinear', 'saga'],    # allows L1
    'lgc__class_weight': [None, 'balanced'],
    # XGBoost
    'xgboost__max_depth': [3, 5, 7],
    'xgboost__min_child_weight': [1, 3, 5],
    'xgboost__gamma': [0, 0.1, 0.3],
    'xgboost__subsample': [0.6, 0.8, 1.0],
    'xgboost__colsample_bytree': [0.6, 0.8, 1.0],
    'xgboost__learning_rate': [0.01, 0.05, 0.1],
    'xgboost__n_estimators': [200, 400, 600],
    'xgboost__reg_alpha': [0, 0.1, 1],
    'xgboost__reg_lambda': [1, 5, 10],
    'xgboost__scale_pos_weight': [1, 5, 10],
    # Gradient Boosting Classifier
    'final_estimator__n_estimators': [100, 200, 300, 500],
    'final_estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'final_estimator__max_depth': [2, 3, 4, 5],
    'final_estimator__min_samples_split': [2, 5, 10, 20],
    'final_estimator__min_samples_leaf': [1, 2, 4, 8],
    'final_estimator__subsample': [0.5, 0.7, 0.9, 1.0],
    'final_estimator__max_features': ['auto', 'sqrt', 'log2', None],
    'final_estimator__criterion': ['friedman_mse', 'squared_error']
}

In [42]:
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=100,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=3
)

In [None]:
lab_encoding = LabelEncoder()
y = lab_encoding.fit_transform(y_resampled)
X = p_line.fit_transform(X_resampled)
random_search.fit(X,y)

Fitting 2 folds for each of 100 candidates, totalling 200 fits


Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



[CV 2/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.2, final_estimator__max_depth=5, final_estimator__max_features=None, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=20, final_estimator__n_estimators=300, final_estimator__subsample=0.7, lgc__C=100, lgc__class_weight=None, lgc__penalty=l2, lgc__solver=liblinear, svc__C=10, svc__class_weight=None, svc__gamma=scale, svc__kernel=rbf, xgboost__colsample_bytree=0.8, xgboost__gamma=0.1, xgboost__learning_rate=0.01, xgboost__max_depth=7, xgboost__min_child_weight=1, xgboost__n_estimators=600, xgboost__reg_alpha=1, xgboost__reg_lambda=1, xgboost__scale_pos_weight=5, xgboost__subsample=1.0;, score=0.729 total time=478.5min
[CV 1/2] END final_estimator__criterion=squared_error, final_estimator__learning_rate=0.1, final_estimator__max_depth=5, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=10, final_estimator__n_estimators=100

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



[CV 1/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.2, final_estimator__max_depth=5, final_estimator__max_features=None, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=20, final_estimator__n_estimators=300, final_estimator__subsample=0.7, lgc__C=100, lgc__class_weight=None, lgc__penalty=l2, lgc__solver=liblinear, svc__C=10, svc__class_weight=None, svc__gamma=scale, svc__kernel=rbf, xgboost__colsample_bytree=0.8, xgboost__gamma=0.1, xgboost__learning_rate=0.01, xgboost__max_depth=7, xgboost__min_child_weight=1, xgboost__n_estimators=600, xgboost__reg_alpha=1, xgboost__reg_lambda=1, xgboost__scale_pos_weight=5, xgboost__subsample=1.0;, score=0.677 total time=480.9min
[CV 2/2] END final_estimator__criterion=squared_error, final_estimator__learning_rate=0.1, final_estimator__max_depth=5, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=10, final_estimator__n_estimators=100

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



[CV 1/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.2, final_estimator__max_depth=3, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=2, final_estimator__n_estimators=300, final_estimator__subsample=1.0, lgc__C=10, lgc__class_weight=None, lgc__penalty=l1, lgc__solver=liblinear, svc__C=1, svc__class_weight=None, svc__gamma=1, svc__kernel=linear, xgboost__colsample_bytree=1.0, xgboost__gamma=0.3, xgboost__learning_rate=0.01, xgboost__max_depth=7, xgboost__min_child_weight=5, xgboost__n_estimators=400, xgboost__reg_alpha=0.1, xgboost__reg_lambda=5, xgboost__scale_pos_weight=5, xgboost__subsample=1.0;, score=0.505 total time=195.8min
[CV 1/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.1, final_estimator__max_depth=3, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=20, final_estimator__n_estimators=300, f

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



[CV 1/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.01, final_estimator__max_depth=5, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=1, final_estimator__min_samples_split=2, final_estimator__n_estimators=100, final_estimator__subsample=1.0, lgc__C=1, lgc__class_weight=balanced, lgc__penalty=l1, lgc__solver=saga, svc__C=1, svc__class_weight=None, svc__gamma=auto, svc__kernel=linear, xgboost__colsample_bytree=0.6, xgboost__gamma=0, xgboost__learning_rate=0.01, xgboost__max_depth=3, xgboost__min_child_weight=5, xgboost__n_estimators=200, xgboost__reg_alpha=0.1, xgboost__reg_lambda=5, xgboost__scale_pos_weight=10, xgboost__subsample=0.6;, score=0.412 total time=393.6min
[CV 1/2] END final_estimator__criterion=squared_error, final_estimator__learning_rate=0.2, final_estimator__max_depth=2, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=8, final_estimator__min_samples_split=2, final_estimator__n_estimators=500, 

Parameters: { "scale_pos_weight" } are not used.



[CV 2/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.2, final_estimator__max_depth=3, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=2, final_estimator__n_estimators=300, final_estimator__subsample=1.0, lgc__C=10, lgc__class_weight=None, lgc__penalty=l1, lgc__solver=liblinear, svc__C=1, svc__class_weight=None, svc__gamma=1, svc__kernel=linear, xgboost__colsample_bytree=1.0, xgboost__gamma=0.3, xgboost__learning_rate=0.01, xgboost__max_depth=7, xgboost__min_child_weight=5, xgboost__n_estimators=400, xgboost__reg_alpha=0.1, xgboost__reg_lambda=5, xgboost__scale_pos_weight=5, xgboost__subsample=1.0;, score=0.497 total time=195.7min
[CV 2/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.1, final_estimator__max_depth=3, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=2, final_estimator__min_samples_split=20, final_estimator__n_estimators=300, f

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



[CV 2/2] END final_estimator__criterion=friedman_mse, final_estimator__learning_rate=0.01, final_estimator__max_depth=5, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=1, final_estimator__min_samples_split=2, final_estimator__n_estimators=100, final_estimator__subsample=1.0, lgc__C=1, lgc__class_weight=balanced, lgc__penalty=l1, lgc__solver=saga, svc__C=1, svc__class_weight=None, svc__gamma=auto, svc__kernel=linear, xgboost__colsample_bytree=0.6, xgboost__gamma=0, xgboost__learning_rate=0.01, xgboost__max_depth=3, xgboost__min_child_weight=5, xgboost__n_estimators=200, xgboost__reg_alpha=0.1, xgboost__reg_lambda=5, xgboost__scale_pos_weight=10, xgboost__subsample=0.6;, score=0.440 total time=423.5min
[CV 2/2] END final_estimator__criterion=squared_error, final_estimator__learning_rate=0.2, final_estimator__max_depth=2, final_estimator__max_features=sqrt, final_estimator__min_samples_leaf=8, final_estimator__min_samples_split=2, final_estimator__n_estimators=500, 

Parameters: { "scale_pos_weight" } are not used.

