## Stacking
## score

### 1. Libraries


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score


### 2. Data

In [2]:
train = pd.read_csv('../Data/train_with_BMI.csv')
test = pd.read_csv('../Data/test_with_BMI.csv')
val = pd.read_csv('../Data/val_with_BMI.csv')

X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

## Model
### 3. Models with the best parameters

In [3]:
# define base models
XGB = XGBClassifier(random_state = 42, colsample_bytree = 0.5, learning_rate = 0.01, max_depth = 7, n_estimators = 1000, 
                            subsample = 0.5)

RF = RandomForestClassifier(criterion='entropy', max_depth=20,min_samples_split=6,n_estimators=300,n_jobs=-1 , random_state=42, max_features='sqrt')

DT = DecisionTreeClassifier(criterion='gini', max_depth=50, min_samples_split=5, max_leaf_nodes=100, random_state=42)

SVM = SVC(C = 10, gamma= 0.01, kernel='rbf')

LR = LogisticRegression(random_state=1, C = 100, max_iter= 500, penalty= 'l2', solver='lbfgs')

NB = GaussianNB(var_smoothing=0.002848035868435802)

SGD = SGDClassifier(random_state=42, alpha=0.000774263682681127, loss='log_loss', l1_ratio=0.2, penalty='elasticnet')
 
KN = KNeighborsClassifier(n_neighbors=5)

AB = AdaBoostClassifier(n_estimators=50, learning_rate=0.1)


### 4. Stacking with base top 3 models

In [12]:
top_3_models = [('XGB', XGB), ('RF', RF), ('DT', DT)]


stack = StackingClassifier(estimators=top_3_models, final_estimator=RandomForestClassifier(random_state=42), cv=5)

stack.fit(X_train, y_train)

test_preds = stack.predict(X_test)
val_preds = stack.predict(X_val)

print('Test Classification Report: \n', classification_report(y_test, test_preds))
print('Validation Classification Report: \n', classification_report(y_val, val_preds))

stack_cv_test = cross_val_score(stack, X_test, y_test, cv=5, scoring='accuracy')
stack_cv_val = cross_val_score(stack, X_val, y_val, cv=5, scoring='accuracy')

print('\n ***************************** \n')

print('Stacking Test CV Accuracy: ', stack_cv_test)
print('Stacking Validation CV Accuracy: ', stack_cv_val)
print('\n ***************************** \n')
print('Stacking Test CV Mean Accuracy: ', stack_cv_test.mean())
print('Stacking Validation CV Mean Accuracy: ', stack_cv_val.mean())

Test Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.94      0.93       230
           1       0.88      0.87      0.88       283
           2       0.90      0.85      0.87       254
           3       0.96      0.97      0.97       291
           4       1.00      1.00      1.00       370
           5       0.76      0.78      0.77       201
           6       0.79      0.80      0.79       240

    accuracy                           0.90      1869
   macro avg       0.88      0.89      0.89      1869
weighted avg       0.90      0.90      0.90      1869

Validation Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       525
           1       0.86      0.88      0.87       624
           2       0.87      0.84      0.85       604
           3       0.95      0.97      0.96       714
           4       0.99      1.00      1.00       837
           5

### 5. Stacking with base top 6 models

In [17]:
top_6_models = [('XGB', XGB), ('RF', RF), ('DT', DT), ('SVM', SVM), ('LR', LR), ('NB', NB)]


stack = StackingClassifier(estimators=top_6_models, final_estimator=RandomForestClassifier(random_state=42), cv=5)

stack.fit(X_train, y_train)

test_preds = stack.predict(X_test)
val_preds = stack.predict(X_val)

print('Test Classification Report: \n', classification_report(y_test, test_preds))
print('Validation Classification Report: \n', classification_report(y_val, val_preds))

# stack_cv_test = cross_val_score(stack, X_test, y_test, cv=5, scoring='accuracy')
# stack_cv_val = cross_val_score(stack, X_val, y_val, cv=5, scoring='accuracy')

# print('\n ***************************** \n')

# print('Stacking Test CV Accuracy: ', stack_cv_test)
# print('Stacking Validation CV Accuracy: ', stack_cv_val)
# print('\n ***************************** \n')
# print('Stacking Test CV Mean Accuracy: ', stack_cv_test.mean())
# print('Stacking Validation CV Mean Accuracy: ', stack_cv_val.mean())

Test Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       230
           1       0.89      0.86      0.88       283
           2       0.87      0.86      0.87       254
           3       0.97      0.97      0.97       291
           4       1.00      1.00      1.00       370
           5       0.74      0.76      0.75       201
           6       0.79      0.78      0.78       240

    accuracy                           0.89      1869
   macro avg       0.88      0.88      0.88      1869
weighted avg       0.89      0.89      0.89      1869

Validation Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.92      0.93       525
           1       0.86      0.87      0.86       624
           2       0.86      0.83      0.85       604
           3       0.95      0.97      0.96       714
           4       1.00      1.00      1.00       837
           5

### 6. Stacking with base all models

In [14]:
models = [('XGB', XGB), ('RF', RF), ('DT', DT), ('SVM', SVM), ('LR', LR), ('NB', NB), ('SGD', SGD), ('KN', KN), ('AB', AB)]


stack = StackingClassifier(estimators=top_3_models, final_estimator=RandomForestClassifier(random_state=42), cv=5)

stack.fit(X_train, y_train)

test_preds = stack.predict(X_test)
val_preds = stack.predict(X_val)

print('Test Classification Report: \n', classification_report(y_test, test_preds))
print('Validation Classification Report: \n', classification_report(y_val, val_preds))

stack_cv_test = cross_val_score(stack, X_test, y_test, cv=5, scoring='accuracy')
stack_cv_val = cross_val_score(stack, X_val, y_val, cv=5, scoring='accuracy')

print('\n ***************************** \n')

print('Stacking Test CV Accuracy: ', stack_cv_test)
print('Stacking Validation CV Accuracy: ', stack_cv_val)
print('\n ***************************** \n')
print('Stacking Test CV Mean Accuracy: ', stack_cv_test.mean())
print('Stacking Validation CV Mean Accuracy: ', stack_cv_val.mean())

Test Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.94      0.93       230
           1       0.88      0.87      0.88       283
           2       0.90      0.85      0.87       254
           3       0.96      0.97      0.97       291
           4       1.00      1.00      1.00       370
           5       0.76      0.78      0.77       201
           6       0.79      0.80      0.79       240

    accuracy                           0.90      1869
   macro avg       0.88      0.89      0.89      1869
weighted avg       0.90      0.90      0.90      1869

Validation Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       525
           1       0.86      0.88      0.87       624
           2       0.87      0.84      0.85       604
           3       0.95      0.97      0.96       714
           4       0.99      1.00      1.00       837
           5

In [16]:
print('Validation Classification Report: \n', classification_report(y_val, val_preds))


Validation Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       525
           1       0.86      0.88      0.87       624
           2       0.87      0.84      0.85       604
           3       0.95      0.97      0.96       714
           4       0.99      1.00      1.00       837
           5       0.74      0.75      0.75       532
           6       0.75      0.76      0.75       523

    accuracy                           0.88      4359
   macro avg       0.87      0.87      0.87      4359
weighted avg       0.88      0.88      0.88      4359



## using LR

### 7. using top 3 models

In [4]:
top_3_models = [('XGB', XGB), ('RF', RF), ('DT', DT)]


stack = StackingClassifier(estimators=top_3_models, 
                           final_estimator= LogisticRegression(random_state=1, C = 100, max_iter= 500, penalty= 'l2', solver='lbfgs'), 
                           cv=5)

stack.fit(X_train, y_train)

test_preds = stack.predict(X_test)
val_preds = stack.predict(X_val)

print('Test Classification Report: \n', classification_report(y_test, test_preds))
print('Validation Classification Report: \n', classification_report(y_val, val_preds))

stack_cv_test = cross_val_score(stack, X_test, y_test, cv=5, scoring='accuracy')
stack_cv_val = cross_val_score(stack, X_val, y_val, cv=5, scoring='accuracy')

print('\n ***************************** \n')

print('Stacking Test CV Accuracy: ', stack_cv_test)
print('Stacking Validation CV Accuracy: ', stack_cv_val)
print('\n ***************************** \n')
print('Stacking Test CV Mean Accuracy: ', stack_cv_test.mean())
print('Stacking Validation CV Mean Accuracy: ', stack_cv_val.mean())

Test Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       230
           1       0.89      0.88      0.88       283
           2       0.86      0.85      0.86       254
           3       0.97      0.97      0.97       291
           4       1.00      1.00      1.00       370
           5       0.74      0.78      0.76       201
           6       0.79      0.77      0.78       240

    accuracy                           0.89      1869
   macro avg       0.88      0.88      0.88      1869
weighted avg       0.89      0.89      0.89      1869

Validation Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.90      0.92       525
           1       0.85      0.88      0.87       624
           2       0.87      0.83      0.85       604
           3       0.95      0.96      0.96       714
           4       1.00      1.00      1.00       837
           5

### 8. using all models 

In [5]:
models = [('XGB', XGB), ('RF', RF), ('DT', DT), ('SVM', SVM), ('LR', LR), ('NB', NB), ('SGD', SGD), ('KN', KN), ('AB', AB)]


stack = StackingClassifier(estimators=top_3_models,  final_estimator=LogisticRegression(random_state=1, C = 100, max_iter= 500, penalty= 'l2', solver='lbfgs')
, cv=5)

stack.fit(X_train, y_train)

test_preds = stack.predict(X_test)
val_preds = stack.predict(X_val)

print('Test Classification Report: \n', classification_report(y_test, test_preds))
print('Validation Classification Report: \n', classification_report(y_val, val_preds))

stack_cv_test = cross_val_score(stack, X_test, y_test, cv=5, scoring='accuracy')
stack_cv_val = cross_val_score(stack, X_val, y_val, cv=5, scoring='accuracy')

print('\n ***************************** \n')

print('Stacking Test CV Accuracy: ', stack_cv_test)
print('Stacking Validation CV Accuracy: ', stack_cv_val)
print('\n ***************************** \n')
print('Stacking Test CV Mean Accuracy: ', stack_cv_test.mean())
print('Stacking Validation CV Mean Accuracy: ', stack_cv_val.mean())

Test Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       230
           1       0.89      0.88      0.88       283
           2       0.86      0.85      0.86       254
           3       0.97      0.97      0.97       291
           4       1.00      1.00      1.00       370
           5       0.74      0.78      0.76       201
           6       0.79      0.77      0.78       240

    accuracy                           0.89      1869
   macro avg       0.88      0.88      0.88      1869
weighted avg       0.89      0.89      0.89      1869

Validation Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.90      0.92       525
           1       0.85      0.88      0.87       624
           2       0.87      0.83      0.85       604
           3       0.95      0.96      0.96       714
           4       1.00      1.00      1.00       837
           5

## using XGBOOST

### top 3 models

In [6]:
models = [('XGB', XGB), ('RF', RF), ('DT', DT), ('SVM', SVM), ('LR', LR), ('NB', NB), ('SGD', SGD), ('KN', KN), ('AB', AB)]


stack = StackingClassifier(estimators=top_3_models,  final_estimator=XGBClassifier(random_state = 42, colsample_bytree = 0.5, learning_rate = 0.01, max_depth = 7, n_estimators = 1000, 
                            subsample = 0.5), cv=5)

stack.fit(X_train, y_train)

test_preds = stack.predict(X_test)
val_preds = stack.predict(X_val)

print('Test Classification Report: \n', classification_report(y_test, test_preds))
print('Validation Classification Report: \n', classification_report(y_val, val_preds))

stack_cv_test = cross_val_score(stack, X_test, y_test, cv=5, scoring='accuracy')
stack_cv_val = cross_val_score(stack, X_val, y_val, cv=5, scoring='accuracy')

print('\n ***************************** \n')

print('Stacking Test CV Accuracy: ', stack_cv_test)
print('Stacking Validation CV Accuracy: ', stack_cv_val)
print('\n ***************************** \n')
print('Stacking Test CV Mean Accuracy: ', stack_cv_test.mean())
print('Stacking Validation CV Mean Accuracy: ', stack_cv_val.mean())

Test Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.93      0.92       230
           1       0.89      0.88      0.88       283
           2       0.88      0.85      0.87       254
           3       0.97      0.96      0.96       291
           4       1.00      1.00      1.00       370
           5       0.76      0.76      0.76       201
           6       0.76      0.79      0.77       240

    accuracy                           0.89      1869
   macro avg       0.88      0.88      0.88      1869
weighted avg       0.89      0.89      0.89      1869

Validation Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.91      0.93       525
           1       0.86      0.89      0.87       624
           2       0.87      0.82      0.85       604
           3       0.95      0.97      0.96       714
           4       1.00      1.00      1.00       837
           5

>not cool, would not recommend