In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier


from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

import pickle

In [2]:
df = pd.read_csv("../data/final_data.csv")

In [3]:
df.head()

Unnamed: 0,type,product,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,1.769231,2.923077,
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,7.071429,3.642857,
2,cookie,Beryl's Cookies Chocolate Sable,0.288,0.368,
3,cookie,Beryl's Strawberry Sable,6.714286,3.428571,
4,cookie,Beryl's Cookies Exquisite Selection (Tin),2.352941,2.941176,


In [74]:
# Label any with metrics > 50th percentile of the dataset as unhealthy
def classify_snack(dataframe):
    condition = (
        (dataframe['total_fat_g_per_gram_of_serving'] > 0.250000) |
        (dataframe['sugars_g_per_gram_of_serving'] > 0.120000) |
        (dataframe['sodium_g_per_gram_of_serving'] > 0.003456)
    )
    dataframe['class'] = 1  # 1 means healthy, assign all as healthy first
    dataframe.loc[condition, 'class'] = 0  # 0 means unhealthy

# Call the function to classify snacks
classify_snack(df)

In [75]:
df.groupby("class")['class'].count()

class
0    87
1     8
Name: class, dtype: int64

In [76]:
df.head()

Unnamed: 0,type,product,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,1.769231,2.923077,,0
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,7.071429,3.642857,,0
2,cookie,Beryl's Cookies Chocolate Sable,0.288,0.368,,0
3,cookie,Beryl's Strawberry Sable,6.714286,3.428571,,0
4,cookie,Beryl's Cookies Exquisite Selection (Tin),2.352941,2.941176,,0


In [77]:
# Using KNN Imputer
categories = df[['type','product']]
categories_dummies = pd.get_dummies(categories)

new_df = df.drop(['type','product'], axis=1)
new_df = pd.concat([new_df, categories_dummies], axis=1)

# Normalise the data to prepare for imputation using KNN
scaler = MinMaxScaler()
new_df = pd.DataFrame(scaler.fit_transform(new_df), columns = new_df.columns)

imputer = KNNImputer(n_neighbors=5)
new_df = pd.DataFrame(imputer.fit_transform(new_df),columns = new_df.columns)

In [78]:
new_df.head()

Unnamed: 0,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class,type_cookie,type_crackers,type_cream,type_wafer,product_Aladdin Danish Deluxe Waffle Cones,product_Aladdin Ice Cream Cones,...,product_Quaker Oats Cookies - Chocolate Chip,product_Redondo Cream Wafers - Cappuccino,product_Ritz Crackers Box - Original,product_Ritz Sandwich Biscuits Multipack - Chocolate Cream,product_Sanwa Oat Nutrition Choco,"product_Schar Schar Wafers with Lemon Cream, Gluten Free",product_Serious Food Company Serious Cookies - Chocolate Chip,product_Serious Food Company Serious Cookies - White Choc Macadamia,product_Shoon Fatt Crackers - Light Treat (Cream Special),product_Shoon Fatt Crackers - Sweeties (Sugar)
0,0.250194,0.316008,0.141609,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.393822,0.141609,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.040727,0.039784,0.144672,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.949495,0.370656,0.141609,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.332739,0.317965,0.141609,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
X = new_df.drop(['class'], axis=1)
y = new_df['class']

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) 

In [81]:
df_train = pd.merge(X_train,y_train,left_index=True, right_index=True)

In [82]:
df_test = pd.merge(X_test,y_test,left_index=True, right_index=True)

In [83]:
from sklearn.utils import resample
#create two different dataframe of majority and minority class 
new_df_class_0 = df_train[df_train['class']==0] 
new_df_class_1 = df_train[df_train['class']==1] 

print(new_df_class_0.shape)
print(new_df_class_1.shape)

(55, 103)
(2, 103)


In [84]:
# upsample minority class
df_minority_upsampled = resample(new_df_class_1, 
                                 replace=True,    # sample with replacement
                                 n_samples= 55, # to match majority class
                                 random_state=42)  # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority_upsampled, new_df_class_0])


In [85]:
X_train_resampled = df_upsampled.drop('class', axis=1)
y_train_resampled = df_upsampled['class']

In [86]:
df_upsampled.groupby("class")['class'].count()

class
0.0    55
1.0    55
Name: class, dtype: int64

In [97]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB,BernoulliNB

from sklearn.model_selection import KFold

pipelines = []
pipelines.append(('LogReg', LogisticRegression()))
pipelines.append(('KNN' ,KNeighborsClassifier()))
pipelines.append(('DT' ,DecisionTreeClassifier()))
pipelines.append(('BerNB' ,BernoulliNB()))
pipelines.append(('GausNB' ,GaussianNB()))
pipelines.append(('RandForest',RandomForestClassifier()))
pipelines.append(('BaggingDT',BaggingClassifier()))
#pipelines.append(('scaledLogReg' , (Pipeline([('scaled' , StandardScaler()),('LogReg' ,LogisticRegression())]))))
#pipelines.append(('scaledKNN' , (Pipeline([('scaled' , StandardScaler()),('KNN' ,KNeighborsClassifier())]))))
#pipelines.append(('scaledDT' , (Pipeline([('scaled' , StandardScaler()),('DT' ,DecisionTreeClassifier())]))))
#pipelines.append(('scaledBerNB' , (Pipeline([('scaled' , StandardScaler()),('BerNB' ,BernoulliNB())]))))
#pipelines.append(('scaledGausNB' , (Pipeline([('scaled' , StandardScaler()),('GausNB' ,GaussianNB())]))))

model_name = []
results = []
for pipe ,model in pipelines:
    kfold = KFold(n_splits=5, shuffle=True,random_state=42)
    crossv_results = cross_val_score(model , X_train_resampled ,y_train_resampled ,cv =kfold , scoring='accuracy')
    crossv_results_test = cross_val_score(model , X_test ,y_test,cv =kfold , scoring='accuracy')
    model_created = model.fit(X_train_resampled,y_train_resampled)
    results.append(crossv_results)
    #model_name.append(pipe)
    msg = "Train %s-> cross_val_accuracy: %f cross_val_std:(%f)" % (model, crossv_results.mean(), crossv_results.std())
    msg_1 = "Test %s-> cross_val_accuracy: %f cross_val_std:(%f)" % (model, crossv_results_test.mean(), crossv_results_test.std())
    msg_2 = "Train classification report"
    msg_3 = "Test classification report"
    
    print(msg)
    print(msg_1)
    print(msg_2)
    print(classification_report(y_train, model_created.predict(X_train)))
    print(msg_3)
    print(classification_report(y_test, model_created.predict(X_test)))
    print("")
    

Train LogisticRegression()-> cross_val_accuracy: 1.000000 cross_val_std:(0.000000)
Test LogisticRegression()-> cross_val_accuracy: 0.839286 cross_val_std:(0.062881)
Train classification report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        55
         1.0       1.00      1.00      1.00         2

    accuracy                           1.00        57
   macro avg       1.00      1.00      1.00        57
weighted avg       1.00      1.00      1.00        57

Test classification report
              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91        32
         1.0       0.00      0.00      0.00         6

    accuracy                           0.84        38
   macro avg       0.42      0.50      0.46        38
weighted avg       0.71      0.84      0.77        38


Train KNeighborsClassifier()-> cross_val_accuracy: 0.981818 cross_val_std:(0.022268)
Test KNeighborsClassifier()-> cross_val_

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94        32
         1.0       1.00      0.33      0.50         6

    accuracy                           0.89        38
   macro avg       0.94      0.67      0.72        38
weighted avg       0.91      0.89      0.87        38


Train BernoulliNB()-> cross_val_accuracy: 0.909091 cross_val_std:(0.028748)
Test BernoulliNB()-> cross_val_accuracy: 0.839286 cross_val_std:(0.062881)
Train classification report
              precision    recall  f1-score   support

         0.0       1.00      0.82      0.90        55
         1.0       0.17      1.00      0.29         2

    accuracy                           0.82        57
   macro avg       0.58      0.91      0.59        57
weighted avg       0.97      0.82      0.88        57

Test classification report
              precision    recall  f1-score   support

         0.0       0.86      0.75      0.80        32
         1.0       0.20      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train BaggingClassifier()-> cross_val_accuracy: 0.990909 cross_val_std:(0.018182)
Test BaggingClassifier()-> cross_val_accuracy: 0.814286 cross_val_std:(0.068325)
Train classification report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        55
         1.0       1.00      1.00      1.00         2

    accuracy                           1.00        57
   macro avg       1.00      1.00      1.00        57
weighted avg       1.00      1.00      1.00        57

Test classification report
              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94        32
         1.0       1.00      0.33      0.50         6

    accuracy                           0.89        38
   macro avg       0.94      0.67      0.72        38
weighted avg       0.91      0.89      0.87        38


