In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier


from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

import pickle

In [2]:
df = pd.read_csv("../data/final_data.csv")

In [3]:
df.head()

Unnamed: 0,type,product,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,1.769231,2.923077,
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,7.071429,3.642857,
2,cookie,Beryl's Cookies Chocolate Sable,0.288,0.368,
3,cookie,Beryl's Strawberry Sable,6.714286,3.428571,
4,cookie,Beryl's Cookies Exquisite Selection (Tin),2.352941,2.941176,


In [4]:
df.describe()

Unnamed: 0,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving
count,90.0,85.0,80.0
mean,0.483587,0.461762,0.003506
std,1.178523,1.169641,0.00262
min,0.0,0.0,0.0
25%,0.206897,0.08,0.002107
50%,0.25,0.24,0.003456
75%,0.272045,0.34,0.004619
max,7.071429,9.25,0.01875


In [5]:
#impute missing values with 0

new_df = df.fillna(0)

In [6]:
new_df.describe()

Unnamed: 0,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving
count,95.0,95.0,95.0
mean,0.458135,0.413155,0.002952
std,1.151878,1.114817,0.002724
min,0.0,0.0,0.0
25%,0.196154,0.039231,0.000333
50%,0.241176,0.2,0.0031
75%,0.27,0.333333,0.004357
max,7.071429,9.25,0.01875


In [7]:
# Label any with metrics > 50th percentile of the dataset as unhealthy
def classify_snack(dataframe):
    condition = (
        ((dataframe['total_fat_g_per_gram_of_serving'] > 0.241176) | 
        (dataframe['sugars_g_per_gram_of_serving'] > 0.2)) &
        (dataframe['sodium_g_per_gram_of_serving'] > 0.003100)
    )
    dataframe['class'] = 1  # 1 means healthy, assign all as healthy first
    dataframe.loc[condition, 'class'] = 0  # 0 means unhealthy

# Call the function to classify snacks
classify_snack(new_df)

In [8]:
new_df.groupby("class")['class'].count()

class
0    34
1    61
Name: class, dtype: int64

In [9]:
new_df.head()

Unnamed: 0,type,product,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,1.769231,2.923077,0.0,1
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,7.071429,3.642857,0.0,1
2,cookie,Beryl's Cookies Chocolate Sable,0.288,0.368,0.0,1
3,cookie,Beryl's Strawberry Sable,6.714286,3.428571,0.0,1
4,cookie,Beryl's Cookies Exquisite Selection (Tin),2.352941,2.941176,0.0,1


In [10]:
# Using KNN Imputer
#categories = df[['type']]
#categories_dummies = pd.get_dummies(categories, drop_first=True)

#new_df = df.drop(['type','product'], axis=1)
#new_df = pd.concat([new_df, categories_dummies], axis=1)

# Normalise the data to prepare for imputation using KNN
#scaler = MinMaxScaler()
#new_df = pd.DataFrame(scaler.fit_transform(new_df), columns = new_df.columns)

#imputer = KNNImputer(n_neighbors=5)
#new_df = pd.DataFrame(imputer.fit_transform(new_df),columns = new_df.columns)

In [11]:
new_df.head()

Unnamed: 0,type,product,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,1.769231,2.923077,0.0,1
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,7.071429,3.642857,0.0,1
2,cookie,Beryl's Cookies Chocolate Sable,0.288,0.368,0.0,1
3,cookie,Beryl's Strawberry Sable,6.714286,3.428571,0.0,1
4,cookie,Beryl's Cookies Exquisite Selection (Tin),2.352941,2.941176,0.0,1


In [14]:
categories = new_df[['type']]
categories_dummies = pd.get_dummies(categories, drop_first=True)

In [17]:
new_df

Unnamed: 0,type,product,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,1.769231,2.923077,0.000000,1
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,7.071429,3.642857,0.000000,1
2,cookie,Beryl's Cookies Chocolate Sable,0.288000,0.368000,0.000000,1
3,cookie,Beryl's Strawberry Sable,6.714286,3.428571,0.000000,1
4,cookie,Beryl's Cookies Exquisite Selection (Tin),2.352941,2.941176,0.000000,1
...,...,...,...,...,...,...
90,wafer,Loacker Quadratini Crispy Wafers - Napolitaner,0.270000,0.000000,0.000000,1
91,wafer,Loacker Quadratini Crispy Wafers - Tiramisu,0.293333,0.000000,0.000000,1
92,wafer,Lee Biscuits Carton - 24 Pack Chocolate Cream ...,0.296667,0.341111,0.000544,1
93,wafer,Lee Biscuits Carton - 24 Pack Lemon Cream Wafe...,0.283333,0.230000,0.000689,1


In [18]:
new_df = new_df.drop(['type','product'], axis=1)
new_df = pd.concat([new_df, categories_dummies], axis=1)

In [19]:
new_df.head()

Unnamed: 0,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class,type_crackers,type_cream,type_wafer
0,1.769231,2.923077,0.0,1,0,0,0
1,7.071429,3.642857,0.0,1,0,0,0
2,0.288,0.368,0.0,1,0,0,0
3,6.714286,3.428571,0.0,1,0,0,0
4,2.352941,2.941176,0.0,1,0,0,0


In [20]:
X = new_df.drop(['class'], axis=1)
y = new_df['class']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) 

In [22]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB,BernoulliNB

from sklearn.model_selection import KFold

pipelines = []
pipelines.append(('LogReg', LogisticRegression()))
pipelines.append(('KNN' ,KNeighborsClassifier()))
pipelines.append(('DT' ,DecisionTreeClassifier()))
pipelines.append(('BerNB' ,BernoulliNB()))
pipelines.append(('GausNB' ,GaussianNB()))
pipelines.append(('RandForest',RandomForestClassifier()))
pipelines.append(('BaggingDT',BaggingClassifier()))

model_name = []
results = []
for pipe ,model in pipelines:
    #kfold = KFold(n_splits=5, shuffle=True,random_state=42)
    #crossv_results = cross_val_score(model , X_train_resampled ,y_train_resampled ,cv =kfold , scoring='accuracy')
    #crossv_results_test = cross_val_score(model , X_test ,y_test,cv =kfold , scoring='accuracy')
    #model_created = model.fit(X_train_resampled,y_train_resampled)
    #results.append(crossv_results)
    #model_name.append(pipe)
    #msg = "Train %s-> cross_val_accuracy: %f cross_val_std:(%f)" % (model, crossv_results.mean(), crossv_results.std())
    #msg_1 = "Test %s-> cross_val_accuracy: %f cross_val_std:(%f)" % (model, crossv_results_test.mean(), crossv_results_test.std())
    msg = model
    msg_1 = "Train classification report"
    msg_2 = "Test classification report"
    
    print(msg)
    print(msg_1)
    print(classification_report(y_train, model_created.predict(X_train)))
    print(msg)
    print(msg_2)
    print(classification_report(y_test, model_created.predict(X_test)))
    print("===========================================")
    

LogisticRegression()
Train classification report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        24
           1       1.00      0.97      0.98        33

    accuracy                           0.98        57
   macro avg       0.98      0.98      0.98        57
weighted avg       0.98      0.98      0.98        57

LogisticRegression()
Test classification report
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        10
           1       1.00      0.86      0.92        28

    accuracy                           0.89        38
   macro avg       0.86      0.93      0.88        38
weighted avg       0.92      0.89      0.90        38

KNeighborsClassifier()
Train classification report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        24
           1       1.00      0.97      0.98        33

    accuracy                      