In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

import pickle

In [2]:
# Load the snack nutrition table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/final_data.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,type,product,per_serving_g,total_fat_g,sugars_g,carbohydrate_g,protein_g,sodium_g,salt_g,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,2.6,4.6,7.6,15.9,2.6,0.0,0.0,1.769231,2.923077,0.0
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,1.4,9.9,5.1,12.4,1.4,0.0,0.0,7.071429,3.642857,0.0
2,cookie,Beryl's Cookies Chocolate Sable,25.0,7.2,9.2,16.3,1.6,0.0,0.0,0.288,0.368,0.0
3,cookie,Beryl's Strawberry Sable,1.4,9.4,4.8,13.7,1.4,0.0,0.0,6.714286,3.428571,0.0
4,cookie,Beryl's Cookies Exquisite Selection (Tin),1.7,4.0,5.0,11.7,1.7,0.0,0.0,2.352941,2.941176,0.0


In [4]:
# Summary statistics for the numeric columns. The 50% row holds the medians
# that classify_snack uses as its cut-off values.
df.describe()

Unnamed: 0,per_serving_g,total_fat_g,sugars_g,carbohydrate_g,protein_g,sodium_g,salt_g,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving
count,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0
mean,58.442553,11.346809,10.892553,29.396457,3.601064,0.109212,0.135532,0.457711,0.406226,0.002722
std,55.876333,9.295596,12.780447,23.548498,3.263664,0.14257,0.46359,1.158668,1.122792,0.002879
min,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.25,5.0,1.0,14.25,1.6,0.0,0.0,0.190577,0.02,0.0
50%,30.0,7.35,5.9,19.0,2.0,0.0775,0.0,0.245588,0.2,0.002806
75%,100.0,19.75,12.95,57.225,5.425,0.129,0.0,0.27,0.329853,0.004364
max,400.0,32.4,51.0,84.5,21.5,0.72,4.0,7.071429,9.25,0.01875


In [5]:
def classify_snack(dataframe, fat_threshold=0.245588, sugar_threshold=0.2,
                   sodium_threshold=0.002806):
    """Add a binary ``class`` column to ``dataframe`` in place.

    A row is labelled unhealthy (0) when its sodium per gram of serving is
    above ``sodium_threshold`` AND at least one of total fat or sugars per
    gram of serving is above its own threshold. Every other row is labelled
    healthy (1). Comparisons are strict (``>``), and a missing value never
    compares as greater, so rows with NaN in a checked column stay healthy
    unless another condition marks them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``total_fat_g_per_gram_of_serving``,
        ``sugars_g_per_gram_of_serving`` and ``sodium_g_per_gram_of_serving``.
        It is modified in place: a ``class`` column is created or overwritten.
    fat_threshold, sugar_threshold, sodium_threshold : float, optional
        Cut-off values for the three per-gram ratios. The defaults are the
        50th-percentile values reported by ``df.describe()`` for this dataset.

    Returns
    -------
    None
    """
    exceeds_fat = dataframe['total_fat_g_per_gram_of_serving'] > fat_threshold
    exceeds_sugar = dataframe['sugars_g_per_gram_of_serving'] > sugar_threshold
    exceeds_sodium = dataframe['sodium_g_per_gram_of_serving'] > sodium_threshold

    # High sodium alone is not enough: it must coincide with high fat or high sugar.
    condition = (exceeds_fat | exceeds_sugar) & exceeds_sodium

    dataframe['class'] = 1  # 1 means healthy, assign all as healthy first
    dataframe.loc[condition, 'class'] = 0  # 0 means unhealthy

# Call the function to classify snacks; this adds the 'class' column to df in place.
classify_snack(df)

In [6]:
# Count the rows per label (0 = unhealthy, 1 = healthy) to see how balanced the classes are.
df.groupby("class")['class'].count()

class
0    35
1    59
Name: class, dtype: int64

In [7]:
# Preview again to confirm the 'class' column is now part of the frame.
df.head()

Unnamed: 0,type,product,per_serving_g,total_fat_g,sugars_g,carbohydrate_g,protein_g,sodium_g,salt_g,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class
0,cookie,Beryl's Chocolate Orange Cashew Nuts Cookies,2.6,4.6,7.6,15.9,2.6,0.0,0.0,1.769231,2.923077,0.0,1
1,cookie,Beryl's Coconut Sable with Macadamia Nuts,1.4,9.9,5.1,12.4,1.4,0.0,0.0,7.071429,3.642857,0.0,1
2,cookie,Beryl's Cookies Chocolate Sable,25.0,7.2,9.2,16.3,1.6,0.0,0.0,0.288,0.368,0.0,1
3,cookie,Beryl's Strawberry Sable,1.4,9.4,4.8,13.7,1.4,0.0,0.0,6.714286,3.428571,0.0,1
4,cookie,Beryl's Cookies Exquisite Selection (Tin),1.7,4.0,5.0,11.7,1.7,0.0,0.0,2.352941,2.941176,0.0,1


In [8]:
# Modelling frame: the three per-gram nutrient ratios plus the 'class' label.
new_df = df.loc[
    :,
    [
        'total_fat_g_per_gram_of_serving',
        'sugars_g_per_gram_of_serving',
        'sodium_g_per_gram_of_serving',
        'class',
    ],
]

In [9]:
# Preview the reduced frame that will be fed to the classifiers.
new_df.head()

Unnamed: 0,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving,class
0,1.769231,2.923077,0.0,1
1,7.071429,3.642857,0.0,1
2,0.288,0.368,0.0,1
3,6.714286,3.428571,0.0,1
4,2.352941,2.941176,0.0,1


In [10]:
# Split the modelling frame into the target labels and the feature matrix.
y = new_df['class']
X = new_df.drop(columns=['class'])

In [11]:
# Preview the feature matrix; the 'class' column should no longer be present.
X.head()

Unnamed: 0,total_fat_g_per_gram_of_serving,sugars_g_per_gram_of_serving,sodium_g_per_gram_of_serving
0,1.769231,2.923077,0.0
1,7.071429,3.642857,0.0
2,0.288,0.368,0.0
3,6.714286,3.428571,0.0
4,2.352941,2.941176,0.0


In [12]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, so the class ratio can differ
# between the train and test parts - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [13]:
# Balance the classes by oversampling with SMOTE. Only the training split is
# resampled; X_test / y_test are left as they came out of the split.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [14]:
# Candidate classifiers as (display name, estimator) pairs.
# Estimators that draw random numbers get a fixed seed (the same value used for
# the train/test split and SMOTE) so the scores, and therefore the model that
# ranks first, are the same on every run of the notebook.
pipelines = [
    ('LogReg', LogisticRegression()),
    ('BernouliNB', BernoulliNB()),
    ('GaussianNB', GaussianNB()),
    ('KNearestNeighbour', KNeighborsClassifier()),
    ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('BaggingDecisionTree', BaggingClassifier(random_state=42)),
]

model_name = []  # display names, in the order the models were evaluated
results = []     # test-set accuracy for the model at the same position

for name, estimator in pipelines:
    # Train on the SMOTE-balanced training data.
    estimator.fit(X_train_resampled, y_train_resampled)

    # Predict the held-out rows once and reuse the result for both the
    # accuracy score and the classification report.
    test_predictions = estimator.predict(X_test)

    results.append(accuracy_score(y_test, test_predictions))
    model_name.append(name)

    # The train report is computed on the original (non-resampled) training rows.
    print(estimator)
    print("Train classification report")
    print(classification_report(y_train, estimator.predict(X_train)))
    print(estimator)
    print("Test classification report")
    print(classification_report(y_test, test_predictions))
    print("===========================================")
    

LogisticRegression()
Train classification report
              precision    recall  f1-score   support

           0       0.39      0.52      0.44        23
           1       0.56      0.42      0.48        33

    accuracy                           0.46        56
   macro avg       0.47      0.47      0.46        56
weighted avg       0.49      0.46      0.47        56

LogisticRegression()
Test classification report
              precision    recall  f1-score   support

           0       0.30      0.58      0.40        12
           1       0.67      0.38      0.49        26

    accuracy                           0.45        38
   macro avg       0.49      0.48      0.44        38
weighted avg       0.55      0.45      0.46        38

BernoulliNB()
Train classification report
              precision    recall  f1-score   support

           0       0.62      0.87      0.73        23
           1       0.88      0.64      0.74        33

    accuracy                           0.73

In [15]:
# Pair every model name with its test accuracy, rank the pairs from highest to
# lowest accuracy, and report the entry that comes first.
all_models = [(name, score) for name, score in zip(model_name, results)]
sorted_models = sorted(
    all_models,
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print(f" Best model with highest test accuracy score is: {sorted_models[0]}")

 Best model with highest test accuracy score is: ('DecisionTree', 0.9473684210526315)


In [16]:
# Based on the highest test accuracy score, we will use the decision tree classifier.
# The tree is seeded so that re-running the notebook saves the same model.
DT = DecisionTreeClassifier(random_state=42)
DT = DT.fit(X_train_resampled, y_train_resampled)

# Write the fitted model inside a context manager so the file handle is
# flushed and closed even if pickling raises.
with open("classifier.pkl", "wb") as model_file:
    pickle.dump(DT, model_file)

In [17]:
# Read the pickled classifier back from disk into `model`.
# pickle.load can execute arbitrary code while deserialising, so only load
# files that come from a trusted source.
with open("classifier.pkl", 'rb') as our_model:
    model = pickle.load(our_model)