In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Kaggle dataset through kagglehub and keep the value dataset_download returns.
# NOTE(review): no later cell visible in this notebook reads this variable; the CSV is
# loaded from a hard-coded /kaggle/input path instead. Confirm that path exists in the
# environment where this notebook runs.
ulrikthygepedersen_fastfood_nutrition_path = kagglehub.dataset_download('ulrikthygepedersen/fastfood-nutrition')

print('Data source import complete.')


## Fast food analysis
![images.jpg](attachment:1fbf021b-4858-46c7-a956-2a1acf95c55e.jpg)
### Let's look at the daily nutritional requirements (for reference)
* Calories: 2000-2500 kcal/day for men, and 1600-2000 kcal/day for women.
* Protein: 0.8-1.0 g/kg of body weight/day.
* Carbohydrates: 45-65% of total daily calories, or at least 130 g/day.
* Fat: 20-35% of total daily calories, or at least 20-35 g/day.
* Fiber: 25-38 g/day for men, and 21-25 g/day for women.
* Vitamins: the RDI varies depending on the vitamin, ranging from a few micrograms to several milligrams or more per day.
* Minerals: the RDI also varies depending on the mineral, ranging from a few milligrams to several grams per day.
* Vitamin A: RDI is 900 micrograms (mcg) per day for men, and 700 mcg per day for women.
* Vitamin C: RDI is 90 milligrams (mg) per day for men, and 75 mg per day for women.
* Cholesterol: less than 300 milligrams per day for most adults, and less than 200 milligrams per day for individuals with heart disease or high blood cholesterol levels.
* Sodium: less than 2,300 milligrams per day for most adults, and less than 1,500 milligrams per day for individuals with high blood pressure, kidney disease, or other health conditions.
* Sugar: the American Heart Association recommends limiting added sugar intake to no more than 6 teaspoons (25 grams) per day for women and 9 teaspoons (38 grams) per day for men.
* Calcium: the recommended daily intake varies depending on age and sex, but generally ranges from 1,000 to 1,300 milligrams per day for adults.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Goal is to segment healthy food for life
### This notebook is divided into two parts
> 1. Collecting labels `unsupervised learning`
> 2. Segmenting data into `healthy food and unhealthy food`

### Basic preprocessing

In [None]:
# Show every column when a frame is displayed. The option name is spelled out in
# full: abbreviated names only work while they match exactly one option.
pd.set_option("display.max_columns", None)
# Load the fast-food nutrition table and preview its first three rows
df = pd.read_csv("/kaggle/input/fastfood-nutrition/fastfood.csv")
df.head(3)

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Count the missing entries in every column and keep them as {column: count}
missing_per_column = df.isna().sum()
null_vals = dict(missing_per_column)
null_vals

In [None]:
# Missing values per column, expressed as a percentage of all rows
total_rows = df.shape[0]
for column_name, missing_count in null_vals.items():
    print(f"null values for {column_name} =======> {(int(missing_count)/total_rows)*100}")

In [None]:
# Collect the mean of every column that has missing values (fiber, protein,
# vitamin A, vitamin C and calcium); these means are used as the fill values.
# The mean is taken over the column itself (Series.mean skips NaN by default),
# not over the summary statistics that describe() returns.
null_cols = ['fiber','protein','vit_a','vit_c','calcium']
null_cols_avg = {col: df[col].mean() for col in null_cols}
null_cols_avg

In [None]:
# Fill each column's missing entries with its entry in null_cols_avg,
# then re-count the nulls to confirm none are left
df = df.fillna(value=null_cols_avg)
df.isna().sum()

In [None]:
# Missing values are handled, so the frame is ready for the next steps.
# Only total_fat is kept as the fat measure: the author had no further
# information about trans_fat, sat_fat and cal_fat, so those columns are
# removed here, together with the 'salad' column.
unused_columns = ['salad','cal_fat','sat_fat','trans_fat']
df = df.drop(columns=unused_columns)

## 1) Collecting labels `unsupervised learning`
### Steps followed:
* Collecting libraries
* Dropping unrequired columns
* Finding best cluster
* Visualizing elbow method
* Training model with best cluster
* Getting labels

In [None]:
# collecting libraries required
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Features used for clustering: everything except the two identifier columns.
# NOTE(review): the features are passed to KMeans as-is (no scaling step is
# visible in this notebook) - confirm that is intended.
identifier_columns = ['restaurant','item']
df_seg = df.drop(columns=identifier_columns)
df_seg.sample(3)

### ----> Finding best cluster size for segmentation
### ---- > Elbow method with yellowbrick

In [None]:
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Elbow search with yellowbrick over candidate cluster counts.
# The range of cluster counts is passed through the visualizer's lowercase `k`
# keyword, and random_state is fixed (as in the later cells) so the curve is
# reproducible from run to run.
model = KMeans(random_state=42)
visualizer = KElbowVisualizer(model, k=(1,10))
visualizer.fit(df_seg)
visualizer.show()

In [None]:
# Fit one KMeans per candidate cluster count (1 through 9) and record the
# inertia of each fit, in that order
inertias = []
for n_clusters in range(1, 10):
    model = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42).fit(df_seg)
    inertias.append(model.inertia_)
print(inertias)

### ---- > Elbow method

In [None]:
# inertias[0] belongs to 1 cluster, inertias[1] to 2 clusters, and so on, so the
# x values are given explicitly; otherwise the axis would start at 0 and the
# elbow would be read one cluster too low.
cluster_counts = range(1, len(inertias) + 1)
plt.plot(cluster_counts, inertias)
plt.title("inertia of KMeans model")
plt.xlabel("number of clusters")
plt.ylabel("inertia")
plt.show()

>As per the elbow method, 4 clusters are best for our model, which means we can classify the foods into 4 types.

In [None]:
# Final clustering model: 4 clusters, k-means++ initialisation, fixed seed
final_kmeans_params = dict(n_clusters=4, init='k-means++', random_state=42)
model = KMeans(**final_kmeans_params)
model.fit(df_seg)

In [None]:
# Centres of the clusters found by the fitted model
cluster_centers = model.cluster_centers_
cluster_centers

In [None]:
# Cluster id the fitted model assigned to each row of df_seg; preview the first ten
labels = model.labels_
labels[:10]

In [None]:
# Attach the cluster id of every row as a new 'labels' column and show a random sample
df = df.assign(labels=labels)
df.sample(3)

#### Done first part 😌

## 2) Segmenting data into `healthy food and unhealthy food`

In [None]:
import seaborn as sn

#### cal_fat and sat_fat were dropped during preprocessing; only total fat is considered for further computation

In [None]:
# Split the frame into one sub-frame per cluster id and report each size
cluster_frames = {cluster_id: df.loc[df['labels'].eq(cluster_id)] for cluster_id in range(4)}
label_0, label_1, label_2, label_3 = (cluster_frames[cluster_id] for cluster_id in range(4))
for cluster_id, cluster_frame in cluster_frames.items():
    print(f"label {cluster_id}", len(cluster_frame))

### ----> Collecting average of Nutrition

In [None]:
def _nutrient_means(cluster_df):
    """Return the column-wise mean of the numeric nutrient columns of one cluster.

    The mean is taken over the rows of the cluster itself, not over the summary
    statistics produced by describe(). The 'labels' column is left out because
    it holds the cluster id, not a nutrient.
    """
    return cluster_df.mean(numeric_only=True).drop(labels='labels', errors='ignore')


# Names of the nutrient columns, in the same order as the averages below
nutritions = list(_nutrient_means(label_0).index)
label_0_nutri_avg = _nutrient_means(label_0).values
label_1_nutri_avg = _nutrient_means(label_1).values
label_2_nutri_avg = _nutrient_means(label_2).values
label_3_nutri_avg = _nutrient_means(label_3).values

print("label_0_nutri_avg",label_0_nutri_avg)
print("label_1_nutri_avg",label_1_nutri_avg)
print("label_2_nutri_avg",label_2_nutri_avg)
print("label_3_nutri_avg",label_3_nutri_avg)

### ----> Visualization
* label 0
* label 1
* label 2
* label 3

In [None]:
# label 0: one bar per nutrient, y axis fixed to 0-3000 so all cluster plots share a scale
axis_label_0 = sn.barplot(x=nutritions, y=label_0_nutri_avg)
axis_label_0.set_title("Nutritions in average for label 0")
plt.xticks(rotation=90)
axis_label_0.set_ylim([0, 3000])
plt.show()

In [None]:
# label 1: one bar per nutrient, y axis fixed to 0-3000 so all cluster plots share a scale
axis_label_1 = sn.barplot(x=nutritions, y=label_1_nutri_avg)
axis_label_1.set_title("Nutritions in average for label 1")
plt.xticks(rotation=90)
axis_label_1.set_ylim([0, 3000])
plt.show()

In [None]:
# label 2 and label 3
# Four clusters were fitted, so the fourth one (label 3) is plotted here as well,
# using the same fixed 0-3000 y axis as the plots for the other labels.
for cluster_id, cluster_avg in ((2, label_2_nutri_avg), (3, label_3_nutri_avg)):
    sn.barplot(x=nutritions,y=cluster_avg)
    plt.title(f"Nutritions in average for label {cluster_id}")
    plt.xticks(rotation=90)
    plt.ylim([0,3000])
    plt.show()

### sodium
>Sodium is a mineral found naturally in foods and also added to foods. Sodium plays an important role in maintaining normal fluid balance in the body. A low-sodium diet is important to follow in order to control your heart failure symptoms and prevent future heart problems.

### fiber
>Better regulates blood sugar levels: A high-fiber meal slows down the digestion of food into the intestines, which may help to keep blood sugars from rising rapidly. Weight control: A high-fiber diet may help keep you fuller longer, which prevents overeating and hunger between meals.

### ----> We will separate the positive (recommended) and negative (not recommended) nutrients to compare the four clusters.
* positive_ntr = ['protein','calcium','total_carb','vit_a','vit_c','fiber']
* negative_ntr = ['sugar','calories','total_fat','sodium','cholesterol']
>Here we calculate the average of all nutrients in each cluster and compare the four clusters in graphs to understand which cluster is better.
* the positive_ntr average should be higher
* the negative_ntr average should be lower

In [None]:
# Columns treated as beneficial nutrients (a higher average is better)
positive_ntr = [
    'protein',
    'calcium',
    'total_carb',
    'vit_a',
    'vit_c',
    'fiber',
]
# Columns treated as nutrients to limit (a lower average is better)
negative_ntr = [
    'sugar',
    'calories',
    'total_fat',
    'sodium',
    'cholesterol',
]

### ----> zipping average values of nutritions and labels (nutritions) in dictionary format

In [None]:
def zipper(lis1,lis2):
    """Build a dict that maps each item of lis1 to the item of lis2 at the same position.

    Pairing stops at the end of the shorter sequence; if a key occurs more
    than once in lis1, the value paired with its last occurrence is kept.
    """
    return dict(zip(lis1, lis2))

# One {nutrient name: average} dictionary per cluster
label_0_dic, label_1_dic, label_2_dic, label_3_dic = (
    zipper(nutritions, cluster_avg)
    for cluster_avg in (label_0_nutri_avg, label_1_nutri_avg, label_2_nutri_avg, label_3_nutri_avg)
)

# Print the dictionaries, separated by blank lines
print("label_0_zipped",label_0_dic)
for cluster_id, cluster_dic in enumerate((label_1_dic, label_2_dic, label_3_dic), start=1):
    print(f"\n\nlabel_{cluster_id}_zipped",cluster_dic)

### ----> creating dictionary for each label (cluster) with positive nutrition and negative nutrition average values

In [None]:
def _mean_of_selected(nutrient_avgs, nutrient_names):
    """Return the plain average of the entries of nutrient_avgs named in nutrient_names."""
    selected = [nutrient_avgs[name] for name in nutrient_names]
    return sum(selected) / len(selected)


# For every cluster: 'pos' is the average over the positive nutrients and 'neg'
# the average over the negative nutrients. Each value is computed exactly once.
# NOTE(review): the per-nutrient averages are combined without any normalisation;
# if the columns are on different scales the largest ones dominate - confirm
# this is intended.
pn_avg_label_0, pn_avg_label_1, pn_avg_label_2, pn_avg_label_3 = (
    {
        'pos': _mean_of_selected(cluster_dic, positive_ntr),
        'neg': _mean_of_selected(cluster_dic, negative_ntr),
    }
    for cluster_dic in (label_0_dic, label_1_dic, label_2_dic, label_3_dic)
)

In [None]:
# Positive and negative averages of the four clusters, ordered label 0 to label 3
cluster_pn_avgs = (pn_avg_label_0, pn_avg_label_1, pn_avg_label_2, pn_avg_label_3)
pos = [cluster_avg['pos'] for cluster_avg in cluster_pn_avgs]
neg = [cluster_avg['neg'] for cluster_avg in cluster_pn_avgs]

In [None]:
# Display names of the four clusters, used as labels in the charts below.
# Note that this rebinds the name `labels`, which earlier held model.labels_.
labels = [f"label_{cluster_id}" for cluster_id in range(4)]

### ----> positive nutrition average (should be large)
picking high value cluster as winner

In [None]:
data = pos

palette_color = sn.color_palette('bright')
# Pull out the slice with the largest positive average (the "winner") instead of
# a fixed position, so the highlight follows the data if the cluster ids change.
largest_value = max(data)
explode = [0.1 if value == largest_value else 0 for value in data]
plt.pie(data, explode = explode, labels=labels, colors=palette_color, autopct='%.0f%%')
plt.title("positive neutrition distribution")
plt.show()

In [None]:
# Bar chart of the positive-nutrient average of each cluster
axis_positive = sn.barplot(x=labels, y=pos)
axis_positive.set_title("positive nutritions average value for 4 clusters")
axis_positive.set_xlabel("cluster or labels")
axis_positive.set_ylabel("average value of positive nutritions")
plt.show()

### ----> negative nutrition average (should be low)
picking low valued cluster as winner

In [None]:
data = neg

palette_color = sn.color_palette('bright')
# Pull out the slice with the smallest negative average (the "winner") instead of
# a fixed position, so the highlight follows the data if the cluster ids change.
smallest_value = min(data)
explode = [0.3 if value == smallest_value else 0 for value in data]
plt.pie(data, explode=explode, labels=labels, colors=palette_color, autopct='%.0f%%')
plt.title("negative neutrition distribution")
plt.show()

In [None]:
# Bar chart of the negative-nutrient average of each cluster
axis_negative = sn.barplot(x=labels, y=neg)
axis_negative.set_title("negative nutritions average value for 4 clusters")
axis_negative.set_xlabel("cluster or labels")
axis_negative.set_ylabel("average value of negative nutritions")
plt.show()

## Now the exciting part (which food should I consume)
>Using the scores in the next cell (positive average minus negative average), label 0 has a high positive average and the lowest negative average, so it ranks first; label 2 ranks second, label 1 third, and label 3 last.

In [None]:
# One (positive, negative) score pair per cluster, ordered label 0 to label 3.
# NOTE(review): these numbers are hard-coded; they look like the rounded
# percentages shown in the two pie charts (each column sums to about 100) -
# confirm, and refresh them whenever the clustering or the averages change.
labels_pos_neg = [
    (26, 10),
    (29, 27),
    (27, 18),
    (19, 45),
]
label_0_pn, label_1_pn, label_2_pn, label_3_pn = labels_pos_neg

In [None]:
# Score of each cluster: positive value minus negative value (one entry per label, in order)
healthy_level = [
    positive_score - negative_score
    for positive_score, negative_score in labels_pos_neg
]
healthy_level

## Healthiest foods
**prefer to eat these fast foods rather than others**🧐🍔

In [None]:
# Distinct item names in cluster 0 (shown under the "Healthiest foods" heading).
# NOTE(review): the cluster id is hard-coded; re-check it if the clustering changes.
label_0['item'].unique()

## Worst foods

In [None]:
# Distinct item names in cluster 3 (shown under the "Worst foods" heading).
# NOTE(review): the cluster id is hard-coded; re-check it if the clustering changes.
label_3['item'].unique()

## Medium healthy food

In [None]:
# Distinct item names in cluster 1 (shown under the "Medium healthy food" heading)
label_1['item'].unique()

In [None]:
# Distinct item names in cluster 2 (also part of the "Medium healthy food" section)
label_2['item'].unique()

## Thanks!