<a href="https://colab.research.google.com/github/KarolinaK-14/ML/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Library page: [https://scikit-learn.org](https://scikit-learn.org)

Documentation/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

The core library for machine learning in Python.

To install the library, use the command below:
```
!pip install scikit-learn
```
To update the library to the latest version, use the command below:
```
!pip install --upgrade scikit-learn
```

### Data preprocessing:
1. [Importing libraries](#0)
2. [Generating data](#1)
3. [Creating a copy of the data](#2)
4. [Changing data types and initial exploration](#3)
5. [LabelEncoder](#4)
6. [OneHotEncoder](#5)
7. [Pandas *get_dummies()*](#6)
8. [Standardization - StandardScaler](#7)
9. [Data preprocessing for the model](#8)


### <a name='0'></a> Importing libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn

# Display the installed scikit-learn version next to the outputs it produced
sklearn.__version__

'1.6.1'

### <a name='1'></a> Generating data

In [2]:
# Sample food table: six text attributes followed by three numeric columns.
food_columns = [
    'Food', 'Taste', 'Dosha Balanced', 'Primary Benefit', 'Season', 'Potency',
    'Calories per 100g', 'Protein (g)', 'Fiber (g)',
]
food_rows = [
    ('Ghee', 'Sweet', 'Vata, Pitta', 'Digestion, Immunity', 'All', 'Cooling', 900, 0.0, 0.0),
    ('Honey', 'Sweet, Astringent', 'Kapha', 'Energy, Detoxification', 'Winter', 'Heating', 304, 0.3, 0.2),
    ('Rice', 'Sweet', 'Pitta, Vata', 'Nourishment, Digestion', 'All', 'Cooling', 130, 2.7, 0.4),
    ('Almonds', 'Sweet', 'Vata, Pitta', 'Energy, Cognitive', 'Autumn', 'Heating', 576, 21.2, 12.5),
    ('Milk', 'Sweet', 'Vata, Pitta', 'Nourishment, Calming', 'Winter', 'Cooling', 42, 3.4, 0.0),
    ('Coconut', 'Sweet', 'Pitta, Vata', 'Cooling, Nourishing', 'Summer', 'Cooling', 354, 3.3, 9.0),
    ('Dates', 'Sweet', 'Vata, Pitta', 'Energy, Blood-building', 'Winter', 'Heating', 282, 2.5, 8.0),
    ('Banana', 'Sweet', 'Vata, Pitta', 'Energy, Nourishing', 'All', 'Cooling', 89, 1.1, 2.6),
    ('Mung Beans', 'Sweet, Astringent', 'All', 'Detoxification', 'Spring', 'Cooling', 105, 7.0, 7.6),
    ('Yogurt', 'Sour', 'Vata', 'Digestion', 'Summer', 'Heating', 59, 10.0, 0.0),
]

# Transpose the row records into the column -> list-of-values mapping
data = {
    column: list(values)
    for column, values in zip(food_columns, zip(*food_rows))
}
df_raw = pd.DataFrame(data)
df_raw

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,Cooling,900,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,Heating,304,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,Cooling,130,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,Heating,576,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,Cooling,42,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,Cooling,354,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,Heating,282,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,Cooling,89,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,Cooling,105,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,Heating,59,10.0,0.0


### <a name='2'></a> Creating a copy of the data

In [3]:
# Work on an independent (deep) copy so df_raw keeps the original values
df = df_raw.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Food               10 non-null     object 
 1   Taste              10 non-null     object 
 2   Dosha Balanced     10 non-null     object 
 3   Primary Benefit    10 non-null     object 
 4   Season             10 non-null     object 
 5   Potency            10 non-null     object 
 6   Calories per 100g  10 non-null     int64  
 7   Protein (g)        10 non-null     float64
 8   Fiber (g)          10 non-null     float64
dtypes: float64(2), int64(1), object(6)
memory usage: 852.0+ bytes


### <a name='3'></a> Changing data types and initial exploration

In [4]:
categorical_columns = ['Food', 'Taste', 'Dosha Balanced', 'Primary Benefit', 'Season', 'Potency']

# Convert all text columns to pandas categoricals in one assignment
df[categorical_columns] = df[categorical_columns].astype('category')

# Calories are stored as integers in the raw data; widen them to float
df['Calories per 100g'] = df['Calories per 100g'].astype(float)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Food               10 non-null     category
 1   Taste              10 non-null     category
 2   Dosha Balanced     10 non-null     category
 3   Primary Benefit    10 non-null     category
 4   Season             10 non-null     category
 5   Potency            10 non-null     category
 6   Calories per 100g  10 non-null     float64 
 7   Protein (g)        10 non-null     float64 
 8   Fiber (g)          10 non-null     float64 
dtypes: category(6), float64(3)
memory usage: 1.8 KB


In [5]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,Calories per 100g,Protein (g),Fiber (g)
count,10.0,10.0,10.0
mean,284.1,5.15,4.03
std,273.454221,6.416169,4.755827
min,42.0,0.0,0.0
25%,93.0,1.45,0.05
50%,206.0,3.0,1.5
75%,341.5,6.1,7.9
max,900.0,21.2,12.5


In [6]:
# Same summary, transposed: one row per numeric column
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Calories per 100g,10.0,284.1,273.454221,42.0,93.0,206.0,341.5,900.0
Protein (g),10.0,5.15,6.416169,0.0,1.45,3.0,6.1,21.2
Fiber (g),10.0,4.03,4.755827,0.0,0.05,1.5,7.9,12.5


In [7]:
# Summary of the categorical columns only (count, unique, top, freq)
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
Food,10,10,Almonds,1
Taste,10,3,Sweet,7
Dosha Balanced,10,5,"Vata, Pitta",5
Primary Benefit,10,10,"Cooling, Nourishing",1
Season,10,5,All,3
Potency,10,2,Cooling,6


In [8]:
# Categorical and float columns in one table; statistics that do not apply to a column are left empty (NaN)
df.describe(include=['category','float']).T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Food,10.0,10.0,Almonds,1.0,,,,,,,
Taste,10.0,3.0,Sweet,7.0,,,,,,,
Dosha Balanced,10.0,5.0,"Vata, Pitta",5.0,,,,,,,
Primary Benefit,10.0,10.0,"Cooling, Nourishing",1.0,,,,,,,
Season,10.0,5.0,All,3.0,,,,,,,
Potency,10.0,2.0,Cooling,6.0,,,,,,,
Calories per 100g,10.0,,,,284.1,273.454221,42.0,93.0,206.0,341.5,900.0
Protein (g),10.0,,,,5.15,6.416169,0.0,1.45,3.0,6.1,21.2
Fiber (g),10.0,,,,4.03,4.755827,0.0,0.05,1.5,7.9,12.5


In [9]:
# Frame after the dtype conversion (calories now display as floats)
df

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,Cooling,900.0,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,Heating,304.0,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,Cooling,130.0,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,Heating,576.0,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,Cooling,42.0,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,Cooling,354.0,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,Heating,282.0,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,Cooling,89.0,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,Cooling,105.0,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,Heating,59.0,10.0,0.0


### <a name='4'></a> LabelEncoder

In [10]:
from sklearn.preprocessing import LabelEncoder

# fit() returns the encoder itself, so construction and fitting can be chained
le = LabelEncoder().fit(df['Potency'])
le.transform(df['Potency'])

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 1])

In [11]:
# fit_transform() performs the fit and the transform in a single call
le.fit_transform(df['Potency'])

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 1])

In [12]:
# Learned labels; the position of a label in this array is its integer code
le.classes_

array(['Cooling', 'Heating'], dtype=object)

In [13]:
# df still holds the text labels: the encoder calls so far only returned arrays
df

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,Cooling,900.0,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,Heating,304.0,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,Cooling,130.0,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,Heating,576.0,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,Cooling,42.0,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,Cooling,354.0,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,Heating,282.0,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,Cooling,89.0,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,Cooling,105.0,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,Heating,59.0,10.0,0.0


In [14]:
# Overwrite the column with its integer codes (0 = 'Cooling', 1 = 'Heating')
df['Potency'] = le.fit_transform(df['Potency'])
df

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,0,900.0,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,1,304.0,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,0,130.0,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,1,576.0,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,0,42.0,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,0,354.0,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,1,282.0,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,0,89.0,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,0,105.0,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,1,59.0,10.0,0.0


In [15]:
# Map the integer codes back to the original text labels (returns an array, df is not modified)
le.inverse_transform(df['Potency'])

array(['Cooling', 'Heating', 'Cooling', 'Heating', 'Cooling', 'Cooling',
       'Heating', 'Cooling', 'Cooling', 'Heating'], dtype=object)

In [16]:
# Put the text labels back into the frame.
# NOTE(review): inverse_transform returned an object array in the previous output, so the
# column is presumably no longer 'category' dtype after this — confirm if that matters.
df['Potency'] = le.inverse_transform(df['Potency'])
df

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,Cooling,900.0,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,Heating,304.0,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,Cooling,130.0,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,Heating,576.0,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,Cooling,42.0,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,Cooling,354.0,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,Heating,282.0,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,Cooling,89.0,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,Cooling,105.0,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,Heating,59.0,10.0,0.0


In [17]:
# Encode once more so Potency holds integer codes from here on
df['Potency'] = le.fit_transform(df['Potency'])
df

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,0,900.0,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,1,304.0,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,0,130.0,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,1,576.0,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,0,42.0,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,0,354.0,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,1,282.0,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,0,89.0,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,0,105.0,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,1,59.0,10.0,0.0


### <a name='5'></a> OneHotEncoder

In [18]:
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False returns a dense NumPy array instead of a sparse matrix.
# The double brackets keep 'Season' as a one-column DataFrame (2-D input).
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(df[['Season']]).transform(df[['Season']])

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [19]:
# Categories learned by the encoder, in the column order of the one-hot array above
encoder.categories_

[array(['All', 'Autumn', 'Spring', 'Summer', 'Winter'], dtype=object)]

In [20]:
# Potency is missing from this summary because it currently holds integer codes, not a categorical
df.describe(include='category')

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season
count,10,10,10,10,10
unique,10,3,5,10,5
top,Almonds,Sweet,"Vata, Pitta","Cooling, Nourishing",All
freq,1,7,5,1,3


In [21]:
# drop='first' removes the indicator of the first category ('All'):
# four columns remain and an all-zero row stands for 'All'
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoder.fit_transform(df[['Season']])

array([[0., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [22]:
# The encoder calls returned new arrays; df still has its single Season column
df

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,0,900.0,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,1,304.0,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,0,130.0,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,1,576.0,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,0,42.0,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,0,354.0,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,1,282.0,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,0,89.0,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,0,105.0,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,1,59.0,10.0,0.0


In [23]:
# Start again from the raw data (text labels, integer calories) for the examples that follow
df = df_raw.copy()
df

Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,Cooling,900,0.0,0.0
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,Heating,304,0.3,0.2
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,Cooling,130,2.7,0.4
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,Heating,576,21.2,12.5
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,Cooling,42,3.4,0.0
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,Cooling,354,3.3,9.0
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,Heating,282,2.5,8.0
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,Cooling,89,1.1,2.6
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,Cooling,105,7.0,7.6
9,Yogurt,Sour,Vata,Digestion,Summer,Heating,59,10.0,0.0


### <a name='6'></a> Pandas *get_dummies()*

In [24]:
# One boolean indicator column per category of every text column; numeric columns pass through unchanged
pd.get_dummies(data=df)

Unnamed: 0,Calories per 100g,Protein (g),Fiber (g),Food_Almonds,Food_Banana,Food_Coconut,Food_Dates,Food_Ghee,Food_Honey,Food_Milk,...,"Primary Benefit_Energy, Nourishing","Primary Benefit_Nourishment, Calming","Primary Benefit_Nourishment, Digestion",Season_All,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Potency_Cooling,Potency_Heating
0,900,0.0,0.0,False,False,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
1,304,0.3,0.2,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,True
2,130,2.7,0.4,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,False,True,False
3,576,21.2,12.5,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
4,42,3.4,0.0,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,True,True,False
5,354,3.3,9.0,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
6,282,2.5,8.0,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,True
7,89,1.1,2.6,False,True,False,False,False,False,False,...,True,False,False,True,False,False,False,False,True,False
8,105,7.0,7.6,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
9,59,10.0,0.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True


In [25]:
# drop_first=True omits the indicator of the first category of each encoded column
pd.get_dummies(data=df, drop_first=True)

Unnamed: 0,Calories per 100g,Protein (g),Fiber (g),Food_Banana,Food_Coconut,Food_Dates,Food_Ghee,Food_Honey,Food_Milk,Food_Mung Beans,...,"Primary Benefit_Energy, Cognitive","Primary Benefit_Energy, Detoxification","Primary Benefit_Energy, Nourishing","Primary Benefit_Nourishment, Calming","Primary Benefit_Nourishment, Digestion",Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Potency_Heating
0,900,0.0,0.0,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,304,0.3,0.2,False,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,True
2,130,2.7,0.4,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,576,21.2,12.5,False,False,False,False,False,False,False,...,True,False,False,False,False,True,False,False,False,True
4,42,3.4,0.0,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,True,False
5,354,3.3,9.0,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
6,282,2.5,8.0,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
7,89,1.1,2.6,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
8,105,7.0,7.6,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
9,59,10.0,0.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True


In [26]:
# A single string prefix is applied to every encoded column, so the resulting
# names no longer show which source column an indicator came from
pd.get_dummies(data=df, drop_first=True, prefix='coding')

Unnamed: 0,Calories per 100g,Protein (g),Fiber (g),coding_Banana,coding_Coconut,coding_Dates,coding_Ghee,coding_Honey,coding_Milk,coding_Mung Beans,...,"coding_Energy, Cognitive","coding_Energy, Detoxification","coding_Energy, Nourishing","coding_Nourishment, Calming","coding_Nourishment, Digestion",coding_Autumn,coding_Spring,coding_Summer,coding_Winter,coding_Heating
0,900,0.0,0.0,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,304,0.3,0.2,False,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,True
2,130,2.7,0.4,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,576,21.2,12.5,False,False,False,False,False,False,False,...,True,False,False,False,False,True,False,False,False,True
4,42,3.4,0.0,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,True,False
5,354,3.3,9.0,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
6,282,2.5,8.0,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
7,89,1.1,2.6,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
8,105,7.0,7.6,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
9,59,10.0,0.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True


In [28]:
# prefix_sep replaces the default '_' between prefix and category with '~'
pd.get_dummies(data=df, drop_first=True, prefix='coding', prefix_sep='~')

Unnamed: 0,Calories per 100g,Protein (g),Fiber (g),coding~Banana,coding~Coconut,coding~Dates,coding~Ghee,coding~Honey,coding~Milk,coding~Mung Beans,...,"coding~Energy, Cognitive","coding~Energy, Detoxification","coding~Energy, Nourishing","coding~Nourishment, Calming","coding~Nourishment, Digestion",coding~Autumn,coding~Spring,coding~Summer,coding~Winter,coding~Heating
0,900,0.0,0.0,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,304,0.3,0.2,False,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,True
2,130,2.7,0.4,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,576,21.2,12.5,False,False,False,False,False,False,False,...,True,False,False,False,False,True,False,False,False,True
4,42,3.4,0.0,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,True,False
5,354,3.3,9.0,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
6,282,2.5,8.0,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
7,89,1.1,2.6,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
8,105,7.0,7.6,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
9,59,10.0,0.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True


In [29]:
# columns= restricts the encoding to the listed columns; the other text columns are kept as they are
pd.get_dummies(data=df, drop_first=True, prefix='coding', prefix_sep='~', columns=['Season', 'Taste'])

Unnamed: 0,Food,Dosha Balanced,Primary Benefit,Potency,Calories per 100g,Protein (g),Fiber (g),coding~Autumn,coding~Spring,coding~Summer,coding~Winter,coding~Sweet,"coding~Sweet, Astringent"
0,Ghee,"Vata, Pitta","Digestion, Immunity",Cooling,900,0.0,0.0,False,False,False,False,True,False
1,Honey,Kapha,"Energy, Detoxification",Heating,304,0.3,0.2,False,False,False,True,False,True
2,Rice,"Pitta, Vata","Nourishment, Digestion",Cooling,130,2.7,0.4,False,False,False,False,True,False
3,Almonds,"Vata, Pitta","Energy, Cognitive",Heating,576,21.2,12.5,True,False,False,False,True,False
4,Milk,"Vata, Pitta","Nourishment, Calming",Cooling,42,3.4,0.0,False,False,False,True,True,False
5,Coconut,"Pitta, Vata","Cooling, Nourishing",Cooling,354,3.3,9.0,False,False,True,False,True,False
6,Dates,"Vata, Pitta","Energy, Blood-building",Heating,282,2.5,8.0,False,False,False,True,True,False
7,Banana,"Vata, Pitta","Energy, Nourishing",Cooling,89,1.1,2.6,False,False,False,False,True,False
8,Mung Beans,All,Detoxification,Cooling,105,7.0,7.6,False,True,False,False,False,True
9,Yogurt,Vata,Digestion,Heating,59,10.0,0.0,False,False,True,False,False,False


### <a name='7'></a> Standardization - StandardScaler

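`std()` in pandas is unbiased by default (`ddof=1`, divides by *n* − 1).  
`std()` in NumPy is biased by default (`ddof=0`, divides by *n*).  
`StandardScaler` uses the biased (NumPy-style) estimate, which is why its results below differ slightly from the manual pandas calculation.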

In [39]:
protein = df['Protein (g)']

# Single print call; sep supplies the line break that separate print calls would add
print(
    f"{protein}\n",
    f"Average: {protein.mean()}\n",
    f"Standard deviation: {protein.std():.2f}",
    sep="\n",
)

0     0.0
1     0.3
2     2.7
3    21.2
4     3.4
5     3.3
6     2.5
7     1.1
8     7.0
9    10.0
Name: Protein (g), dtype: float64

Average: 5.15

Standard deviation: 6.42


In [40]:
# Reference statistics before scaling (mean and std of each numeric column)
df.describe()

Unnamed: 0,Calories per 100g,Protein (g),Fiber (g)
count,10.0,10.0,10.0
mean,284.1,5.15,4.03
std,273.454221,6.416169,4.755827
min,42.0,0.0,0.0
25%,93.0,1.45,0.05
50%,206.0,3.0,1.5
75%,341.5,6.1,7.9
max,900.0,21.2,12.5


In [41]:
# z-score by hand: subtract the mean, then divide by the pandas standard deviation
protein_col = df['Protein (g)']
protein_col.sub(protein_col.mean()).div(protein_col.std())

Unnamed: 0,Protein (g)
0,-0.80266
1,-0.755903
2,-0.381848
3,2.501493
4,-0.272748
5,-0.288334
6,-0.413019
7,-0.631218
8,0.288334
9,0.755903


In [46]:
def standardize(x, ddof=None):
  """Return the z-scores of ``x``, i.e. ``(x - mean) / std``.

  Args:
    x: pandas Series/DataFrame or NumPy array of numeric values.
    ddof: Delta degrees of freedom forwarded to ``x.std``. ``None`` (the
      default) keeps the container's own default, which is 1 for pandas
      objects and 0 for NumPy arrays. Pass ``ddof=0`` to reproduce the
      values of scikit-learn's ``StandardScaler``.

  Returns:
    An object of the same kind as ``x`` holding the standardized values.
  """
  # Only forward ddof when it was given, so existing calls behave exactly as before
  spread = x.std() if ddof is None else x.std(ddof=ddof)
  return (x - x.mean()) / spread

# Apply the helper to the protein column
standardize(df['Protein (g)'])

Unnamed: 0,Protein (g)
0,-0.80266
1,-0.755903
2,-0.381848
3,2.501493
4,-0.272748
5,-0.288334
6,-0.413019
7,-0.631218
8,0.288334
9,0.755903


In [51]:
from sklearn.preprocessing import scale, StandardScaler

# scale(df['Protein (g)']) is the function-style alternative (not executed here)

# The double brackets pass a one-column DataFrame (2-D input) to the scaler.
# StandardScaler divides by the biased standard deviation (ddof=0), so these
# values differ slightly from the manual pandas results, which use ddof=1.
scaler = StandardScaler()
scaler.fit_transform(df[['Protein (g)']])

array([[-0.84607759],
       [-0.79679152],
       [-0.40250293],
       [ 2.63680492],
       [-0.28750209],
       [-0.30393079],
       [-0.43536031],
       [-0.66536199],
       [ 0.30393079],
       [ 0.79679152]])

In [58]:
# Standardize all numeric columns with ONE fit. StandardScaler works column by
# column, so the scaled values match fitting each column separately, but the
# fitted scaler now keeps the mean/scale of all three columns (a second
# fit_transform call would overwrite the statistics of the first one) and can
# be reused for inverse_transform or for transforming new data.
numeric_columns = ['Calories per 100g', 'Protein (g)', 'Fiber (g)']
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
df


Unnamed: 0,Food,Taste,Dosha Balanced,Primary Benefit,Season,Potency,Calories per 100g,Protein (g),Fiber (g)
0,Ghee,Sweet,"Vata, Pitta","Digestion, Immunity",All,Cooling,2.374129,-0.846078,-0.893219
1,Honey,"Sweet, Astringent",Kapha,"Energy, Detoxification",Winter,Heating,0.076709,-0.796792,-0.84889
2,Rice,Sweet,"Pitta, Vata","Nourishment, Digestion",All,Cooling,-0.594014,-0.402503,-0.804562
3,Almonds,Sweet,"Vata, Pitta","Energy, Cognitive",Autumn,Heating,1.125196,2.636805,1.877311
4,Milk,Sweet,"Vata, Pitta","Nourishment, Calming",Winter,Cooling,-0.93323,-0.287502,-0.893219
5,Coconut,Sweet,"Pitta, Vata","Cooling, Nourishing",Summer,Cooling,0.269446,-0.303931,1.101562
6,Dates,Sweet,"Vata, Pitta","Energy, Blood-building",Winter,Heating,-0.008095,-0.43536,0.87992
7,Banana,Sweet,"Vata, Pitta","Energy, Nourishing",All,Cooling,-0.752058,-0.665362,-0.316949
8,Mung Beans,"Sweet, Astringent",All,Detoxification,Spring,Cooling,-0.690382,0.303931,0.791263
9,Yogurt,Sour,Vata,Digestion,Summer,Heating,-0.8677,0.796792,-0.893219


### <a name='8'></a> Data preprocessing for the model

In [59]:
# Second toy dataset; from here on data, df_raw and df refer to it instead of the food table
data = dict(
    size=['XL', 'L', 'M', 'L', 'M'],
    color=['red', 'green', 'blue', 'green', 'red'],
    gender=['female', 'male', 'male', 'female', 'female'],
    price=[199.0, 89.0, 99.0, 129.0, 79.0],
    weight=[500, 450, 300, 380, 410],
    bought=['yes', 'no', 'yes', 'no', 'yes'],
)

df_raw = pd.DataFrame(data)

# Independent working copy; df_raw stays as the untouched original
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [71]:
# Reset df to a fresh copy of df_raw
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [72]:
# Frame as it looks before preprocessing
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [73]:
# 1) Numeric features -> zero mean, unit variance
numeric_features = ['price', 'weight']
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# 2) Target column -> integer class labels (must happen before step 3,
#    otherwise 'bought' would be one-hot encoded with the other text columns)
le = LabelEncoder().fit(df['bought'])
df['bought'] = le.transform(df['bought'])

# 3) Remaining text columns -> 0/1 indicator columns, first category dropped
df = pd.get_dummies(df, drop_first=True, dtype=int)

df

Unnamed: 0,price,weight,bought,size_M,size_XL,color_green,color_red,gender_male
0,1.845062,1.366002,1,0,1,0,1,0
1,-0.691898,0.62361,0,0,0,1,0,1
2,-0.461266,-1.603567,1,1,0,0,0,1
3,0.230633,-0.41574,0,0,0,1,0,0
4,-0.922531,0.029696,1,1,0,0,1,0


In [78]:
test_data = {
    'size': ['XL', 'L', 'M', 'L', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red'],
    'gender': ['female', 'male', 'male', 'female', 'female'],
}
test_df = pd.DataFrame(data=test_data)

# dtype=int yields 0/1 integers directly (same call style as the training frame),
# so no hard-coded list of indicator column names is needed; such a list raises
# a KeyError as soon as one of the categories is absent from the data.
# NOTE: encoding new data separately only lines up with the training columns
# when both contain the same categories; otherwise align the result, e.g. with
# DataFrame.reindex(columns=<training columns>, fill_value=0).
test_df = pd.get_dummies(data=test_df, drop_first=True, dtype=int)
test_df

Unnamed: 0,size_M,size_XL,color_green,color_red,gender_male
0,0,1,0,1,0
1,0,0,1,0,1
2,1,0,0,0,1
3,0,0,1,0,0
4,1,0,0,1,0
