<a href="https://colab.research.google.com/github/KarolinaK-14/ML/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Library page: [https://scikit-learn.org](https://scikit-learn.org)

Documentation/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

The core library for machine learning in Python.

To install the library, use the command below:
```
!pip install scikit-learn
```
To update the library to the latest version, use the command below:
```
!pip install --upgrade scikit-learn
```

### Data preprocessing:
1. [Importing libraries](#0)
2. [Generating data](#1)
3. [Creating a copy of the data](#2)
4. [Changing data types and initial exploration](#3)
5. [LabelEncoder](#4)
6. [OneHotEncoder](#5)
7. [Pandas *get_dummies()*](#6)
8. [Standardization - StandardScaler](#7)
9. [Data preprocessing for the model](#8)


### <a name='0'></a> Importing libraries

In [2]:
import numpy as np
import pandas as pd
import sklearn

# Show the installed scikit-learn version for reference.
sklearn.__version__

'1.6.1'

### <a name='1'></a> Generating data

In [3]:
# Toy purchase dataset written row by row: one tuple per item, one entry per column.
columns = ['size', 'color', 'gender', 'price', 'weight', 'bought']
rows = [
    ('XL', 'red', 'female', 199.0, 500, 'yes'),
    ('L', 'green', 'male', 89.0, 450, 'no'),
    ('M', 'blue', 'male', 99.0, 300, 'yes'),
    ('L', 'green', 'female', 129.0, 380, 'no'),
    ('M', 'red', 'female', 79.0, 410, 'yes'),
]

# Pivot the rows into a column-name -> list-of-values mapping.
data = {name: list(values) for name, values in zip(columns, zip(*rows))}

# Raw frame; later cells work on copies of it.
df_raw = pd.DataFrame(data=data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


### <a name='2'></a> Creating a copy of the data

In [4]:
# Work on a copy so that df_raw keeps the original, unmodified data.
df = df_raw.copy()
# Print column dtypes, non-null counts and memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   5 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  5 non-null      int64  
 5   bought  5 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 372.0+ bytes


### <a name='3'></a> Changing data types and initial exploration

In [5]:
# Declare the text columns as categoricals and store weight as float,
# using a single column -> dtype mapping.
df = df.astype({
    'size': 'category',
    'color': 'category',
    'gender': 'category',
    'bought': 'category',
    'weight': 'float',
})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   size    5 non-null      category
 1   color   5 non-null      category
 2   gender  5 non-null      category
 3   price   5 non-null      float64 
 4   weight  5 non-null      float64 
 5   bought  5 non-null      category
dtypes: category(4), float64(2)
memory usage: 744.0 bytes


In [6]:
# Summary statistics; with no arguments only the numeric columns (price, weight) are summarized.
df.describe()

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,408.0
std,48.476799,75.299402
min,79.0,300.0
25%,89.0,380.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [7]:
# Same summary, transposed so that each column of df becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,5.0,119.0,48.476799,79.0,89.0,99.0,129.0,199.0
weight,5.0,408.0,75.299402,300.0,380.0,410.0,450.0,500.0


In [8]:
# Summary of the categorical columns only: count, number of unique values,
# most frequent value and its frequency.
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
size,5,3,L,2
color,5,3,green,2
gender,5,2,female,3
bought,5,2,yes,3


In [9]:
# Combined summary of categorical and float columns; statistics that do not
# apply to a column's dtype are left empty (NaN).
df.describe(include=['category','float']).T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
size,5.0,3.0,L,2.0,,,,,,,
color,5.0,3.0,green,2.0,,,,,,,
gender,5.0,2.0,female,3.0,,,,,,,
price,5.0,,,,119.0,48.476799,79.0,89.0,99.0,129.0,199.0
weight,5.0,,,,408.0,75.299402,300.0,380.0,410.0,450.0,500.0
bought,5.0,2.0,yes,3.0,,,,,,,


In [10]:
# Display the frame after the dtype conversion (weight is now float).
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,99.0,300.0,yes
3,L,green,female,129.0,380.0,no
4,M,red,female,79.0,410.0,yes


### <a name='4'></a> LabelEncoder

In [11]:
from sklearn.preprocessing import LabelEncoder

# Two-step usage: fit() learns the classes, transform() maps each label to its integer code.
le = LabelEncoder()
le.fit(df['bought'])
le.transform(df['bought'])

array([1, 0, 1, 0, 1])

In [12]:
# fit_transform() performs both steps in one call and yields the same codes.
le.fit_transform(df['bought'])

array([1, 0, 1, 0, 1])

In [13]:
# Learned classes; a label's position in this array is its integer code ('no' -> 0, 'yes' -> 1).
le.classes_

array(['no', 'yes'], dtype=object)

In [14]:
# Replace the text labels in the frame with their integer codes.
df['bought'] = le.fit_transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,380.0,0
4,M,red,female,79.0,410.0,1


In [15]:
# Map the integer codes back to the original text labels (the frame is not modified here).
le.inverse_transform(df['bought'])

array(['yes', 'no', 'yes', 'no', 'yes'], dtype=object)

In [16]:
# Restore the original text labels in the frame, undoing the encoding applied earlier.
df['bought'] = le.inverse_transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,99.0,300.0,yes
3,L,green,female,129.0,380.0,no
4,M,red,female,79.0,410.0,yes


### <a name='5'></a> OneHotEncoder

In [17]:
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes the encoder return a dense NumPy array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
# Double brackets select a one-column DataFrame (2-D input) rather than a Series.
encoder.fit(df[['size']])

In [18]:
# One row per sample, one indicator column per category of 'size'.
encoder.transform(df[['size']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [19]:
# Categories found during fit; their order matches the column order of the encoded array.
encoder.categories_

[array(['L', 'M', 'XL'], dtype=object)]

In [20]:
# drop='first' omits the indicator column of the first category ('L');
# rows belonging to that category are then encoded as all zeros.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoder.fit_transform(df[['size']])

array([[0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.]])

In [21]:
# The frame itself is unchanged: the encoder calls above only returned new arrays.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,99.0,300.0,yes
3,L,green,female,129.0,380.0,no
4,M,red,female,79.0,410.0,yes


### <a name='6'></a> Pandas *get_dummies()*

In [22]:
# One-hot encode every non-numeric column of the frame at once; price and weight pass through unchanged.
pd.get_dummies(data=df)

Unnamed: 0,price,weight,size_L,size_M,size_XL,color_blue,color_green,color_red,gender_female,gender_male,bought_no,bought_yes
0,199.0,500.0,False,False,True,False,False,True,True,False,False,True
1,89.0,450.0,True,False,False,False,True,False,False,True,True,False
2,99.0,300.0,False,True,False,True,False,False,False,True,False,True
3,129.0,380.0,True,False,False,False,True,False,True,False,True,False
4,79.0,410.0,False,True,False,False,False,True,True,False,False,True


In [23]:
# drop_first=True drops the first level of each encoded column (e.g. size_L, color_blue).
pd.get_dummies(data=df, drop_first=True)

Unnamed: 0,price,weight,size_M,size_XL,color_green,color_red,gender_male,bought_yes
0,199.0,500.0,False,True,False,True,False,True
1,89.0,450.0,False,False,True,False,True,False
2,99.0,300.0,True,False,False,False,True,True
3,129.0,380.0,False,False,True,False,False,False
4,79.0,410.0,True,False,False,True,False,True


In [24]:
# prefix replaces the source column name at the start of every generated column name.
pd.get_dummies(data=df, drop_first=True, prefix='funny')

Unnamed: 0,price,weight,funny_M,funny_XL,funny_green,funny_red,funny_male,funny_yes
0,199.0,500.0,False,True,False,True,False,True
1,89.0,450.0,False,False,True,False,True,False
2,99.0,300.0,True,False,False,False,True,True
3,129.0,380.0,False,False,True,False,False,False
4,79.0,410.0,True,False,False,True,False,True


In [25]:
# prefix_sep sets the separator placed between the prefix and the category name.
pd.get_dummies(data=df, drop_first=True, prefix='funny', prefix_sep='~')

Unnamed: 0,price,weight,funny~M,funny~XL,funny~green,funny~red,funny~male,funny~yes
0,199.0,500.0,False,True,False,True,False,True
1,89.0,450.0,False,False,True,False,True,False
2,99.0,300.0,True,False,False,False,True,True
3,129.0,380.0,False,False,True,False,False,False
4,79.0,410.0,True,False,False,True,False,True


In [26]:
# Encode only the listed columns; gender and bought are left as they are.
pd.get_dummies(
    data=df,
    columns=['size', 'color'],
    prefix='funny',
    prefix_sep='~',
    drop_first=True,
)

Unnamed: 0,gender,price,weight,bought,funny~M,funny~XL,funny~green,funny~red
0,female,199.0,500.0,yes,False,True,False,True
1,male,89.0,450.0,no,False,False,True,False
2,male,99.0,300.0,yes,True,False,False,False
3,female,129.0,380.0,no,False,False,True,False
4,female,79.0,410.0,yes,True,False,False,True


### <a name='7'></a> Standardization - StandardScaler

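pandas `std()` is unbiased by default: it divides by $n - 1$ (`ddof=1`, sample standard deviation).  
numpy `std()` is biased by default: it divides by $n$ (`ddof=0`, population standard deviation). scikit-learn's `scale` and `StandardScaler` use this biased variant, which is why their results differ slightly from the pandas-based calculations below.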

In [27]:
# Print the ingredients of standardization one after another:
# the raw prices, the mean-centered prices, and the prices divided by their standard deviation.
for step in (
    df['price'],
    df['price'] - df['price'].mean(),
    df['price'] / df['price'].std(),
):
  print(f"{step}\n")

0    199.0
1     89.0
2     99.0
3    129.0
4     79.0
Name: price, dtype: float64

0    80.0
1   -30.0
2   -20.0
3    10.0
4   -40.0
Name: price, dtype: float64

0    4.105057
1    1.835930
2    2.042214
3    2.661067
4    1.629646
Name: price, dtype: float64



In [28]:
# Full standardization (z-score): center by the mean, then divide by the pandas standard deviation.
(df['price'] - df['price'].mean()) / df['price'].std()

Unnamed: 0,price
0,1.650274
1,-0.618853
2,-0.412568
3,0.206284
4,-0.825137


In [29]:
def standardize(x):
  """Return ``x`` centered on its mean and divided by its standard deviation.

  ``x`` must provide ``mean()`` and ``std()`` and support arithmetic with
  their results; the notebook calls it with a pandas Series. The standard
  deviation is whatever ``x.std()`` returns for that object.
  """
  centered = x - x.mean()
  return centered / x.std()

# Apply the helper to the price column; the result matches the inline z-score computed earlier.
standardize(df['price'])

Unnamed: 0,price
0,1.650274
1,-0.618853
2,-0.412568
3,0.206284
4,-0.825137


In [30]:
from sklearn.preprocessing import scale

# scikit-learn's scale() yields slightly larger magnitudes than the pandas-based version,
# consistent with dividing by the biased (ddof=0) standard deviation instead of the unbiased one.
scale(df['price'])


array([ 1.84506242, -0.69189841, -0.4612656 ,  0.2306328 , -0.92253121])

In [36]:
from sklearn.preprocessing import StandardScaler

# Standardize both numeric columns in one go and write the scaled values back into the frame.
scaler = StandardScaler()
df[['price', 'weight']] = scaler.fit_transform(df[['price', 'weight']])
df


Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.845062,1.366002,yes
1,L,green,male,-0.691898,0.62361,no
2,M,blue,male,-0.461266,-1.603567,yes
3,L,green,female,0.230633,-0.41574,no
4,M,red,female,-0.922531,0.029696,yes


### <a name='8'></a> Data preprocessing for the model

In [37]:
# Start over from the untouched raw data for the end-to-end preprocessing run.
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [41]:
# End-to-end preprocessing of the working frame.
# The bare `df` expression that used to sit before df.info() was removed: only the last
# expression of a cell is displayed, so it had no effect.

# 1. Encode the binary target column 'bought' as integer codes.
le = LabelEncoder()
df['bought'] = le.fit_transform(df['bought'])

# 2. Standardize the numeric features (zero mean, unit variance).
scaler = StandardScaler()
df[['price', 'weight']] = scaler.fit_transform(df[['price', 'weight']])

# 3. One-hot encode the remaining text columns, dropping the first level of each.
df = pd.get_dummies(data=df, drop_first=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        5 non-null      float64
 1   weight       5 non-null      float64
 2   bought       5 non-null      int64  
 3   size_M       5 non-null      bool   
 4   size_XL      5 non-null      bool   
 5   color_green  5 non-null      bool   
 6   color_red    5 non-null      bool   
 7   gender_male  5 non-null      bool   
dtypes: bool(5), float64(2), int64(1)
memory usage: 277.0 bytes


In [44]:
# The frame was already one-hot encoded by the preceding preprocessing cell, so
# get_dummies is not called again here; only the bool indicator columns it produced
# still need to be cast to integers.
# Selecting the columns by dtype avoids a hard-coded list and keeps the cell safe to re-run
# (with no bool columns left, the cast is a no-op).
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({col: 'int' for col in bool_cols})

df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        5 non-null      float64
 1   weight       5 non-null      float64
 2   bought       5 non-null      int64  
 3   size_M       5 non-null      int64  
 4   size_XL      5 non-null      int64  
 5   color_green  5 non-null      int64  
 6   color_red    5 non-null      int64  
 7   gender_male  5 non-null      int64  
dtypes: float64(2), int64(6)
memory usage: 452.0 bytes


Unnamed: 0,price,weight,bought,size_M,size_XL,color_green,color_red,gender_male
0,1.845062,1.366002,1,0,1,0,1,0
1,-0.691898,0.62361,0,0,0,1,0,1
2,-0.461266,-1.603567,1,1,0,0,0,1
3,0.230633,-0.41574,0,0,0,1,0,0
4,-0.922531,0.029696,1,1,0,0,1,0
