In [1]:
# Data standardization: preparing a DataFrame for use in an ML model.

In [2]:
import numpy as np
import pandas as pd
import sklearn

In [3]:
# Build a small sample frame from a column -> values mapping. df_raw keeps the
# original values; df is an independent copy used for all further processing.
data = dict(
    size=['XL', 'L', 'M', 'L', 'M'],
    color=['red', 'green', 'blue', 'green', 'red'],
    gender=['female', 'male', 'male', 'female', 'female'],
    price=[199.0, 89.0, 99.0, 129.0, 79.0],
    weight=[500, 450, 300, 380, 410],
    bought=['yes', 'no', 'yes', 'no', 'yes'],
)

df_raw = pd.DataFrame(data)
df = df_raw.copy()

In [4]:
# Cast size, color, gender and bought to the pandas category dtype and weight to
# float in a single astype call, then print the resulting schema.
df = df.astype({
    'size': 'category',
    'color': 'category',
    'gender': 'category',
    'bought': 'category',
    'weight': 'float',
})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   size    5 non-null      category
 1   color   5 non-null      category
 2   gender  5 non-null      category
 3   price   5 non-null      float64 
 4   weight  5 non-null      float64 
 5   bought  5 non-null      category
dtypes: category(4), float64(2)
memory usage: 740.0 bytes


In [5]:
# Summary statistics for the numeric columns only (price and weight);
# describe() leaves the categorical columns out by default.
df.describe()

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,408.0
std,48.476799,75.299402
min,79.0,300.0
25%,89.0,380.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [6]:
# Summary of the categorical columns: count, number of unique values,
# the most frequent value (top) and how often it occurs (freq).
df.describe(include = ['category'])

Unnamed: 0,size,color,gender,bought
count,5,5,5,5
unique,3,3,2,2
top,L,green,female,yes
freq,2,2,3,3


In [7]:
# Label encoding: LabelEncoder maps the classes of 'bought' to integers in sorted
# order ('no' -> 0, 'yes' -> 1). fit_transform learns the classes and encodes in a
# single call; inverse_transform maps the integers back to the original labels.
# The round trip below leaves df['bought'] holding yes/no again, so the following
# cells still see the original labels.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['bought'] = le.fit_transform(df['bought'])      # yes/no -> 1/0
df['bought'] = le.inverse_transform(df['bought'])  # 1/0 -> yes/no

In [8]:
# One-hot encode a column with more than two categories (size).
# The output is a dense array with one column per category, in the order L, M, XL.
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
encoder.fit_transform(df[['size']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [9]:
# drop='first' removes the indicator column of the first category (L).
# A row of all zeros then means "L", so no information is lost and the remaining
# columns (M, XL) are no longer redundant with each other.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoder.fit_transform(df[['size']])

array([[0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.]])

In [10]:
# Show the working frame: the encoder calls above only returned new arrays,
# and 'bought' was mapped back to yes/no by inverse_transform.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,99.0,300.0,yes
3,L,green,female,129.0,380.0,no
4,M,red,female,79.0,410.0,yes


In [11]:
# pd.get_dummies is more flexible than OneHotEncoder here: called on the whole frame it
# turns every categorical column into 0/1 indicator columns (one per category value,
# named <column>_<value>) and keeps the numeric columns as they are.

pd.get_dummies(data=df)

Unnamed: 0,price,weight,size_L,size_M,size_XL,color_blue,color_green,color_red,gender_female,gender_male,bought_no,bought_yes
0,199.0,500.0,0,0,1,0,0,1,1,0,0,1
1,89.0,450.0,1,0,0,0,1,0,0,1,1,0
2,99.0,300.0,0,1,0,1,0,0,0,1,0,1
3,129.0,380.0,1,0,0,0,1,0,1,0,1,0
4,79.0,410.0,0,1,0,0,0,1,1,0,0,1


In [12]:
# drop_first=True drops the first level of every encoded column (size L, color blue,
# gender female, bought no). That level is implied when all remaining indicators are 0,
# so dropping it removes a redundant, perfectly collinear column.
# prefix='New' replaces each original column name in the generated column labels.
pd.get_dummies(data = df, drop_first = True, prefix = 'New')

Unnamed: 0,price,weight,New_M,New_XL,New_green,New_red,New_male,New_yes
0,199.0,500.0,0,1,0,1,0,1
1,89.0,450.0,0,0,1,0,1,0
2,99.0,300.0,1,0,0,0,1,1
3,129.0,380.0,0,0,1,0,0,0
4,79.0,410.0,1,0,0,1,0,1


In [13]:
# get_dummies can also be applied to particular columns only, via `columns`;
# the remaining columns are left untouched.

pd.get_dummies(data = df, drop_first = True, columns = ['size'])

Unnamed: 0,color,gender,price,weight,bought,size_M,size_XL
0,red,female,199.0,500.0,yes,0,1
1,green,male,89.0,450.0,no,0,0
2,blue,male,99.0,300.0,yes,1,0
3,green,female,129.0,380.0,no,0,0
4,red,female,79.0,410.0,yes,1,0


In [14]:
# Basic statistics of the price column (float64): the raw values, the mean and the
# standard deviation (the printed label is Polish for "standard deviation").
price_col = df['price']

print(f"{price_col}\n")
print(f"{price_col.mean()}")
print(f"Odchylenie standardowe: {price_col.std():.4f}")

0    199.0
1     89.0
2     99.0
3    129.0
4     79.0
Name: price, dtype: float64

119.0
Odchylenie standardowe: 48.4768


In [18]:
# Manual standardization (z-score): subtract the mean and divide by the standard
# deviation. pandas' .std() is the sample standard deviation (ddof=1).
(df['price'] - df['price'].mean()) / df['price'].std()

0    1.650274
1   -0.618853
2   -0.412568
3    0.206284
4   -0.825137
Name: price, dtype: float64

In [20]:
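# The same computation can be wrapped in a helper function. sklearn offers scale / StandardScaler for this too (they return a NumPy array), but they divide by the population std (ddof=0), so their values differ slightly from the pandas result (ddof=1).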
def standarize(x, ddof=None):
  """Return the z-scores of ``x``: ``(x - x.mean()) / x.std()``.

  Parameters
  ----------
  x : pandas Series/DataFrame or NumPy array
      Values to standardize; must provide ``.mean()`` and ``.std()``.
  ddof : int, optional
      Delta degrees of freedom forwarded to ``x.std``. When omitted, the
      default of ``x`` itself is used: pandas objects use the sample standard
      deviation (ddof=1), NumPy arrays the population one (ddof=0). Pass
      ``ddof=0`` to reproduce the values of sklearn's ``scale`` and
      ``StandardScaler``.

  Returns
  -------
  Same type as ``x``, centred on 0 and divided by the chosen standard deviation.
  """
  # Only forward ddof when the caller asked for it, so the original
  # behaviour (the container's own default) is preserved.
  sigma = x.std() if ddof is None else x.std(ddof=ddof)
  return (x - x.mean()) / sigma

# Apply the helper to the price column (a pandas Series in, a Series out).
standarize(df['price'])

0    1.650274
1   -0.618853
2   -0.412568
3    0.206284
4   -0.825137
Name: price, dtype: float64

In [21]:
# sklearn's scale() also standardizes, but it returns a NumPy array and divides by the
# population standard deviation (ddof=0). Its values therefore differ slightly from the
# pandas-based results, which use the sample standard deviation (ddof=1).
from sklearn.preprocessing import scale

scale(df['price'])

array([ 1.84506242, -0.69189841, -0.4612656 ,  0.2306328 , -0.92253121])

In [22]:
# StandardScaler is the estimator-style alternative: it learns the mean and scale
# during fitting and returns the standardized values as a 2-D array.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit_transform(df[['price']])

array([[ 1.84506242],
       [-0.69189841],
       [-0.4612656 ],
       [ 0.2306328 ],
       [-0.92253121]])

In [24]:
# Standardize both numeric columns (price and weight) with a fresh StandardScaler
# and write the scaled values back into the frame.
numeric_cols = ['price', 'weight']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.845062,1.366002,yes
1,L,green,male,-0.691898,0.62361,no
2,M,blue,male,-0.461266,-1.603567,yes
3,L,green,female,0.230633,-0.41574,no
4,M,red,female,-0.922531,0.029696,yes


In [25]:
# Start over from a fresh copy of the raw data so the whole preprocessing
# can be repeated in one go.
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [27]:
# Full preprocessing in one cell, reusing the `le` and `scaler` objects created earlier:
# 1. encode the target column 'bought' as integers,
# 2. standardize the numeric columns,
# 3. one-hot encode the remaining categorical columns, dropping the first level of each.
numeric_cols = ['price', 'weight']
df['bought'] = le.fit_transform(df['bought'])
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,price,weight,bought,size_M,size_XL,color_green,color_red,gender_male
0,1.845062,1.366002,1,0,1,0,1,0
1,-0.691898,0.62361,0,0,0,1,0,1
2,-0.461266,-1.603567,1,1,0,0,0,1
3,0.230633,-0.41574,0,0,0,1,0,0
4,-0.922531,0.029696,1,1,0,0,1,0
