In [1]:
import numpy as np
import pandas as pd

In [5]:
# Toy student records. np.nan marks the entries that are deliberately left
# missing so the imputation steps below have something to fill in.
data = dict(
    Age=[18, 19, np.nan, 20, 21, np.nan, 23, 24, 25, 26],
    Marks=[85, np.nan, 78, 90, np.nan, 88, 72, 86, np.nan, 62],
    Attendance=[92, 85, 88, np.nan, 90, np.nan, 75, 65, 80, 85],
    Gender=[
        'Male', 'Female', 'Male', np.nan, 'Female',
        'Male', 'Male', np.nan, 'Female', 'Male',
    ],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Age,Marks,Attendance,Gender
0,18.0,85.0,92.0,Male
1,19.0,,85.0,Female
2,,78.0,88.0,Male
3,20.0,90.0,,
4,21.0,,90.0,Female
5,,88.0,,Male
6,23.0,72.0,75.0,Male
7,24.0,86.0,65.0,
8,25.0,,80.0,Female
9,26.0,62.0,85.0,Male


## Handling Missing Values

In [6]:
# Count the missing entries in each column.
df.isna().sum()

Age           2
Marks         3
Attendance    2
Gender        2
dtype: int64

### Simple Imputer

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
# Show the frame again so the missing entries are visible right before imputing.
df

Unnamed: 0,Age,Marks,Attendance,Gender
0,18.0,85.0,92.0,Male
1,19.0,,85.0,Female
2,,78.0,88.0,Male
3,20.0,90.0,,
4,21.0,,90.0,Female
5,,88.0,,Male
6,23.0,72.0,75.0,Male
7,24.0,86.0,65.0,
8,25.0,,80.0,Female
9,26.0,62.0,85.0,Male


In [12]:
num_cols = ['Age', 'Marks', 'Attendance']

# Learn each numeric column's mean from the observed values, then write the
# filled-in columns back into the frame.
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(df[num_cols])
df[num_cols] = num_imputer.transform(df[num_cols])
df

Unnamed: 0,Age,Marks,Attendance,Gender
0,18.0,85.0,92.0,Male
1,19.0,80.142857,85.0,Female
2,22.0,78.0,88.0,Male
3,20.0,90.0,82.5,
4,21.0,80.142857,90.0,Female
5,22.0,88.0,82.5,Male
6,23.0,72.0,75.0,Male
7,24.0,86.0,65.0,
8,25.0,80.142857,80.0,Female
9,26.0,62.0,85.0,Male


In [14]:
# Fill the missing Gender entries with the most frequent category.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(df[['Gender']])
df[['Gender']] = cat_imputer.transform(df[['Gender']])
df

Unnamed: 0,Age,Marks,Attendance,Gender
0,18.0,85.0,92.0,Male
1,19.0,80.142857,85.0,Female
2,22.0,78.0,88.0,Male
3,20.0,90.0,82.5,Male
4,21.0,80.142857,90.0,Female
5,22.0,88.0,82.5,Male
6,23.0,72.0,75.0,Male
7,24.0,86.0,65.0,Male
8,25.0,80.142857,80.0,Female
9,26.0,62.0,85.0,Male


## Scaler

In [16]:
# Small all-numeric frame for the scaling demos; note that Salary is on a far
# larger scale than Age and Marks.
df = pd.DataFrame(dict(
    Age=list(range(18, 26)),
    Salary=[15000, 22000, 35000, 48000, 60000, 50000, 42000, 28000],
    Marks=[65, 70, 80, 85, 90, 74, 80, 88],
))

df

Unnamed: 0,Age,Salary,Marks
0,18,15000,65
1,19,22000,70
2,20,35000,80
3,21,48000,85
4,22,60000,90
5,23,50000,74
6,24,42000,80
7,25,28000,88


### Standard Scaler
Rescales each column so that its mean is 0 and its standard deviation is 1.

In [15]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Standardise every column to zero mean and unit standard deviation. The
# source frame is left untouched; the scaled values go into a copy.
scaler = StandardScaler()
standardized = scaler.fit_transform(df[['Age', 'Salary', 'Marks']])

df_standard = df.copy()
df_standard[['Age', 'Salary', 'Marks']] = standardized

df_standard

Unnamed: 0,Age,Salary,Marks
0,-1.527525,-1.573388,-1.700879
1,-1.091089,-1.08389,-1.093422
2,-0.654654,-0.174821,0.121491
3,-0.218218,0.734248,0.728948
4,0.218218,1.573388,1.336405
5,0.654654,0.874105,-0.607457
6,1.091089,0.314678,0.121491
7,1.527525,-0.664319,1.093422


### Min Max Scaler
Rescales each column so that all of its values lie between 0 and 1.

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
# Map each column linearly onto [0, 1]: the column minimum becomes 0 and the
# column maximum becomes 1. The result is stored in a copy of the frame.
minmax = MinMaxScaler()
rescaled = minmax.fit_transform(df[['Age', 'Salary', 'Marks']])

df_minmax = df.copy()
df_minmax[['Age', 'Salary', 'Marks']] = rescaled

df_minmax

Unnamed: 0,Age,Salary,Marks
0,0.0,0.0,0.0
1,0.142857,0.155556,0.2
2,0.285714,0.444444,0.6
3,0.428571,0.733333,0.8
4,0.571429,1.0,1.0
5,0.714286,0.777778,0.36
6,0.857143,0.6,0.6
7,1.0,0.288889,0.92


## Normalization

In [21]:
from sklearn.preprocessing import Normalizer

In [22]:
# Unlike the scalers above, Normalizer works row by row: each sample is
# divided by its own L2 norm so that every row has unit length. Because
# Salary dwarfs the other two features, it ends up close to 1 in every row.
normalizer = Normalizer(norm='l2')
unit_rows = normalizer.fit_transform(df[['Age', 'Salary', 'Marks']])

df_normalized = df.copy()
df_normalized[['Age', 'Salary', 'Marks']] = unit_rows

df_normalized

Unnamed: 0,Age,Salary,Marks
0,0.0012,0.99999,0.004333
1,0.000864,0.999995,0.003182
2,0.000571,0.999997,0.002286
3,0.000437,0.999998,0.001771
4,0.000367,0.999999,0.0015
5,0.00046,0.999999,0.00148
6,0.000571,0.999998,0.001905
7,0.000893,0.999995,0.003143


## Encoding categorical features

In [27]:
# Small frame with one ordered category (Education_Level) and one unordered
# category (Department) for the encoding demos.
df = pd.DataFrame(dict(
    Student_ID=list(range(1, 6)),
    Education_Level=['High School', 'Bachelor', 'Master', 'PhD', 'Bachelor'],
    Department=['CS', 'IT', 'CS', 'Mechanical', 'IT'],
))

df

Unnamed: 0,Student_ID,Education_Level,Department
0,1,High School,CS
1,2,Bachelor,IT
2,3,Master,CS
3,4,PhD,Mechanical
4,5,Bachelor,IT


### Ordinal Encoding
Used when categories have an order / ranking  
Only use when order matters

In [26]:
from sklearn.preprocessing import OrdinalEncoder

# Spell out the ranking explicitly so the codes follow the real order
# (High School -> 0, ..., PhD -> 3).
education_order = ['High School', 'Bachelor', 'Master', 'PhD']
ordinal_encoder = OrdinalEncoder(categories=[education_order])

education_codes = ordinal_encoder.fit_transform(df[['Education_Level']])

# Store the encoded column in a copy so the original labels stay available.
df_ordinal = df.copy()
df_ordinal[['Education_Level']] = education_codes

df_ordinal

Unnamed: 0,Student_ID,Education_Level,Department
0,1,0.0,CS
1,2,1.0,IT
2,3,2.0,CS
3,4,3.0,Mechanical
4,5,1.0,IT


### One Hot Encoding
Used when NO natural order exists.

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [32]:
# Create the encoder in this cell. The notebook never defined `onehot`
# elsewhere, so on a fresh kernel (Restart & Run All) the cell raised
# NameError.
onehot = OneHotEncoder()

# With default settings fit_transform returns a SciPy sparse matrix, so
# .toarray() densifies it before it is wrapped in a DataFrame.
encoded = onehot.fit_transform(df[['Department']]).toarray()

# get_feature_names_out yields one "Department_<category>" label per
# indicator column.
encoded_df = pd.DataFrame(
    encoded,
    columns=onehot.get_feature_names_out(['Department'])
)

encoded_df

Unnamed: 0,Department_CS,Department_IT,Department_Mechanical
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
