# **Data Preprocessing : Handling Missing Values**


In [2]:
import pandas as pd
import numpy as np

data = {
    'A': [1,2,None,4],
    'B': [None,2,3,4]
}

df = pd.DataFrame(data)

df['A'].fillna(df['A'].mean(), inplace=True)
df['B'].fillna(df['B'].mean(), inplace=True)

print('Data Frame after Handling missing values')
print(df)

Data Frame after Handling missing values
          A    B
0  1.000000  3.0
1  2.000000  2.0
2  2.333333  3.0
3  4.000000  4.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['A'].fillna(df['A'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['B'].fillna(df['B'].mean(), inplace=True)


# **Data Preprocessing : Handling Outliers**


In [5]:
import pandas as pd

data = {
    'A': [1,2,None,4]
}

df = pd.DataFrame(data)
print(df)

Q1 = df['A'].quantile(0.25)
Q3 = df['A'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_no_outliers = df[(df['A']>= lower_bound ) & (df['A']<= upper_bound)]

print("DataFrame after handling outliers:")
print(df_no_outliers)

     A
0  1.0
1  2.0
2  NaN
3  4.0
DataFrame after handling outliers:
     A
0  1.0
1  2.0
3  4.0


# **Data Preprocessing : Normalization**


In [6]:
#Normalization

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

data = {
    'A': [1,2,3,4],
    'B': [10,20,30,40]
}

df = pd.DataFrame(data)

scaler = MinMaxScaler()
df_normalized = scaler.fit_transform(df)

df_normalized = pd.DataFrame(df_normalized, columns=df.columns)

print("DataFrame after normalization:")
print(df_normalized)

DataFrame after normalization:
          A         B
0  0.000000  0.000000
1  0.333333  0.333333
2  0.666667  0.666667
3  1.000000  1.000000


# **Data Preprocessing : Scaling**


In [7]:
#Scaling

import pandas as pd
from sklearn.preprocessing import StandardScaler

data = {
    'A': [1,2,3,4],
    'B': [10,20,30,40]
}

df = pd.DataFrame(data)

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

df_scaled = pd.DataFrame(df_scaled, columns=df.columns)

print("DataFrame after scaling:")
print(df_scaled)


DataFrame after scaling:
          A         B
0 -1.341641 -1.341641
1 -0.447214 -0.447214
2  0.447214  0.447214
3  1.341641  1.341641
