## 5. Data Imputation in the Retail Sales Dataset
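   - Task: Handle missing values in the Retail Sales dataset using advanced imputation techniques such as KNN imputation and MICE (Multivariate Imputation by Chained Equations).
   - Dataset: Retail Sales Dataset (Big Mart Sales training file, `train_v9rqX0R.csv`)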


In [None]:
import pandas as pd

# Load the Big Mart sales training file from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/big-mart-sales/train_v9rqX0R.csv',
)
# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
# Count the missing entries in every column so we know what needs imputing.
missing_values = df.isna().sum()
print(missing_values)


In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(subset=None, keep='first', inplace=True)


In [None]:
# Drop the Item_Identifier column in place so it is not fed to the later steps.
df.drop(labels=['Item_Identifier'], axis='columns', inplace=True)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def detect_outliers(df, col):
    """Return the rows of ``df`` whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where ``Q1`` and ``Q3`` are the 25th
    and 75th percentiles of ``df[col]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect; it is not modified.
        col: Name of the numeric column to test.

    Returns:
        The subset of ``df`` (original index kept) holding the outlier rows.
    """
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence_width = 1.5 * (q3 - q1)
    too_low = values < q1 - fence_width
    too_high = values > q3 + fence_width
    return df[too_low | too_high]

# Numeric columns screened for outliers; later cells reuse this list for
# capping and scaling.
numerical_cols = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Item_Outlet_Sales',
]
# Report how many rows fall outside the 1.5 * IQR fences for each column.
for col in numerical_cols:
    outliers = detect_outliers(df, col)
    print(f"{col} has {len(outliers)} outliers")

# Outliers can be treated in several ways; capping is used here for simplicity.
def cap_outliers(df, col):
    """Clamp ``df[col]`` to its IQR fences, modifying ``df`` in place.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower fence and values
    above ``Q3 + 1.5 * IQR`` by that upper fence; everything else (including
    missing values, which fail both comparisons) is left as is.

    Args:
        df: DataFrame whose column is overwritten.
        col: Name of the numeric column to cap.

    Returns:
        None. The capped values are written back to ``df[col]``.
    """
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence_width = 1.5 * (q3 - q1)
    floor, ceiling = q1 - fence_width, q3 + fence_width
    # The lower fence never exceeds the upper one, so a value can match at most
    # one branch and a single nested selection covers both caps.
    df[col] = np.where(
        values < floor,
        floor,
        np.where(values > ceiling, ceiling, values),
    )

# Cap every column listed in numerical_cols; cap_outliers overwrites df[col]
# in place, so df itself is changed by this loop.
for col in numerical_cols:
    cap_outliers(df, col)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns: learn the scaling parameters from them,
# then overwrite the columns with the scaled values.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])


In [None]:
# Collapse the alternative spellings of the fat-content labels into the two
# canonical values before encoding.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(
    {
        'low fat': 'Low Fat',
        'LF': 'Low Fat',
        'reg': 'Regular',
    }
)
# One-hot encode the categorical columns so the frame is fully numeric.
# NOTE(review): get_dummies defaults to dummy_na=False, so a row whose category
# is missing becomes all zeros across that category's indicator columns and no
# longer looks "missing" to the imputers below. If any of these columns (e.g.
# Outlet_Size) has missing values in the isnull summary, confirm this is the
# intended treatment.
df = pd.get_dummies(
    df,
    columns=[
        'Item_Fat_Content',
        'Item_Type',
        'Outlet_Identifier',
        'Outlet_Size',
        'Outlet_Location_Type',
        'Outlet_Type',
    ],
)


## KNN Imputer

In [None]:
# KNN Imputer
# Imported in this cell because no earlier cell brings KNNImputer into scope;
# without it the cell fails with a NameError.
from sklearn.impute import KNNImputer

# Each missing entry is estimated from the 5 most similar rows.
knn_imputer = KNNImputer(n_neighbors=5)
# Include all columns for imputation; work on a copy so df stays untouched.
df_knn_imputed = df.copy()
# fit_transform returns a bare array, so restore both the column labels and
# df's row index. Without index=..., rows would be renumbered from 0 and would
# no longer line up with df whenever duplicates were dropped earlier.
df_knn_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df_knn_imputed),
    columns=df.columns,
    index=df.index,
)


## MICE Imputer

In [None]:
# MICE Imputer
# IterativeImputer is an experimental scikit-learn estimator: the
# enable_iterative_imputer import must run first, otherwise importing it from
# sklearn.impute fails. Neither import exists in an earlier cell.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# random_state is fixed so repeated runs give the same imputed values.
mice_imputer = IterativeImputer(max_iter=10, random_state=0)
# Include all columns for imputation; work on a copy so df stays untouched.
df_mice_imputed = df.copy()
# fit_transform returns a bare array, so restore both the column labels and
# df's row index. Without index=..., rows would be renumbered from 0 and would
# no longer line up with df whenever duplicates were dropped earlier.
df_mice_imputed = pd.DataFrame(
    mice_imputer.fit_transform(df_mice_imputed),
    columns=df.columns,
    index=df.index,
)


In [None]:
# Preview the first five rows of the KNN-imputed frame.
df_knn_imputed.head(n=5)


In [None]:
# Preview the first five rows of the MICE-imputed frame.
df_mice_imputed.head(n=5)
