In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def data_preprocessing_pipeline(data):
    #Firstly identifing numeric and categorical features form the dataset
    numeric_features=data.select_dtypes(include=['float','int']).columns
    categorical_features=data.select_dtypes(include=['object']).columns
    #Handle missing values in numeric features
    data[numeric_features]=data[numeric_features].fillna(data[numeric_features].mean())
    #Detecting and handling outliers in numeric features using  Interquartile Range (IQR) method.
    for feature in numeric_features:
        Q1=data[feature].quantile(0.25)
        Q2=data[feature].quantile(0.75)
        IQR=Q2-Q1
        lower_bound=Q1-(1.5*IQR)
        upper_bound=Q2+(1.5*IQR)
        data[feature]=np.where((data[feature]<lower_bound)|(data[feature]>upper_bound),data[feature].mean(),data[feature])
        
        #Normalizing the  numeric features
        scaler=StandardScaler()
        scaled_data=scaler.fit_transform(data[numeric_features])
        data[numeric_features]=scaler.transform(data[numeric_features])
        
        #Handling missing values in categorical features
        data[categorical_features] = data[categorical_features].fillna(data[categorical_features].mode().iloc[0])
        
        
        return data
    


In [2]:
data=pd.read_csv("data.csv")
print("Original Data:",data)

Original Data:    NumericFeature1  NumericFeature2 CategoricalFeature
0              1.0                7                  A
1              2.0                8                  B
2              NaN                9                NaN
3              4.0               10                  A
4              5.0               11                  B
5              6.0               50                  C
6              7.0               40                  D
7              8.1               20                NaN
8              9.0                0                  B


In [3]:
# After performance of preprocessing
cleaned_data=data_preprocessing_pipeline(data)
print("Preprocessed Data:",cleaned_data)

Preprocessed Data:    NumericFeature1  NumericFeature2 CategoricalFeature
0    -1.707906e+00        -0.647150                  A
1    -1.307224e+00        -0.583841                  B
2     3.558769e-16        -0.520533                  B
3    -5.058607e-01        -0.457225                  A
4    -1.051790e-01        -0.393917                  B
5     2.955028e-01         2.075099                  C
6     6.961845e-01         1.442018                  D
7     1.136934e+00         0.175856                  B
8     1.497548e+00        -1.090306                  B
