## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
def data_preprocessing(data):
    """Return a cleaned, standardized copy of ``data``.

    The steps are applied in this order:

    1. Numeric columns (float/int dtypes): missing values are filled with
       the column mean.
    2. Numeric columns: values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are
       replaced with the column mean (the mean is computed with the outlier
       still included).
    3. Numeric columns: standardized with ``StandardScaler``.
    4. Categorical (object dtype) columns: missing values are filled with
       the column's first mode.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is NOT modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data``. Numeric
        columns come back as floats.
    """
    # Work on a copy so the caller's frame stays intact and calling this
    # function twice on the same input gives the same result.
    data = data.copy()

    # Identify numeric and categorical features
    numbers = data.select_dtypes(include=['float', 'int']).columns
    variables = data.select_dtypes(include=['object']).columns

    if len(numbers) > 0:
        # Handle missing values in numeric features
        data[numbers] = data[numbers].fillna(data[numbers].mean())

        # Detect and handle outliers in numeric features using IQR
        for select in numbers:
            column = data[select]
            Q1 = column.quantile(0.25)
            Q3 = column.quantile(0.75)
            IQR = Q3 - Q1
            lower_limit = Q1 - (1.5 * IQR)
            upper_limit = Q3 + (1.5 * IQR)
            is_outlier = (column < lower_limit) | (column > upper_limit)
            data[select] = np.where(is_outlier, column.mean(), column)

        # Normalize numeric features. A single fit_transform is enough; the
        # scaler rejects empty input, hence the row-count guard.
        if len(data) > 0:
            data[numbers] = StandardScaler().fit_transform(data[numbers])

    # Handle missing values in categorical features. Done per column so a
    # frame without object columns, or a column that is entirely missing
    # (no mode exists), is skipped instead of raising.
    for select in variables:
        modes = data[select].mode()
        if not modes.empty:
            data[select] = data[select].fillna(modes.iloc[0])

    return data

In [3]:
# Load the raw dataset from disk and show it before any preprocessing.
data = pd.read_csv("pre_data.csv")

# A single display call renders the caption and the frame, in that order.
display("Original Data:", data)

'Original Data:'

Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,1.0,7,A
1,2.0,8,B
2,,9,
3,4.0,10,A
4,5.0,11,B
5,6.0,50,C


## Preprocessing steps

In [4]:
# Run the preprocessing pipeline on the loaded frame and show the result.
ready_to_use_data = data_preprocessing(data)

# A single display call renders the caption and the frame, in that order.
display("Preprocessed Data:", ready_to_use_data)

'Preprocessed Data:'

Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,-1.535624,-1.09937,A
1,-0.944999,-0.749128,B
2,0.0,-0.398886,A
3,0.23625,-0.048645,A
4,0.826874,0.301597,B
5,1.417499,1.994431,C
