# Task 4: Combine Multiple Preprocessing Techniques

## Step 1: Import the libraries

In [30]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from scipy.stats.mstats import winsorize

## Step 2: Load the Iris dataset

In [33]:
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

## Step 3: Handle Missing Values (simulating missing values)

In [36]:
# Simulate some missing values for demonstration
df.loc[::20, 'sepal length (cm)'] = np.nan
df.loc[::15, 'petal width (cm)'] = np.nan

# Fill missing values for numerical columns with mean
numeric_columns = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
for col in numeric_columns:
    df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


## Step 4: Detect and Handle Outliers (simulating outliers)

In [39]:
# Simulate outliers for demonstration
df.loc[df['sepal length (cm)'] > 7.5, 'sepal length (cm)'] *= 1.5  # Creating outliers

# Apply winsorization to cap extreme values in 'sepal length (cm)'
df['sepal length (cm)'] = winsorize(df['sepal length (cm)'], limits=[0.05, 0.05])

## Step 5: Normalize/Transform the Data

In [42]:
# Min-Max scaling of numerical columns
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Log transformation of 'petal length (cm)' to handle skewness
df['log_petal_length'] = np.log(df['petal length (cm)'] + 1)  # Adding 1 to handle zeros

## Step 6: Displaying Results

In [45]:
# Optionally, display or save the cleaned dataset
print(df.head())

# Save cleaned dataset to CSV
df.to_csv('cleaned_iris_dataset.csv', index=False)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0           0.462702          0.625000           0.067797          0.459821   
1           0.111111          0.416667           0.067797          0.041667   
2           0.037037          0.500000           0.050847          0.041667   
3           0.000000          0.458333           0.084746          0.041667   
4           0.148148          0.666667           0.067797          0.041667   

   target  log_petal_length  
0       0          0.065597  
1       0          0.065597  
2       0          0.049597  
3       0          0.081346  
4       0          0.065597  


## Step 7: Documentation