In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Create a dataset with 10 rows, including missing values and outliers
data = {
    'Age': [25, 30, np.nan, 35, 40, 22, 100, 28, 32, np.nan], # 100 is outlier
    'Salary': [50000, 60000, 55000, 65000, np.nan, 45000, 120000, 52000, 58000, 62000],
    'City': ['NY', 'LA', 'NY', 'Chicago', 'LA', 'NY', 'Chicago', 'LA', 'NY', 'Chicago'],
    'Purchased': ['No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes']
}
df = pd.DataFrame(data)
print("Original Data:")
print(df)

Original Data:
     Age    Salary     City Purchased
0   25.0   50000.0       NY        No
1   30.0   60000.0       LA       Yes
2    NaN   55000.0       NY        No
3   35.0   65000.0  Chicago       Yes
4   40.0       NaN       LA       Yes
5   22.0   45000.0       NY        No
6  100.0  120000.0  Chicago       Yes
7   28.0   52000.0       LA        No
8   32.0   58000.0       NY       Yes
9    NaN   62000.0  Chicago       Yes


In [2]:
# Handling Missing Values
imputer = SimpleImputer(strategy='mean')
df[['Age', 'Salary']] = imputer.fit_transform(df[['Age', 'Salary']])
print("\nAfter Imputation:")
print(df)


After Imputation:
     Age    Salary     City Purchased
0   25.0   50000.0       NY        No
1   30.0   60000.0       LA       Yes
2   39.0   55000.0       NY        No
3   35.0   65000.0  Chicago       Yes
4   40.0   63000.0       LA       Yes
5   22.0   45000.0       NY        No
6  100.0  120000.0  Chicago       Yes
7   28.0   52000.0       LA        No
8   32.0   58000.0       NY       Yes
9   39.0   62000.0  Chicago       Yes


In [3]:
# Outlier Detection (Z-Score)
from scipy import stats
z_scores = np.abs(stats.zscore(df[['Age', 'Salary']]))
print("\nZ-Scores:")
print(z_scores)
# Filter out outliers (Z > 2 for this small dataset)
df_clean = df[(z_scores < 2).all(axis=1)]
print("\nData after removing outliers:")
print(df_clean)


Z-Scores:
[[0.6618812  0.65278024]
 [0.42549506 0.15064159]
 [0.         0.40171091]
 [0.18910891 0.10042773]
 [0.04727723 0.        ]
 [0.80371289 0.90384956]
 [2.88391094 2.86219027]
 [0.52004951 0.55235251]
 [0.3309406  0.25106932]
 [0.         0.05021386]]

Data after removing outliers:
    Age   Salary     City Purchased
0  25.0  50000.0       NY        No
1  30.0  60000.0       LA       Yes
2  39.0  55000.0       NY        No
3  35.0  65000.0  Chicago       Yes
4  40.0  63000.0       LA       Yes
5  22.0  45000.0       NY        No
7  28.0  52000.0       LA        No
8  32.0  58000.0       NY       Yes
9  39.0  62000.0  Chicago       Yes


In [4]:
# Encoding Categorical Variables
df_encoded = pd.get_dummies(df, columns=['City'], drop_first=True)
print("\nAfter One-Hot Encoding:")
print(df_encoded)


After One-Hot Encoding:
     Age    Salary Purchased  City_LA  City_NY
0   25.0   50000.0        No    False     True
1   30.0   60000.0       Yes     True    False
2   39.0   55000.0        No    False     True
3   35.0   65000.0       Yes    False    False
4   40.0   63000.0       Yes     True    False
5   22.0   45000.0        No    False     True
6  100.0  120000.0       Yes    False    False
7   28.0   52000.0        No     True    False
8   32.0   58000.0       Yes    False     True
9   39.0   62000.0       Yes    False    False


In [5]:
# Scaling
scaler = StandardScaler()
df_encoded[['Age', 'Salary']] = scaler.fit_transform(df_encoded[['Age', 'Salary']])
print("\nAfter Scaling:")
print(df_encoded)


After Scaling:
        Age    Salary Purchased  City_LA  City_NY
0 -0.661881 -0.652780        No    False     True
1 -0.425495 -0.150642       Yes     True    False
2  0.000000 -0.401711        No    False     True
3 -0.189109  0.100428       Yes    False    False
4  0.047277  0.000000       Yes     True    False
5 -0.803713 -0.903850        No    False     True
6  2.883911  2.862190       Yes    False    False
7 -0.520050 -0.552353        No     True    False
8 -0.330941 -0.251069       Yes    False     True
9  0.000000 -0.050214       Yes    False    False
