In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy import stats


In [9]:
def createdata():
    data = {
        'Age': np.random.randint(18, 70, size=20),
        'Salary': np.random.randint(30000, 120000, size=20),
        'Purchased': np.random.choice([0, 1], size=20),
        'Gender': np.random.choice(['Male', 'Female'], size=20),
        'City': np.random.choice(['New York', 'San Francisco', 'Los Angeles'], size=20)
    }
    return pd.DataFrame(data)

df = createdata()
df.head()


Unnamed: 0,Age,Salary,Purchased,Gender,City
0,68,106451,1,Female,San Francisco
1,19,35958,0,Female,New York
2,20,119589,0,Female,San Francisco
3,46,106389,0,Male,New York
4,35,81143,0,Female,New York


In [10]:
# Introduce missing values for demonstration
df.loc[5, 'Age'] = np.nan
df.loc[10, 'Salary'] = np.nan

# Check missing values
missing_values = df.isnull().sum()
missing_values[missing_values > 0]


Unnamed: 0,0
Age,1
Salary,1


In [12]:
df_copy = df.copy()

imputer_age = SimpleImputer(strategy='median')
imputer_salary = SimpleImputer(strategy='mean')

df_copy[['Age']] = imputer_age.fit_transform(df_copy[['Age']])
df_copy[['Salary']] = imputer_salary.fit_transform(df_copy[['Salary']])

df_copy.isnull().sum()


Unnamed: 0,0
Age,0
Salary,0
Purchased,0
Gender,0
City,0


In [13]:
ordinal_encoder = OrdinalEncoder(categories=[['Male', 'Female']])
df_copy['Gender_Encoded'] = ordinal_encoder.fit_transform(df_copy[['Gender']])


In [15]:
onehot_encoder = OneHotEncoder(sparse_output=False)
city_encoded = onehot_encoder.fit_transform(df_copy[['City']])

city_df = pd.DataFrame(
    city_encoded,
    columns=onehot_encoder.get_feature_names_out(['City'])
)

df_encoded = pd.concat([df_copy, city_df], axis=1)
df_encoded.drop(['Gender', 'City'], axis=1, inplace=True)

df_encoded.head()

Unnamed: 0,Age,Salary,Purchased,Gender_Encoded,City_Los Angeles,City_New York,City_San Francisco
0,68.0,106451.0,1,1.0,0.0,0.0,1.0
1,19.0,35958.0,0,1.0,0.0,1.0,0.0
2,20.0,119589.0,0,1.0,0.0,0.0,1.0
3,46.0,106389.0,0,0.0,0.0,1.0,0.0
4,35.0,81143.0,0,1.0,0.0,1.0,0.0


In [16]:
minmax_scaler = MinMaxScaler()
df_encoded[['Salary']] = minmax_scaler.fit_transform(df_encoded[['Salary']])
df_encoded.head()


Unnamed: 0,Age,Salary,Purchased,Gender_Encoded,City_Los Angeles,City_New York,City_San Francisco
0,68.0,0.85073,1,1.0,0.0,0.0,1.0
1,19.0,0.04981,0,1.0,0.0,1.0,0.0
2,20.0,1.0,0,1.0,0.0,0.0,1.0
3,46.0,0.850026,0,0.0,0.0,1.0,0.0
4,35.0,0.563188,0,1.0,0.0,1.0,0.0


In [17]:
standard_scaler = StandardScaler()
df_encoded[['Age']] = standard_scaler.fit_transform(df_encoded[['Age']])
df_encoded.head()


Unnamed: 0,Age,Salary,Purchased,Gender_Encoded,City_Los Angeles,City_New York,City_San Francisco
0,1.627021,0.85073,1,1.0,0.0,0.0,1.0
1,-1.433402,0.04981,0,1.0,0.0,1.0,0.0
2,-1.370945,1.0,0,1.0,0.0,0.0,1.0
3,0.252953,0.850026,0,0.0,0.0,1.0,0.0
4,-0.43408,0.563188,0,1.0,0.0,1.0,0.0
