In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Toy dataset: five records, four columns, with one missing entry (NaN)
# placed in every column so that the imputation steps have work to do.
data = dict(
    Age=[25, 30, np.nan, 35, 40],
    Salary=[50000, 54000, np.nan, 60000, 62000],
    Country=['USA', 'Canada', np.nan, 'USA', 'Canada'],
    Purchased=['No', 'Yes', 'No', 'Yes', np.nan],
)

In [4]:
# Turn the column-oriented dict into a DataFrame (one column per key)
# and show it before any preprocessing is applied.
d = pd.DataFrame(data=data)
print("Original DataFrame:\n", d)

Original DataFrame:
     Age   Salary Country Purchased
0  25.0  50000.0     USA        No
1  30.0  54000.0  Canada       Yes
2   NaN      NaN     NaN        No
3  35.0  60000.0     USA       Yes
4  40.0  62000.0  Canada       NaN


In [5]:
# Missing values, step 1: numeric columns.
# Each NaN in 'Age' / 'Salary' is replaced by the mean of its own column.
numeric_cols = ['Age', 'Salary']
imputer = SimpleImputer(strategy='mean')
d[numeric_cols] = imputer.fit_transform(d[numeric_cols])


In [6]:
# Missing values, step 2: categorical columns.
# Each NaN in 'Country' / 'Purchased' is replaced by that column's mode.
# Note: `imputer` is rebound here, so the mean imputer is no longer reachable.
categorical_cols = ['Country', 'Purchased']
imputer = SimpleImputer(strategy='most_frequent')
d[categorical_cols] = imputer.fit_transform(d[categorical_cols])


In [7]:
# Show the frame after both imputation passes.
print(
    "\nDataFrame after handling missing values:\n",
    d,
)


DataFrame after handling missing values:
     Age   Salary Country Purchased
0  25.0  50000.0     USA        No
1  30.0  54000.0  Canada       Yes
2  32.5  56500.0  Canada        No
3  35.0  60000.0     USA       Yes
4  40.0  62000.0  Canada        No


In [8]:
# Categorical data, step 1: label-encode 'Purchased'.
# The string labels are swapped for integer codes in place
# (the printed result shows 'No' -> 0 and 'Yes' -> 1).
label_encoder = LabelEncoder()
label_encoder.fit(d['Purchased'])
d['Purchased'] = label_encoder.transform(d['Purchased'])

In [9]:
# Categorical data, step 2: one-hot encode 'Country'.
# drop='first' omits the first category's indicator column so the remaining
# columns are not perfectly collinear.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and
# 1.4 removed the old spelling, so try the current name first and fall back
# to the legacy one on older installations.
try:
    one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(drop='first', sparse=False)
# Dense 2-D array with one row per record in `d`.
country_encoded = one_hot_encoder.fit_transform(d[['Country']])




In [10]:
# Wrap the encoded array in a DataFrame whose column names come from the
# fitted encoder. The frame reuses d's index on purpose: pd.concat(axis=1)
# aligns rows by index label, so a fresh 0..n-1 index would misalign rows
# (and introduce NaNs) whenever `d` does not carry the default index.
country_df = pd.DataFrame(
    country_encoded,
    columns=one_hot_encoder.get_feature_names_out(['Country']),
    index=d.index,
)
# Append the indicator columns, then drop the original text column they replace.
df = pd.concat([d, country_df], axis=1).drop(columns=['Country'])


In [11]:
# Show the fully preprocessed frame (imputed and encoded).
print(
    "\nDataFrame after handling categorical data:\n",
    df,
)



DataFrame after handling categorical data:
     Age   Salary  Purchased  Country_USA
0  25.0  50000.0          0          1.0
1  30.0  54000.0          1          0.0
2  32.5  56500.0          0          0.0
3  35.0  60000.0          1          1.0
4  40.0  62000.0          0          0.0
