In [21]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=5)

In [22]:
df = pd.read_csv("cleaned_df.csv")
df.head(2)

Unnamed: 0,Year,Brand,Model,Body Type,Drivetrain,Interior Colour,Exterior Colour,Fuel Type,Transmission Type,Doors,Cylinder Count,Fuel Efficiency (Lt/100kms),Distance (km),Mileage (kms/Lt),Years Used,Purchase Price,Current Market Price
0,2019,Acura,Other Model Type,SUV,AWD,Black,White,Premium Unleaded,Automatic,4,4.0,9.92,33032,10.1,5,40888,7126.32
1,2022,Acura,Other Model Type,SUV,AWD,Red,Black,Gas,Automatic,4,,11.16,31000,9.0,2,60899,13368.07


In [23]:
for col in df.select_dtypes(include='object'):
    unique_count = df[col].nunique()
    print(f'The number of unique values in the "{col}" column is: {unique_count}')
    print(df[col].value_counts())
    print("=================================================================================")

The number of unique values in the "Brand" column is: 21
Brand
Hyundai          1074
Ford             1068
Honda            1007
BMW               899
Volkswagen        776
Other Brand       734
Audi              709
Nissan            671
Mazda             626
Chevrolet         608
Mercedes-Benz     535
Jeep              482
Kia               402
Dodge             386
Porsche           317
GMC               295
Infiniti          252
Acura             245
Subaru            222
Cadillac          198
Chrysler           99
Name: count, dtype: int64
The number of unique values in the "Model" column is: 21
Model
Other Model Type    7186
Civic                397
Elantra              344
F-150                311
Grand                294
Santa                248
Q5                   239
Jetta                230
X5                   212
CR-V                 211
Mazda3               209
Silverado            205
Rogue                198
Sierra               189
Tucson               189
Escape     

## Label Encoding Multi-Class Columns using Label Encoder:

In [24]:
non_numeric_columns = df.select_dtypes(exclude=['number']).columns
non_numeric_columns_list = non_numeric_columns.tolist()
print(non_numeric_columns_list)

['Brand', 'Model', 'Body Type', 'Drivetrain', 'Interior Colour', 'Exterior Colour', 'Fuel Type', 'Transmission Type']


In [25]:
# Create a dictionary to store the mappings
mappings = {}

for col in non_numeric_columns_list:

    le.fit(df[col])
    # Store the original and encoded mappings
    mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))
    # Transform the column and assign it back to the DataFrame
    df[col] = le.transform(df[col])

# Function to reverse the encoding for a specific column
def reverse_mapping(col, value):
    return {v: k for k, v in mappings[col].items()}[value]


In [26]:
df

Unnamed: 0,Year,Brand,Model,Body Type,Drivetrain,Interior Colour,Exterior Colour,Fuel Type,Transmission Type,Doors,Cylinder Count,Fuel Efficiency (Lt/100kms),Distance (km),Mileage (kms/Lt),Years Used,Purchase Price,Current Market Price
0,2019,0,10,7,3,1,5,6,0,4,4.0,9.92,33032,10.1,5,40888,7126.32
1,2022,0,10,7,3,7,0,2,0,4,,11.16,31000,9.0,2,60899,13368.07
2,2020,0,10,7,3,1,0,6,0,4,4.0,10.31,27800,9.7,4,49099,10391.32
3,2019,0,10,8,3,8,0,6,0,4,6.0,9.74,34396,10.3,5,36499,6165.93
4,2020,0,10,7,3,1,0,2,0,4,,9.92,60892,10.1,4,38495,4344.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11600,2011,17,10,10,4,6,5,2,0,4,5.0,8.53,206835,11.7,13,11495,185.54
11601,2021,17,10,7,3,1,5,2,0,4,4.0,9.35,36519,10.7,3,46980,8584.23
11602,2022,17,10,10,3,6,5,2,0,4,4.0,9.59,8460,10.4,2,64980,31680.39
11603,2023,17,10,7,3,3,5,4,0,4,4.0,8.50,50,11.8,1,82479,77965.22


## Imputing Unknown values for the columns Cylinder Count having missing value % between 20-40 using KNN Imputer:

In [18]:
imputed_data = knn_imputer.fit_transform(df)
imputed_df = pd.DataFrame(imputed_data, columns=df.columns)
imputed_df['Cylinder Count'] = round(imputed_df['Cylinder Count'])
imputed_df.head()

Unnamed: 0,Year,Brand,Model,Body Type,Drivetrain,Interior Colour,Exterior Colour,Fuel Type,Transmission Type,Doors,Cylinder Count,Fuel Efficiency (Lt/100kms),Distance (km),Mileage (kms/Lt),Years Used,Purchase Price,Current Market Price
0,2019.0,0.0,10.0,7.0,3.0,1.0,5.0,6.0,0.0,4.0,4.0,9.92,33032.0,10.1,5.0,40888.0,7126.32
1,2022.0,0.0,10.0,7.0,3.0,7.0,0.0,2.0,0.0,4.0,6.0,11.16,31000.0,9.0,2.0,60899.0,13368.07
2,2020.0,0.0,10.0,7.0,3.0,1.0,0.0,6.0,0.0,4.0,4.0,10.31,27800.0,9.7,4.0,49099.0,10391.32
3,2019.0,0.0,10.0,8.0,3.0,8.0,0.0,6.0,0.0,4.0,6.0,9.74,34396.0,10.3,5.0,36499.0,6165.93
4,2020.0,0.0,10.0,7.0,3.0,1.0,0.0,2.0,0.0,4.0,4.0,9.92,60892.0,10.1,4.0,38495.0,4344.07


In [19]:
imputed_df.to_csv("transformed_df.csv",index=False)

In [42]:
# Apply the reverse mapping to the entire DataFrame
for col in non_numeric_columns_list:
    imputed_df[col] = imputed_df[col].apply(lambda x: reverse_mapping(col, x))


In [35]:
imputed_df.isna().sum()

Make                              0
Model                             0
Body Type                        28
Drivetrain                        0
Interior Colour                2508
Exterior Colour                   0
Fuel Type                         0
Transmission Type                 0
Doors                             0
Cylinder Count                    0
Fuel Efficiency (Lt/100kms)       0
Distance (km)                     0
Mileage (kms/Lt)                  0
Years Used                        0
Purchase Price                    0
Current Market Price              0
dtype: int64