In [60]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
# Single shared encoder instance; it is re-fitted once per categorical column
# in the label-encoding loop further down, so it only ever remembers the
# classes of the most recently fitted column.
le = LabelEncoder()

from sklearn.impute import KNNImputer
# Imputer configured to use 5 neighbours (n_neighbors=5) when filling a
# missing value; used in the imputation cell near the end of the notebook.
knn_imputer = KNNImputer(n_neighbors=5)

In [61]:
# Load the cleaned dataset (path is relative to the notebook's working
# directory) and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="cleaned_df.csv")
df.head(n=2)

Unnamed: 0,Year,Brand,Model,Body Type,Drivetrain,Interior Colour,Exterior Colour,Fuel Type,Transmission Type,Doors,Cylinder Count,Fuel Efficiency (Lt/100kms),Distance (km),Mileage (kms/Lt),Years Used,Price
0,2019,Acura,Other Model Type,SUV,AWD,Red,Black,Gas,Automatic,,6.0,10.87,53052,9.2,5,43880
1,2018,Acura,Other Model Type,SUV,AWD,Black,Other Colour,Gas,Automatic,,6.0,10.98,77127,9.1,6,36486


In [62]:
# Profile every object-dtype (text) column: print how many distinct values it
# holds, followed by the frequency of each value.
text_columns = df.select_dtypes(include='object').columns
for col in text_columns:
    column_values = df[col]
    unique_count = column_values.nunique()
    print(f'The number of unique values in the "{col}" column is: {unique_count}')
    print(column_values.value_counts())
    print("=================================================================================")

The number of unique values in the "Brand" column is: 21
Brand
Honda            1283
Ford             1231
Hyundai          1223
BMW              1099
Nissan            952
Other Brand       883
Volkswagen        866
Jeep              866
Audi              783
Mazda             747
Mercedes-Benz     723
Chevrolet         713
Dodge             500
Kia               497
Porsche           431
Infiniti          393
GMC               347
Subaru            299
Acura             295
Cadillac          216
Chrysler          155
Name: count, dtype: int64
The number of unique values in the "Model" column is: 21
Model
Other Model Type    9001
Civic                516
Grand                424
Elantra              381
F-150                362
Wrangler             291
Santa                287
Rogue                284
X5                   272
CR-V                 267
Q5                   265
Jetta                262
Mazda3               247
Silverado            245
Sierra               221
Tucson     

## Dataset Transformation:

### 1. Converting values in all the categorical columns to lowercase:

In [63]:
def convert_columns_to_lowercase(df):
    """Lower-case every string value in the object-dtype columns of ``df``.

    Non-string entries (for example NaN or numbers stored in an object
    column) are left exactly as they are. The frame is modified in place and
    the same object is returned, so the result can be re-assigned or chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be normalised.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its text values lower-cased.
    """
    def _lower_if_text(value):
        # Only real strings have a meaningful lower-case form; anything else
        # (missing values, numbers) passes through untouched.
        if isinstance(value, str):
            return value.lower()
        return value

    for name in df.select_dtypes(include='object').columns:
        df[name] = df[name].apply(_lower_if_text)
    return df

# Normalise the casing of all text columns, then preview the first five rows.
df = df.pipe(convert_columns_to_lowercase)
df.head(5)


Unnamed: 0,Year,Brand,Model,Body Type,Drivetrain,Interior Colour,Exterior Colour,Fuel Type,Transmission Type,Doors,Cylinder Count,Fuel Efficiency (Lt/100kms),Distance (km),Mileage (kms/Lt),Years Used,Price
0,2019,acura,other model type,suv,awd,red,black,gas,automatic,,6.0,10.87,53052,9.2,5,43880
1,2018,acura,other model type,suv,awd,black,other colour,gas,automatic,,6.0,10.98,77127,9.1,6,36486
2,2019,acura,other model type,suv,awd,black,white,premium unleaded,automatic,4.0,4.0,9.92,33032,10.1,5,40888
3,2020,acura,other model type,suv,awd,black,white,gas,other,,4.0,9.92,50702,10.1,4,44599
4,2021,acura,other model type,suv,awd,red,blue,gas,other,,4.0,10.31,67950,9.7,3,46989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14497,2011,other brand,other model type,wagon,fwd,other colour,white,gas,automatic,4.0,5.0,8.53,206835,11.7,13,11495
14498,2021,other brand,other model type,suv,awd,black,white,gas,automatic,4.0,4.0,9.35,36519,10.7,3,46980
14499,2022,other brand,other model type,wagon,awd,other colour,white,gas,automatic,4.0,4.0,9.59,8460,10.4,2,64980
14500,2023,other brand,other model type,suv,awd,charcoal,white,gasoline hybrid,automatic,4.0,4.0,8.50,50,11.8,1,82479


### 2. Encoding Multi-Class Categorical Columns using Label Encoder:

In [64]:
# Collect the names of all columns that are not numeric; these are the ones
# that get label-encoded in the next cell.
non_numeric_columns = df.select_dtypes(exclude=['number']).columns
non_numeric_columns_list = list(non_numeric_columns)
print(non_numeric_columns_list)

['Brand', 'Model', 'Body Type', 'Drivetrain', 'Interior Colour', 'Exterior Colour', 'Fuel Type', 'Transmission Type']


In [65]:
# Create a dictionary to store the mappings:
#   {column name -> {original label -> integer code}}
mappings = {}

for col in non_numeric_columns_list:
    # Fit on the observed labels only. Fitting on the raw column would hand
    # NaN to the encoder, which (depending on the scikit-learn version) either
    # raises on the mixed str/float values or treats NaN as one more class.
    # In the latter case missing entries (e.g. in 'Drivetrain') receive an
    # ordinary integer code and the KNN imputer no longer sees them as missing.
    le.fit(df[col].dropna())
    # Store the original and encoded mappings
    mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))
    # Encode through the stored mapping and assign it back to the DataFrame.
    # Series.map yields NaN for values that are absent from the dict, so
    # labels become their integer codes while missing entries stay NaN and
    # remain available for imputation.
    df[col] = df[col].map(mappings[col])

# Function to reverse the encoding for a specific column
def reverse_mapping(col, value):
    """Return the original label that was encoded as ``value`` in column ``col``.

    Looks the column up in the module-level ``mappings`` dictionary
    (label -> code) and inverts it. Raises ``KeyError`` when ``col`` has no
    stored mapping or when ``value`` is not one of its codes.
    """
    code_to_label = {}
    for label, code in mappings[col].items():
        code_to_label[code] = label
    return code_to_label[value]


### 3. Imputing missing values in the 'Cylinder Count', 'Drivetrain' & 'Doors' columns (each with 0-40% of values missing) using KNN Imputer:

In [69]:
# Run the KNN imputer over the whole (now fully numeric) frame. fit_transform
# returns a plain array, so the column labels are re-attached afterwards.
imputed_data = knn_imputer.fit_transform(df)
imputed_df = pd.DataFrame(imputed_data, columns=df.columns)

# Imputed entries can be fractional; snap the discrete columns back to whole
# numbers so they line up with the existing codes / counts.
for discrete_col in ('Cylinder Count', 'Drivetrain', 'Doors'):
    imputed_df[discrete_col] = imputed_df[discrete_col].round()
imputed_df.head()

Unnamed: 0,Year,Brand,Model,Body Type,Drivetrain,Interior Colour,Exterior Colour,Fuel Type,Transmission Type,Doors,Cylinder Count,Fuel Efficiency (Lt/100kms),Distance (km),Mileage (kms/Lt),Years Used,Price
0,2019.0,0.0,10.0,8.0,3.0,7.0,0.0,2.0,0.0,4.0,6.0,10.87,53052.0,9.2,5.0,43880.0
1,2018.0,0.0,10.0,8.0,3.0,1.0,3.0,2.0,0.0,3.0,6.0,10.98,77127.0,9.1,6.0,36486.0
2,2019.0,0.0,10.0,8.0,3.0,1.0,5.0,8.0,0.0,4.0,4.0,9.92,33032.0,10.1,5.0,40888.0
3,2020.0,0.0,10.0,8.0,3.0,1.0,5.0,2.0,2.0,4.0,4.0,9.92,50702.0,10.1,4.0,44599.0
4,2021.0,0.0,10.0,8.0,3.0,7.0,1.0,2.0,2.0,4.0,4.0,10.31,67950.0,9.7,3.0,46989.0


In [84]:
# Persist the imputed dataset without the index column.
# NOTE(review): the destination is an absolute, machine-specific path; the
# notebook will only run end-to-end where this drive/folder exists.
imputed_df.to_csv(
    path_or_buf=r"P:\Personal Projects\Vehicle Price Prediction\Vehicle Price Prediction\Datasets\2. imputed_df.csv",
    index=False,
)