In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/HypnoticShield/ML/refs/heads/main/pre-ownedcars.csv')

In [4]:
missing_values_count = df.isnull().sum()
print("Missing Values in Each Column:")
print(missing_values_count)

Missing Values in Each Column:
brand                     1
model                     1
transmission              1
make_year                 1
reg_year               2086
fuel_type                 1
engine_capacity(CC)     118
km_driven                 1
ownership                 1
price                     0
overall_cost              1
has_insurance             1
spare_key                 1
reg_number                1
title                     1
dtype: int64


In [5]:
most_missing = missing_values_count[missing_values_count == missing_values_count.max()]
print("\nColumns with the Most Missing Values:")
print(most_missing)


Columns with the Most Missing Values:
reg_year    2086
dtype: int64


In [6]:
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        mean_value = df[col].mean()
        df[col].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mean_value, inplace=True)


In [7]:
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        mode_value = df[col].mode()[0]
        df[col].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)
  df[col].fillna(mode_value, inplace=True)


In [8]:
print("\nDataset after Handling Missing Values:")
print(df.head())


Dataset after Handling Missing Values:
      brand                                     model transmission  make_year  \
0  Mahindra              Thar LX D 4WD MT CONVERTIBLE       Manual     2020.0   
1   Hyundai                         Verna 1.6 VTVT SX       Manual     2018.0   
2      Tata  Harrier XT PLUS 2.0L KRYOTEC DARK EDITON       Manual     2022.0   
3     Honda                     City 1.5L I-VTE V CVT    Automatic     2023.0   
4      Ford             Ecosport TITANIUM 1.5L DIESEL       Manual     2021.0   

     reg_year fuel_type  engine_capacity(CC)  km_driven  ownership    price  \
0  01-01-2021    Diesel               2184.0    11003.0  1st owner  1231000   
1  01-07-2018    Petrol               1591.0    66936.0  1st owner   786000   
2  01-08-2022    Diesel               1956.0    27990.0  1st owner  1489000   
3  01-04-2023    Petrol               1498.0     5061.0  1st owner  1227000   
4  01-07-2021    Diesel               1498.0    23480.0  1st owner   887000   

In [9]:
Q1 = df[numerical_cols].quantile(0.25)
Q3 = df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
outlier_condition = (df[numerical_cols] < (Q1 - 1.5 * IQR)) | (df[numerical_cols] > (Q3 + 1.5 * IQR))


In [10]:
for col in numerical_cols:
    if outlier_condition[col].any():
        median_value = df[col].median()
        df.loc[outlier_condition[col], col] = median_value

In [11]:
print("\nDataset after Handling Outliers:")
print(df.head())


Dataset after Handling Outliers:
      brand                                     model transmission  make_year  \
0  Mahindra              Thar LX D 4WD MT CONVERTIBLE       Manual     2020.0   
1   Hyundai                         Verna 1.6 VTVT SX       Manual     2018.0   
2      Tata  Harrier XT PLUS 2.0L KRYOTEC DARK EDITON       Manual     2022.0   
3     Honda                     City 1.5L I-VTE V CVT    Automatic     2023.0   
4      Ford             Ecosport TITANIUM 1.5L DIESEL       Manual     2021.0   

     reg_year fuel_type  engine_capacity(CC)  km_driven  ownership    price  \
0  01-01-2021    Diesel               1197.0    11003.0  1st owner  1231000   
1  01-07-2018    Petrol               1591.0    66936.0  1st owner   786000   
2  01-08-2022    Diesel               1197.0    27990.0  1st owner   594000   
3  01-04-2023    Petrol               1498.0     5061.0  1st owner  1227000   
4  01-07-2021    Diesel               1498.0    23480.0  1st owner   887000   

   o

In [12]:
print("\nFinal Cleaned Dataset:")
print(df)


Final Cleaned Dataset:
         brand                                     model transmission  \
0     Mahindra              Thar LX D 4WD MT CONVERTIBLE       Manual   
1      Hyundai                         Verna 1.6 VTVT SX       Manual   
2         Tata  Harrier XT PLUS 2.0L KRYOTEC DARK EDITON       Manual   
3        Honda                     City 1.5L I-VTE V CVT    Automatic   
4         Ford             Ecosport TITANIUM 1.5L DIESEL       Manual   
...        ...                                       ...          ...   
2801      Ford           Ecosport TREND + 1.5L PETROL AT    Automatic   
2802     Honda                        Jazz 1.2L I-VTEC V       Manual   
2803   Hyundai                       Elite i20 MAGNA 1.2       Manual   
2804    Maruti                   Baleno DELTA PETROL 1.2       Manual   
2805    Maruti                   Baleno DELTA PETROL 1.2       Manual   

        make_year    reg_year fuel_type  engine_capacity(CC)     km_driven  \
0     2020.000000  01

In [13]:
print("\nSummary of Preprocessing Steps:")
print(f"1. Identified missing values in columns: {missing_values_count.sum()} total missing values handled.")
print(f"2. Filled numerical columns with mean/median and categorical columns with mode.")
print(f"3. Detected and handled outliers in numerical columns using IQR method.")


Summary of Preprocessing Steps:
1. Identified missing values in columns: 2216 total missing values handled.
2. Filled numerical columns with mean/median and categorical columns with mode.
3. Detected and handled outliers in numerical columns using IQR method.
