In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# `Step 1: Load Raw Data`

In [None]:
# load raw data 
raw_data = Path('../data/raw/car_details.csv')
df = pd.read_csv(raw_data)
print(f'Data loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns')

Data loaded successfully with 8128 rows and 13 columns


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   float64
 9   engine         7907 non-null   float64
 10  max_power      7912 non-null   float64
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(4), int64(3), object(6)
memory usage: 825.6+ KB


# `Data Types Preprocessing`

In [None]:
# convert some categorical datatypes set as 'object' to 'category'
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()
for col in categorical_columns:
    print(f"- {col} : {df[col].nunique()}")

- name : 2058
- fuel : 4
- seller_type : 3
- transmission : 2
- owner : 5
- torque : 441


In [None]:
# convert these the data types of these columns to 'category'
columns_to_convert_category = ["fuel", "seller_type","transmission","owner"]
for col in columns_to_convert_category:
    if col in df.columns:
        df[col] = df[col].astype('category')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   name           8128 non-null   object  
 1   year           8128 non-null   int64   
 2   selling_price  8128 non-null   int64   
 3   km_driven      8128 non-null   int64   
 4   fuel           8128 non-null   category
 5   seller_type    8128 non-null   category
 6   transmission   8128 non-null   category
 7   owner          8128 non-null   category
 8   mileage        7907 non-null   float64 
 9   engine         7907 non-null   float64 
 10  max_power      7912 non-null   float64 
 11  torque         7906 non-null   object  
 12  seats          7907 non-null   float64 
dtypes: category(4), float64(4), int64(3), object(2)
memory usage: 604.0+ KB


# `Data Cleaning` 

In [None]:
# missing columns handling
missing_before = df.isnull().sum().sum()
print(f"Missing Before: {missing_before}")
df = df.dropna(how='any')
missing_after = df.isnull().sum().sum()
print(f"Missing After: {missing_after}")

Missing Before: 1101
Missing After: 0


In [8]:
n_duplicates = df.duplicated().sum()
print(f"Duplicates before: {n_duplicates}")
if n_duplicates:
    df = df.drop_duplicates()

print(f"Duplicates after: {df.duplicated().sum()}")

Duplicates before: 1189
Duplicates after: 0


In [14]:
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,age
0,Volvo XC90 T8 Excellence BSIV,2017,16.118096,10.308986,Petrol,Individual,Automatic,First Owner,42.00,44.373415,5.993961,640Nm@ 1740rpm,4.0,3
1,Mercedes-Benz S-Class S 350 CDI,2017,15.581952,10.757924,Diesel,Dealer,Automatic,First Owner,13.50,54.653454,5.645447,490Nm@ 1600rpm,5.0,3
2,Jeep Wrangler 2016-2019 3.6 4X4,2017,15.226498,9.741027,Petrol,Individual,Automatic,First Owner,9.50,60.033324,5.638355,347Nm@ 4300rpm,5.0,3
8,Mercedes-Benz E-Class E350 Petrol,2009,13.815512,11.289794,Petrol,Individual,Automatic,Third Owner,10.93,59.143892,5.609472,355Nm@ 4500rpm,5.0,11
9,Jaguar XF 3.0 Litre S Premium Luxury,2014,14.508658,10.839601,Diesel,Individual,Automatic,First Owner,14.74,54.708317,5.605434,600Nm@ 2000rpm,5.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7901,Maruti Omni MPI STD BSIV,2015,12.100718,10.126671,Petrol,Individual,Manual,Second Owner,16.80,28.213472,3.561046,59Nm@ 2500rpm,5.0,5
7902,Maruti Omni MPI STD BSIV,2016,12.388398,10.596660,Petrol,Individual,Manual,First Owner,16.80,28.213472,3.561046,59Nm@ 2500rpm,5.0,4
7903,Maruti Omni E MPI STD BS IV,2017,12.468441,9.105091,Petrol,Individual,Manual,First Owner,16.80,28.213472,3.561046,59Nm@ 2500rpm,8.0,3
7904,Maruti Omni LPG CARGO BSIII W IMMOBILISER,2010,11.695255,12.043560,LPG,Individual,Manual,Second Owner,10.90,28.213472,3.520461,57Nm@ 2500rpm,5.0,10


# `Feature Engineering`

In [9]:
# Log transform
log_transform = ["selling_price","max_power","km_driven"]
for col in log_transform:
    if col in df.columns:
        df[col] = np.log1p(df[col])


In [10]:
# sqrt transform
sqrt_transform = ['engine']
for col in sqrt_transform:
    if col in df.columns:
        df[col] = np.sqrt(df[col])

In [11]:
df["age"] = df['year'].max() - df['year']

# `Feature Encoding`

# `Feature Scaling`

# `Feature Selection`