In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# `Step 1: Load Raw Data`

In [None]:
# load raw data 
raw_data = Path('../data/raw/car_details.csv')
df = pd.read_csv(raw_data)
print(f'Data loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns')

Data loaded successfully with 8128 rows and 13 columns


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   float64
 9   engine         7907 non-null   float64
 10  max_power      7912 non-null   float64
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(4), int64(3), object(6)
memory usage: 825.6+ KB


# `Data Types Preprocessing`

In [None]:
# convert some categorical datatypes set as 'object' to 'category'
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()
for col in categorical_columns:
    print(f"- {col} : {df[col].nunique()}")

- name : 2058
- fuel : 4
- seller_type : 3
- transmission : 2
- owner : 5
- torque : 441


In [None]:
# convert these the data types of these columns to 'category'
columns_to_convert_category = ["fuel", "seller_type","transmission","owner"]
for col in columns_to_convert_category:
    if col in df.columns:
        df[col] = df[col].astype('category')

In [None]:
df['year']

0      1970-01-01 00:00:00.000002017
1      1970-01-01 00:00:00.000002017
2      1970-01-01 00:00:00.000002017
8      1970-01-01 00:00:00.000002009
9      1970-01-01 00:00:00.000002014
                    ...             
7901   1970-01-01 00:00:00.000002015
7902   1970-01-01 00:00:00.000002016
7903   1970-01-01 00:00:00.000002017
7904   1970-01-01 00:00:00.000002010
7905   1970-01-01 00:00:00.000002007
Name: year, Length: 6717, dtype: datetime64[ns]

In [6]:
# covert "year" column to datetime 
df['year'] = pd.to_datetime(df['year'])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   name           8128 non-null   object        
 1   year           8128 non-null   datetime64[ns]
 2   selling_price  8128 non-null   int64         
 3   km_driven      8128 non-null   int64         
 4   fuel           8128 non-null   category      
 5   seller_type    8128 non-null   category      
 6   transmission   8128 non-null   category      
 7   owner          8128 non-null   category      
 8   mileage        7907 non-null   float64       
 9   engine         7907 non-null   float64       
 10  max_power      7912 non-null   float64       
 11  torque         7906 non-null   object        
 12  seats          7907 non-null   float64       
dtypes: category(4), datetime64[ns](1), float64(4), int64(2), object(2)
memory usage: 604.0+ KB


# `Data Cleaning` 

In [8]:
# missing columns handling
missing_before = df.isnull().sum().sum()
print(f"Missing Before: {missing_before}")
df = df.dropna(how='any')
missing_after = df.isnull().sum().sum()
print(f"Missing After: {missing_after}")

Missing Before: 1101
Missing After: 0


In [9]:
n_duplicates = df.duplicated().sum()
print(f"Duplicates before: {n_duplicates}")
if n_duplicates:
    df = df.drop_duplicates()

print(f"Duplicates after: {df.duplicated().sum()}")

Duplicates before: 1189
Duplicates after: 0


# `Feature Engineering`

In [10]:
df['year'].dt.year.max()

np.int32(1970)

In [10]:
df['year'].dt.year.min()

np.int32(1970)

# `Feature Encoding`

# `Feature Scaling`

In [11]:
# Log transform
log_transform = ["selling_price","max_power","km_driven"]
for col in log_transform:
    if col in df.columns:
        df[col] = np.log1p(df[col])


In [12]:
# square transform
square_transform = ["engine"]
for col in square_transform:
    if col in df.columns:
        df[col] = np.log1p(df[col])

# `Feature Selection`