In [1]:
import numpy as np
import pandas as pd

# Machine Learning Libraries Import
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Machine Learning Evaluation Matrix
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

import os
import warnings
warnings.filterwarnings('ignore')

### Problem Statement
    -  Preadict the `Price` of car using various features

In [3]:
file_path = r"D:\ml-algorithms\datasets\autos_dataset.csv"

In [4]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [10]:
### Replace '?' with NaN and convert that column in int datatype
df.replace('?',np.nan,inplace=True)
for col in df.columns:
    # Try to convert in numeric feature
    try:
        df[col] = pd.to_numeric(df[col])
    except:
        pass

In [None]:
# df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       199 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non-null

In [12]:
## FindOut Missing values in the dataset and fill with proper stastical method
df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [18]:
### We need to replace the missing values for that features
missing_count = df.isnull().sum()
missing_percent = (missing_count / len(df)) * 100
missing_info = pd.DataFrame({
    'Missing_Values':missing_count,
    'Percentage':missing_percent
})
print(missing_info[missing_info['Missing_Values']>0])

                   Missing_Values  Percentage
normalized-losses              41    20.00000
num-of-doors                    2     0.97561
bore                            4     1.95122
stroke                          4     1.95122
horsepower                      2     0.97561
peak-rpm                        2     0.97561
price                           4     1.95122


In [None]:
# We need to fill the missing values from that features
# First find out the `Skewness` in the datast and then fill the missing values

In [21]:
### Find the skewness for this features
missing_cols = ['normalized-losses', 'num-of-doors', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']

# Calculate the skewness 
for col in missing_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        skew_val = round(df[col].skew(),2)
        print(f"Skewness of {col}: {skew_val:.2f}")
    else:
        print(f"Skipping {col}: Not numeric column")

Skewness of normalized-losses: 0.77
Skipping num-of-doors: Not numeric column
Skewness of bore: 0.02
Skewness of stroke: -0.68
Skewness of horsepower: 1.39
Skewness of peak-rpm: 0.07
Skewness of price: 1.81


In [22]:
## `price` is our target column we simply drop the rows 
df.dropna(subset=['price'],inplace=True)
# This keeps all rows except those where price is missing.

In [None]:
#  normalized-losses	0.77	Median (moderately skewed)
#  num-of-doors	        N/A  	Mode (categorical)
#  bore	                0.02	Mean (approximately normal)
#  stroke	           -0.68	Median (moderately skewed)
#  horsepower	        1.39	Median (highly skewed)
#  peak-rpm	            0.07    Mean (approximately normal)

In [25]:
df['normalized-losses'].fillna(df['normalized-losses'].median())
df['bore'].fillna(df['bore'].mean())
df['stroke'].fillna(df['stroke'].median())
df['horsepower'].fillna(df['horsepower'].median())

0      111.0
1      111.0
2      154.0
3      102.0
4      115.0
       ...  
200    114.0
201    160.0
202    134.0
203    106.0
204    114.0
Name: horsepower, Length: 201, dtype: float64

In [28]:
## For categorical features
print(df['num-of-doors'].isnull().sum())

0


In [None]:
## Fill the missing valueus
df['num-of-doors'] = df['num-of-doors'].fillna(df['num-of-doors'].mode()[0])

In [29]:
### convert the categorical columns in numeric
cat_col = df.select_dtypes(include='object').columns
print(cat_col)

Index(['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
       'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
       'fuel-system'],
      dtype='object')


| **Encoding Type**    | **Use When**                               | **Example**                 | **How to Apply**                              |
| -------------------- | ------------------------------------------ | --------------------------- | --------------------------------------------- |
| **Manual Mapping**   | When categories are numeric in nature      | `num-of-doors`: `'two'` → 2 | `.map({'two': 2, 'four': 4})`                 |
| **Label Encoding**   | When categories have **order (ordinal)**   | `size`: small < med < large | `.map({'small': 0, 'medium': 1, 'large': 2})` |
| **One-Hot Encoding** | When categories are **nominal (no order)** | `fuel-type`: gas, diesel    | `pd.get_dummies(..., drop_first=True)`        |


In [30]:
### Manually map the simple columns
df['num-of-doors'] = df['num-of-doors'].map({
    'two':2,
    'four':4
})

In [31]:
# Ordinal Encoding
df['num-of-cylinders'] = df['num-of-cylinders'].map({
    'two':2,
    'three':3,
    'four':4,
    'five':5,
    'six':6,
    'eight':8,
    'twelve':12
})

In [32]:
## for rest of the columns use One-hot encoding
df = pd.get_dummies(df,columns=['fuel-type', 'aspiration', 'body-style', 'drive-wheels', 'engine-location','fuel-system', 'make', 'engine-type'],drop_first=True)

In [33]:
df.head()

Unnamed: 0,symboling,normalized-losses,num-of-doors,wheel-base,length,width,height,curb-weight,num-of-cylinders,engine-size,...,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,engine-type_l,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,engine-type_rotor
0,3,115.0,2,88.6,168.8,64.1,48.8,2548,4,130,...,False,False,False,False,False,False,False,False,False,False
1,3,115.0,2,88.6,168.8,64.1,48.8,2548,4,130,...,False,False,False,False,False,False,False,False,False,False
2,1,115.0,2,94.5,171.2,65.5,52.4,2823,6,152,...,False,False,False,False,False,False,False,False,True,False
3,2,164.0,4,99.8,176.6,66.2,54.3,2337,4,109,...,False,False,False,False,False,False,True,False,False,False
4,2,164.0,4,99.4,176.6,66.4,54.3,2824,5,136,...,False,False,False,False,False,False,True,False,False,False


In [34]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
symboling,201.0,0.840796,1.254802,-2.0,0.0,1.0,2.0,3.0
normalized-losses,201.0,120.711443,32.111623,65.0,101.0,115.0,137.0,256.0
num-of-doors,201.0,3.144279,0.992008,2.0,2.0,4.0,4.0,4.0
wheel-base,201.0,98.797015,6.066366,86.6,94.5,97.0,102.4,120.9
length,201.0,174.200995,12.322175,141.1,166.8,173.2,183.5,208.1
width,201.0,65.889055,2.101471,60.3,64.1,65.5,66.6,72.0
height,201.0,53.766667,2.447822,47.8,52.0,54.1,55.5,59.8
curb-weight,201.0,2555.666667,517.296727,1488.0,2169.0,2414.0,2926.0,4066.0
num-of-cylinders,201.0,4.363184,1.059452,2.0,4.0,4.0,4.0,12.0
engine-size,201.0,126.875622,41.546834,61.0,98.0,120.0,141.0,326.0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 0 to 204
Data columns (total 60 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   symboling             201 non-null    int64  
 1   normalized-losses     201 non-null    float64
 2   num-of-doors          201 non-null    int64  
 3   wheel-base            201 non-null    float64
 4   length                201 non-null    float64
 5   width                 201 non-null    float64
 6   height                201 non-null    float64
 7   curb-weight           201 non-null    int64  
 8   num-of-cylinders      201 non-null    int64  
 9   engine-size           201 non-null    int64  
 10  bore                  201 non-null    float64
 11  stroke                201 non-null    float64
 12  compression-ratio     201 non-null    float64
 13  horsepower            201 non-null    float64
 14  peak-rpm              199 non-null    float64
 15  city-mpg              201 no

In [37]:
## Detect the outliers using IQR method
def detect_outliers_iqr(df,features):
    outlier_indices = {}

    for col in features:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)

        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        outliers = df[(df[col] < lower) | (df[col] > upper)]
        outlier_indices[col] = outliers.index.tolist()

    return outlier_indices

# Get only numeric columns
num_cols = df.select_dtypes(include=['int64','float']).columns

In [39]:
## Detect the outliers
outlier_indices_dict = detect_outliers_iqr(df,num_cols)
print(outlier_indices_dict)

{'symboling': [], 'normalized-losses': [10, 11, 104, 105, 106, 178, 179, 190], 'num-of-doors': [], 'wheel-base': [70, 71, 73], 'length': [18], 'width': [6, 7, 8, 17, 18, 49, 70, 71, 72, 73, 74], 'height': [], 'curb-weight': [47, 48], 'num-of-cylinders': [2, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18, 47, 48, 49, 55, 56, 57, 58, 67, 68, 69, 70, 71, 72, 73, 74, 101, 102, 103, 104, 105, 106, 126, 127, 128, 178, 179, 180, 181, 191, 202, 203], 'engine-size': [15, 16, 17, 47, 48, 49, 71, 72, 73, 74], 'bore': [], 'stroke': [29, 47, 48, 111, 113, 130, 131, 134, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], 'compression-ratio': [29, 49, 63, 66, 67, 68, 69, 70, 82, 83, 84, 90, 108, 110, 112, 114, 116, 117, 124, 158, 159, 174, 182, 184, 187, 192, 203], 'horsepower': [49, 105, 126, 127, 128], 'peak-rpm': [165, 166], 'city-mpg': [18, 30], 'highway-mpg': [18, 30, 90], 'price': [15, 16, 17, 47, 48, 49, 70, 71, 72, 73, 74, 126, 127, 128]}
