In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the autos CSV. The original hard-coded path remains the default,
# but it can be overridden through the AUTOS_DATASET_PATH environment variable
# so the notebook also runs on machines without this exact drive layout.
file_path = os.environ.get(
    "AUTOS_DATASET_PATH",
    r"D:\ml-algorithms\datasets\autos_dataset.csv",
)

In [3]:
# Load the CSV at `file_path` into a DataFrame and preview its first rows.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(205, 26)

In [7]:
# Count NaN entries per column. Placeholder strings such as '?' are ordinary
# values to pandas and are NOT counted here.
df.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

#### `isnull()` reports no null values, but missing values are present in the dataset encoded as `?`

In [8]:
# Count, per column, how many cells hold the literal '?' placeholder.
df.eq('?').sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

#### Replace the `?` placeholders with NaN values and then find the percentage of missing values
 - if a column is missing >= 25% of its values, it is better to drop that column

In [9]:
# Turn the '?' placeholder into a real missing value so pandas can see it.
df.replace('?', np.nan, inplace=True)

# Columns that contained '?' were read as object dtype; convert every column
# whose remaining values all parse as numbers.
for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # Genuinely non-numeric column (text labels): leave it unchanged.
        # Only conversion failures are caught, so unrelated errors and
        # KeyboardInterrupt are no longer silently swallowed.
        continue

In [10]:
# Check that the columns were converted to numeric
# df.info()

In [11]:
### Find out the missing-value percentage
# Per-column tally of NaN cells, and that tally as a share of all rows (in %).
missing_count = df.isna().sum()
missing_percent = missing_count.div(len(df)).mul(100)

In [12]:
# Put counts and percentages side by side, then show only the columns that
# actually have gaps.
missing_info = pd.concat(
    [missing_count.rename('Missing_Values'), missing_percent.rename('Percentage')],
    axis=1,
)
print(missing_info.loc[missing_info['Missing_Values'] > 0])

                   Missing_Values  Percentage
normalized-losses              41    20.00000
num-of-doors                    2     0.97561
bore                            4     1.95122
stroke                          4     1.95122
horsepower                      2     0.97561
peak-rpm                        2     0.97561
price                           4     1.95122


In [13]:
### Check the skewness of these features
# Columns reported above as containing missing values.
missing_cols = [
    'normalized-losses',
    'num-of-doors',
    'bore',
    'stroke',
    'horsepower',
    'peak-rpm',
    'price',
]

In [14]:
### Calculate skewness
# Print the skewness of each numeric column in `missing_cols`; non-numeric
# columns are reported and skipped.
for col in missing_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        print(f"Skipping {col}: Not Numeric column")
        continue
    skew_val = round(df[col].skew(),2)
    print(f"Skewness of  {col}: {skew_val:.2f}")

Skewness of  normalized-losses: 0.77
Skipping num-of-doors: Not Numeric column
Skewness of  bore: 0.02
Skewness of  stroke: -0.68
Skewness of  horsepower: 1.39
Skewness of  peak-rpm: 0.07
Skewness of  price: 1.81


- Remove rows where the target column (price) is missing. Do not impute these values. This ensures your model is trained only on real, observed outcomes and maintains the integrity of your analysis.

In [15]:
## Remove rows whose `price` is missing
# Collect the index labels of rows without a price and drop them in place.
rows_without_price = df.index[df['price'].isna()]
df.drop(index=rows_without_price, inplace=True)

In [16]:
### Fill the missing values in categorical data
# Impute with the most frequent label. The result is assigned back to the
# column instead of calling fillna(inplace=True) on `df[col]`: that chained
# in-place form is deprecated and does not update `df` under Copy-on-Write.
df['num-of-doors'] = df['num-of-doors'].fillna(df['num-of-doors'].mode()[0])

In [17]:
### Fill the missing values in continuous features
# Skewness 0.77 -> impute with the median. Assigned back to the column because
# chained fillna(inplace=True) does not update `df` under Copy-on-Write.
df['normalized-losses'] = df['normalized-losses'].fillna(df['normalized-losses'].median())

In [18]:
# Skewness -0.68 -> impute with the median. Assigned back to the column because
# chained fillna(inplace=True) does not update `df` under Copy-on-Write.
df['stroke'] = df['stroke'].fillna(df['stroke'].median())

In [19]:
# Skewness 1.39 -> impute with the median. Assigned back to the column because
# chained fillna(inplace=True) does not update `df` under Copy-on-Write.
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [20]:
# Skewness 0.02 (roughly symmetric) -> impute with the mean. Assigned back to
# the column because chained fillna(inplace=True) does not update `df` under
# Copy-on-Write.
df['bore'] = df['bore'].fillna(df['bore'].mean())

In [21]:
# Skewness 0.07 (roughly symmetric) -> impute with the mean. Assigned back to
# the column because chained fillna(inplace=True) does not update `df` under
# Copy-on-Write.
df['peak-rpm'] = df['peak-rpm'].fillna(df['peak-rpm'].mean())

In [22]:
# Check for any missing values
# df.isnull().sum()

In [23]:
# Re-inspect dtypes and non-null counts after dropping rows without a price
# and imputing the remaining gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  201 non-null    float64
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       201 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non-null

### convert these categorical features FIRST

In [24]:
### Check the unique values of each categorical feature

# Text-valued columns to inspect before encoding.
categorical_cols = [
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'num-of-cylinders',
    'fuel-system',
]

- Step 1: Analyze Each Categorical Feature

In [25]:
# For every categorical column, print a blank line, the column name, and the
# frequency of each label.
for col in categorical_cols:
    print(f"\n{col}:", df[col].value_counts(), sep="\n")


make:
make
toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
subaru           12
volkswagen       12
volvo            11
peugot           11
dodge             9
mercedes-benz     8
bmw               8
plymouth          7
audi              6
saab              6
porsche           4
chevrolet         3
alfa-romero       3
jaguar            3
isuzu             2
renault           2
mercury           1
Name: count, dtype: int64

fuel-type:
fuel-type
gas       181
diesel     20
Name: count, dtype: int64

aspiration:
aspiration
std      165
turbo     36
Name: count, dtype: int64

num-of-doors:
num-of-doors
four    115
two      86
Name: count, dtype: int64

body-style:
body-style
sedan          94
hatchback      68
wagon          25
hardtop         8
convertible     6
Name: count, dtype: int64

drive-wheels:
drive-wheels
fwd    118
rwd     75
4wd      8
Name: count, dtype: int64

engine-location:
engine-location
front    198
rear       3
Name:

* Step 2: Choose Appropriate Encoding

    - For Ordinal Features (natural ordering):

In [26]:
# num-of-cylinders is ordinal: replace the spelled-out count with its integer.
# Any label missing from this mapping becomes NaN after .map(), so the mapping
# must cover every label in the column.
# NOTE(review): 'twelve' was added because the UCI Automobile data this CSV
# appears to come from contains a twelve-cylinder car - confirm against
# df['num-of-cylinders'].value_counts().
cylinder_mapping = {'two':2,'three':3,
                    'four':4,'five':5,
                    'six':6,'eight':8,
                    'twelve':12}
df['num-of-cylinders'] = df['num-of-cylinders'].map(cylinder_mapping)

In [27]:
### num-of-doors
# Replace the spelled-out door count with its integer. The key must be 'two'
# (it was misspelt 'tow'): .map() turns any label missing from the mapping into
# NaN, so the typo silently erased every two-door row.
door_mapping = {'two':2,'four':4}
df['num-of-doors'] = df['num-of-doors'].map(door_mapping)

    - For Nominal Features (no natural ordering):

In [28]:
# One-hot encode the nominal features (make, fuel-type, body-style, ...).
# drop_first=True omits the first level of each feature; dtype=int makes the
# indicator columns integers directly.
nominal_cols = ['make','fuel-type','aspiration','body-style',
                'drive-wheels','engine-location','engine-type','fuel-system']
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True, dtype=int)

In [29]:
# print(df.dtypes)

### Handle Skewness & Outliers

#### Detect outliers

    - ONLY continuous features - exclude ordinal/discrete features

In [30]:
# Continuous measurements to scan for outliers (ordinal/discrete columns are
# deliberately left out).
continuous_features = [
    'normalized-losses',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-size',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
]

In [31]:
# Keep only the listed features that are actual columns of df, in the same order.
available_columns = set(df.columns)
continuous_features = [name for name in continuous_features if name in available_columns]

In [32]:
# Show which features will be scanned for outliers.
print("Continuous features for outlier detection:", continuous_features, sep="\n")

Continuous features for outlier detection:
['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg']


In [33]:
### Detect outliers
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column.
outlier_summary = {}

for col in continuous_features:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Boolean mask of the rows falling outside the fences.
    outliers = df[col].lt(lower_bound) | df[col].gt(upper_bound)
    outlier_count = outliers.sum()
    outlier_summary[col] = outlier_count

    print(f"{col}: {outlier_count} outliers")

# Rank the features by outlier count (highest first) and show the top five.
sorted_outliers = sorted(outlier_summary.items(), key=lambda item: item[1], reverse=True)
print('*'*50)
print("Ensure with most-outliers")
for feature, count in sorted_outliers[:5]:
    print(f"{feature}: {count}")

normalized-losses: 8 outliers
wheel-base: 3 outliers
length: 1 outliers
width: 11 outliers
height: 0 outliers
curb-weight: 2 outliers
engine-size: 10 outliers
bore: 0 outliers
stroke: 20 outliers
compression-ratio: 27 outliers
horsepower: 5 outliers
peak-rpm: 2 outliers
city-mpg: 2 outliers
highway-mpg: 3 outliers
**************************************************
Ensure with most-outliers
compression-ratio: 27
stroke: 20
width: 11
engine-size: 10
normalized-losses: 8


* Features where outliers might be legitimate (keep them):
    - price (expensive luxury cars)
    - horsepower (sports cars)
    - engine-size (luxury/sports cars)

* Features where outliers might be errors (consider capping):
    - normalized-losses, compression-ratio

In [34]:
# Set outlier treatment thresholds.
# Features with strictly more outliers than this are flagged for treatment.
# (The variable name keeps its original spelling in case other cells use it.)
signiicant_outlier_threshold = 10

problematic_features = [
    col for col, count in outlier_summary.items()
    if count > signiicant_outlier_threshold
]

# Report the threshold actually used, so the message stays correct if the
# value above is changed.
print(f"Features with > {signiicant_outlier_threshold} outliers: {problematic_features}")

Features with > 10 outliers: ['width', 'stroke', 'compression-ratio']
