In [1]:
link='https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'      # DataSet Link

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
headers=["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]

In [46]:
df=pd.read_csv(link,names=headers)

In [47]:
df.head()               # Uncleaned Data

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [48]:
# Handling the Missing Values
df.replace('?',np.nan,inplace=True)

In [49]:
df.head()             # After Handling the missing values and changing them to NaN

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [50]:
# Finding the Missing Data
missing_data=df.isnull()
missing_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [51]:
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())
    print()

symboling
False    205
Name: symboling, dtype: int64

normalized-losses
False    164
True      41
Name: normalized-losses, dtype: int64

make
False    205
Name: make, dtype: int64

fuel-type
False    205
Name: fuel-type, dtype: int64

aspiration
False    205
Name: aspiration, dtype: int64

num-of-doors
False    203
True       2
Name: num-of-doors, dtype: int64

body-style
False    205
Name: body-style, dtype: int64

drive-wheels
False    205
Name: drive-wheels, dtype: int64

engine-location
False    205
Name: engine-location, dtype: int64

wheel-base
False    205
Name: wheel-base, dtype: int64

length
False    205
Name: length, dtype: int64

width
False    205
Name: width, dtype: int64

height
False    205
Name: height, dtype: int64

curb-weight
False    205
Name: curb-weight, dtype: int64

engine-type
False    205
Name: engine-type, dtype: int64

num-of-cylinders
False    205
Name: num-of-cylinders, dtype: int64

engine-size
False    205
Name: engine-size, dtype: int64

fuel-system
Fa

# Dealing with Missing Datas

In [35]:
# To deal with missing datas, I will now change the respective NaN to the column mean

In [53]:
# Handling the missing places of Normalised-loses
norm_loss_avg=df['normalized-losses'].astype('float').mean(axis=0)
df['normalized-losses'].replace(np.nan,norm_loss_avg,inplace=True)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,122,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [55]:
norm_loss_avg

122.0

In [58]:
# Replcing NaN from "bore" column to the mean of the column
bore_avg=df['bore'].astype('float').mean(axis=0)
bore_avg

3.3297512437810957

In [60]:
df['bore'].replace(np.nan,bore_avg,inplace=True)
df['bore'].head()

0    3.47
1    3.47
2    2.68
3    3.19
4    3.19
Name: bore, dtype: object

In [61]:
# Replcing NaN from "stroke" column to the mean of the column
stroke_avg=df['stroke'].astype('float').mean(axis=0)
stroke_avg

3.2554228855721337

In [62]:
df['stroke'].replace(np.nan,stroke_avg,inplace=True)

In [66]:
# Replcing NaN from "horsepower" column to the mean of the column
horsepower_avg=df['horsepower'].astype('float').mean(axis=0)
horsepower_avg

104.25615763546799

In [67]:
df['horsepower'].replace(np.nan,horsepower_avg,inplace=True)

In [72]:
# Replcing NaN from "peak-rpm" column to the mean of the column
peak_rpm_avg=df['peak-rpm'].astype('float').mean(axis=0)
peak_rpm_avg

5125.369458128079

In [79]:
df['peak-rpm'].replace(np.nan,peak_rpm_avg,inplace=True)

In [89]:
# Replcing NaN from "num-of-doors" column to the frequency of the column
df['num-of-doors'].value_counts()

four    114
two      89
Name: num-of-doors, dtype: int64

In [91]:
freq=df['num-of-doors'].value_counts().idxmax()

In [92]:
df['num-of-doors'].replace(np.nan,freq,inplace=True)

In [93]:
# Droping the rows with "price" column as NaN since we want to predict the price value
df.dropna(subset=['price'],axis=0,inplace=True)

In [94]:
# reseting the index number
df.reset_index(drop=True,inplace=True)

In [95]:
df.head()          # DataSet with no missing values

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,122,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


# Correcting the Data Format

In [98]:
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [105]:
# Convert data types to proper format
df['normalized-losses']=df['normalized-losses'].astype('int')
df[['bore','stroke','price','peak-rpm']]=df[['bore','stroke','price','peak-rpm']].astype('float')

In [106]:
df.dtypes

symboling              int64
normalized-losses      int32
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower            object
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [107]:
# Data Standardization
df['city-L/100km']=235/df['city-mpg']

In [108]:
df[]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km
0,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000.0,21,27,13495.0,11.190476
1,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000.0,21,27,16500.0,11.190476
2,1,122,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,mpfi,2.68,3.47,9.0,154,5000.0,19,26,16500.0,12.368421
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,mpfi,3.19,3.4,10.0,102,5500.0,24,30,13950.0,9.791667
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,mpfi,3.19,3.4,8.0,115,5500.0,18,22,17450.0,13.055556


# Data Normalization

In [126]:
df['length']=df['length']/df['length'].max()
df['width']=df['width']/df['width'].max()
df['height']=df['height']/df['height'].max()
df[['length','width','height']].head()

Unnamed: 0,length,width,height
0,0.811148,0.890278,0.816054
1,0.811148,0.890278,0.816054
2,0.822681,0.909722,0.876254
3,0.84863,0.919444,0.908027
4,0.84863,0.922222,0.908027


# Introducing Dummy values

In [133]:
dummy_var1=pd.get_dummies(df['aspiration'])
dummy_var1.rename(columns={'std':'aspiration-std','turbo':'aspiration-turbo'},inplace=True)
dummy_var1.head()

Unnamed: 0,aspiration-std,aspiration-turbo
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [150]:
df['aspiration-std']=dummy_var1['aspiration-std']
df['aspiration-turbo']=dummy_var1['aspiration-turbo']

In [151]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,price,city-L/100km,highway-mpg-L/100km,aspiration-std,aspiration-turbo
0,3,122,alfa-romero,gas,two,convertible,rwd,front,88.6,0.811148,...,2.68,9.0,111,5000.0,21,13495.0,11.190476,8.703704,1,0
1,3,122,alfa-romero,gas,two,convertible,rwd,front,88.6,0.811148,...,2.68,9.0,111,5000.0,21,16500.0,11.190476,8.703704,1,0
2,1,122,alfa-romero,gas,two,hatchback,rwd,front,94.5,0.822681,...,3.47,9.0,154,5000.0,19,16500.0,12.368421,9.038462,1,0
3,2,164,audi,gas,four,sedan,fwd,front,99.8,0.84863,...,3.4,10.0,102,5500.0,24,13950.0,9.791667,7.833333,1,0
4,2,164,audi,gas,four,sedan,4wd,front,99.4,0.84863,...,3.4,8.0,115,5500.0,18,17450.0,13.055556,10.681818,1,0


In [157]:
dummy_var2=pd.get_dummies(df['fuel-type'])
dummy_var2.rename(columns={'diesel':'fuel-diesel','gas':'fuel-gas'},inplace=True)
dummy_var2.head()

Unnamed: 0,fuel-diesel,fuel-gas
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [158]:
df['fuel-diesel']=dummy_var2['fuel-diesel']
df['fuel-gas']=dummy_var2['fuel-gas']
df.head()                                    # Dummies Created

Unnamed: 0,symboling,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,horsepower,peak-rpm,city-mpg,price,city-L/100km,highway-mpg-L/100km,aspiration-std,aspiration-turbo,fuel-diesel,fuel-gas
0,3,122,alfa-romero,gas,two,convertible,rwd,front,88.6,0.811148,...,111,5000.0,21,13495.0,11.190476,8.703704,1,0,0,1
1,3,122,alfa-romero,gas,two,convertible,rwd,front,88.6,0.811148,...,111,5000.0,21,16500.0,11.190476,8.703704,1,0,0,1
2,1,122,alfa-romero,gas,two,hatchback,rwd,front,94.5,0.822681,...,154,5000.0,19,16500.0,12.368421,9.038462,1,0,0,1
3,2,164,audi,gas,four,sedan,fwd,front,99.8,0.84863,...,102,5500.0,24,13950.0,9.791667,7.833333,1,0,0,1
4,2,164,audi,gas,four,sedan,4wd,front,99.4,0.84863,...,115,5500.0,18,17450.0,13.055556,10.681818,1,0,0,1


In [160]:
df.to_csv('Car_DataSet(Cleaned).csv')            # Creating the Cleaned Data