In [147]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [148]:
# The raw file marks missing entries with '?'; tell pandas to read them as NaN.
data = pd.read_csv(
    'Automobile_data.csv',
    na_values='?',
)

In [149]:
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [150]:
data.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [151]:
data.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [152]:
# Keep only the string (object-dtype) columns. Take an explicit copy: later
# cells assign new values to columns of df_obj, and working on an independent
# frame keeps those writes from being treated as writes to a slice of `data`
# (the situation that triggers SettingWithCopyWarning).
df_obj = data.select_dtypes(include="O").copy()

In [153]:
df_obj.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [154]:
df_obj.dtypes

make                object
fuel-type           object
aspiration          object
num-of-doors        object
body-style          object
drive-wheels        object
engine-location     object
engine-type         object
num-of-cylinders    object
fuel-system         object
dtype: object

In [155]:
# Numeric (integer and float) columns only, then preview the first rows.
df_nums = data.select_dtypes(include=np.number)
df_nums.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


# Categorical encoding of object data types


In [156]:
df_obj['make'].unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
       'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'renault',
       'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [157]:
df_obj['make'].nunique() #nunique gives us number of different classes

22

In [158]:
df_obj['make'].value_counts() #value_counts gives the number of rows in each class, most frequent first

make
toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
jaguar            3
chevrolet         3
alfa-romero       3
renault           2
mercury           1
Name: count, dtype: int64

In [159]:
df_obj[df_obj.isnull().any(axis =1)]

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [160]:
df_obj['num-of-doors'].value_counts()

num-of-doors
four    114
two      89
Name: count, dtype: int64

In [161]:
df_obj["num-of-doors"].mode()

0    four
Name: num-of-doors, dtype: object

In [162]:
# Impute the two missing 'num-of-doors' entries with the most frequent value
# ('four', per the mode computed above). The result is assigned back to the
# column rather than calling fillna(inplace=True) on the selected column:
# that chained in-place form is deprecated and leaves df_obj unchanged when
# pandas Copy-on-Write is active.
doors_mode = df_obj["num-of-doors"].mode()[0]
df_obj["num-of-doors"] = df_obj["num-of-doors"].fillna(doors_mode)

In [163]:
df_obj['num-of-cylinders'].value_counts()

num-of-cylinders
four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: count, dtype: int64

In [164]:
# Turn the spelled-out cylinder counts into integers. Series.map builds a new
# integer column directly; Series.replace only gives int64 here through the
# deprecated silent object->int downcast, and without it the column stays
# object dtype and is picked up again by select_dtypes(include="O") later.
# NOTE: any label missing from the table becomes NaN, so run this cell once
# per fresh df_obj.
cylinder_words = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
df_obj['num-of-cylinders'] = df_obj['num-of-cylinders'].map(cylinder_words)

In [165]:
# 'num-of-doors' only takes the values 'two' and 'four' (see value_counts
# above), so it gets its own two-entry table instead of reusing the cylinder
# mapping. Series.map is used for the same reason as the other encodings:
# it returns numeric values without relying on replace()'s deprecated
# object->int downcast.
door_words = {'two': 2, 'four': 4}
df_obj['num-of-doors'] = df_obj['num-of-doors'].map(door_words)

In [166]:
df_obj['fuel-type'].value_counts()

fuel-type
gas       185
diesel     20
Name: count, dtype: int64

In [167]:
# Encode fuel type as gas -> 1, diesel -> 2. Series.map produces an integer
# column directly instead of depending on the deprecated object->int
# downcast performed by Series.replace.
fuel_codes = {"gas": 1, 'diesel': 2}
df_obj['fuel-type'] = df_obj['fuel-type'].map(fuel_codes)

In [168]:
df_obj['drive-wheels'].value_counts()

drive-wheels
fwd    120
rwd     76
4wd      9
Name: count, dtype: int64

In [169]:
df_obj.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,alfa-romero,1,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,1,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,1,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,1,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,1,std,4,sedan,4wd,front,ohc,5,mpfi


In [170]:
# Integer codes for the three drive-wheel layouts.
drive_codes = {"fwd": 1, 'rwd': 2, '4wd': 3}
df_obj['drive-wheels'] = df_obj['drive-wheels'].map(drive_codes)

In [171]:
df_obj.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,alfa-romero,1,std,2,convertible,2,front,dohc,4,mpfi
1,alfa-romero,1,std,2,convertible,2,front,dohc,4,mpfi
2,alfa-romero,1,std,2,hatchback,2,front,ohcv,6,mpfi
3,audi,1,std,4,sedan,1,front,ohc,4,mpfi
4,audi,1,std,4,sedan,3,front,ohc,5,mpfi


In [172]:
# Row count for each aspiration class.
df_obj['aspiration'].value_counts()

aspiration
std      168
turbo     37
Name: count, dtype: int64

In [173]:
df_obj['body-style'].value_counts()

body-style
sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: count, dtype: int64

In [174]:
# Per-column code tables: outer key is the column name, inner dict maps each
# class label of that column to its integer code.
mapping = {
    "aspiration": {"std": 1, "turbo": 2},
    "body-style": {
        "sedan": 1,
        "hatchback": 2,
        "wagon": 3,
        "hardtop": 4,
        "convertible": 5,
    },
}

In [175]:
# Encode every column listed in `mapping` with its own code table.
# Series.map yields integer columns directly, without depending on the
# deprecated object->int downcast that DataFrame.replace performs.
for column, codes in mapping.items():
    df_obj[column] = df_obj[column].map(codes)

In [176]:
df_obj.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,alfa-romero,1,1,2,5,2,front,dohc,4,mpfi
1,alfa-romero,1,1,2,5,2,front,dohc,4,mpfi
2,alfa-romero,1,1,2,2,2,front,ohcv,6,mpfi
3,audi,1,1,4,1,1,front,ohc,4,mpfi
4,audi,1,1,4,1,3,front,ohc,5,mpfi


In [177]:
df_obj.dtypes

make                object
fuel-type            int64
aspiration           int64
num-of-doors         int64
body-style           int64
drive-wheels         int64
engine-location     object
engine-type         object
num-of-cylinders     int64
fuel-system         object
dtype: object

In [178]:
# Columns that are still object dtype after the manual encodings above.
# Copy explicitly because the following cells overwrite columns of df_objs1;
# an independent frame keeps those assignments from being treated as writes
# to a slice of df_obj.
df_objs1 = df_obj.select_dtypes(include="O").copy()
df_objs1.head()

Unnamed: 0,make,engine-location,engine-type,fuel-system
0,alfa-romero,front,dohc,mpfi
1,alfa-romero,front,dohc,mpfi
2,alfa-romero,front,ohcv,mpfi
3,audi,front,ohc,mpfi
4,audi,front,ohc,mpfi


# Label Encoding

Another approach to encoding the data: each class is assigned an integer code based on the alphabetical order of the class labels. This approach is easier than writing the mapping by hand.

In [179]:
df_objs1["make"].value_counts()

make
toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
jaguar            3
chevrolet         3
alfa-romero       3
renault           2
mercury           1
Name: count, dtype: int64

In [180]:
# Convert 'make' to the pandas categorical dtype; its integer codes are read
# off in the next cell. The dtypes listing confirms the conversion.
df_objs1['make'] = df_objs1['make'].astype("category")
df_objs1.dtypes

make               category
engine-location      object
engine-type          object
fuel-system          object
dtype: object

In [181]:
# Replace each make by its integer category code (the preview below shows
# alfa-romero -> 0 and audi -> 1).
# NOTE: after this cell the column is int8, so the .cat accessor is no longer
# available; re-run the astype("category") cell first if this must be redone.
df_objs1['make'] = df_objs1['make'].cat.codes
df_objs1.head()

Unnamed: 0,make,engine-location,engine-type,fuel-system
0,0,front,dohc,mpfi
1,0,front,dohc,mpfi
2,0,front,ohcv,mpfi
3,1,front,ohc,mpfi
4,1,front,ohc,mpfi


In [182]:
df_objs1["make"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21], dtype=int8)

In [183]:
df_objs1["make"].nunique()

22

In [184]:
df_objs1["engine-type"].value_counts()

engine-type
ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: count, dtype: int64

In [185]:
# Same label-encoding recipe as for 'make': first cast 'engine-type' to the
# categorical dtype, then check the dtypes.
df_objs1['engine-type'] = df_objs1['engine-type'].astype("category")
df_objs1.dtypes

make                   int8
engine-location      object
engine-type        category
fuel-system          object
dtype: object

In [186]:
# Replace each engine type by its integer category code (the preview below
# shows dohc -> 0, ohc -> 3, ohcv -> 5).
# NOTE: the column is int8 afterwards, so this cell cannot be re-run on its
# own; re-run the astype("category") cell first.
df_objs1['engine-type'] = df_objs1['engine-type'].cat.codes
df_objs1.head()

Unnamed: 0,make,engine-location,engine-type,fuel-system
0,0,front,0,mpfi
1,0,front,0,mpfi
2,0,front,5,mpfi
3,1,front,3,mpfi
4,1,front,3,mpfi


In [187]:
df_objs1["engine-type"].unique()

array([0, 5, 3, 2, 6, 4, 1], dtype=int8)

In [188]:
df_objs1["engine-type"].nunique()

7

In [189]:
df_objs1["engine-location"].value_counts()

engine-location
front    202
rear       3
Name: count, dtype: int64

In [190]:
df_objs1["fuel-system"].value_counts()

fuel-system
mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: count, dtype: int64

# Using One-Hot Encoding for the remaining two columns

In [191]:
# One-hot encode 'engine-location' into one indicator column per class.
# sparse_output=False makes fit_transform return a dense NumPy array; the
# older `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and is rejected by newer releases.
ohe = OneHotEncoder(sparse_output=False)
encoded_data = ohe.fit_transform(df_objs1[["engine-location"]])
encoded_data



array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

Converting this array into a DataFrame

In [192]:
# ohe.categories_ is a list holding one array of class labels per encoded
# feature. Only 'engine-location' was encoded, so take element 0 to get flat
# column names ('front', 'rear'); passing the whole list makes pandas build
# a one-level MultiIndex instead.
encoded_data = pd.DataFrame(encoded_data, columns=ohe.categories_[0])
encoded_data.head()

Unnamed: 0,front,rear
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
