In [47]:
import pandas as pd 
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [48]:
#The data doesn't contain a header, so you will define those column names in a variable that's named col_names to the attributes listed in the dataset description. 
url = "imports-85.csv" 
col_names=['symboling','normalized-losses','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base',
                                    'length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size',
                                    'fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

df_car = pd.read_csv(url,sep=',',names = col_names ,na_values="?",  header=None) #sep used to tell red the separator is a comma  , header=None to tell pandas that the data doesn't contain a header , na_values to replace the "?" with NaN





In [49]:
df_car.shape # to get the number of rows and columns in the dataset 


(205, 25)

In [50]:
df_car.head(5) # to get the first 5 rows of the dataset 

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [51]:
df_car.info() # to get the information about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   fuel-type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   num-of-doors       203 non-null    object 
 5   body-style         205 non-null    object 
 6   drive-wheels       205 non-null    object 
 7   engine-location    205 non-null    object 
 8   wheel-base         205 non-null    float64
 9   length             205 non-null    float64
 10  width              205 non-null    float64
 11  height             205 non-null    float64
 12  curb-weight        205 non-null    int64  
 13  engine-type        205 non-null    object 
 14  num-of-cylinders   205 non-null    object 
 15  engine-size        205 non-null    int64  
 16  fuel-system        205 non

In [52]:
df_car.columns # to get the columns names of the dataset

Index(['symboling', 'normalized-losses', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'], dtype='object')

In [53]:
df_car = df_car[[ 'aspiration', 'num-of-doors',  'drive-wheels',  'num-of-cylinders']].copy() # to select the columns that we need , drop the rest of the columns 

In [54]:
df_car.head(5) # to get the first 5 rows of the dataset

Unnamed: 0,aspiration,num-of-doors,drive-wheels,num-of-cylinders
0,std,two,rwd,four
1,std,two,rwd,four
2,std,two,rwd,six
3,std,four,fwd,four
4,std,four,4wd,five


Most machine learning algorithms require inputs that are numerical values.

The num-of-cylinders and num-of-doors features have an ordinal value. You could convert the values of these features into their numerical counterparts.
However, aspiration and drive-wheels don't have an ordinal value. These features must be converted differently.

### Encoding ordinal features

using mapper function to convert ordinal features into ordered numerical values 

In [55]:
df_car.info() # to get the information about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   aspiration        205 non-null    object
 1   num-of-doors      203 non-null    object
 2   drive-wheels      205 non-null    object
 3   num-of-cylinders  205 non-null    object
dtypes: object(4)
memory usage: 6.5+ KB


In [56]:
df_car['num-of-doors'].value_counts() # to get the count of each value in the column num-of-doors 

num-of-doors
four    114
two      89
Name: count, dtype: int64

In [57]:
#This feature only has two values: four and two. You can create a simple mapper that contains a dictionary: 
door_mapper = {"two": 2,
              "four": 4}

In [58]:
#You can then use the replace method from """pandas""" to generate a new numerical column based on the num-of-doors column:
df_car['doors'] = df_car['num-of-doors'].replace(door_mapper)  

  df_car['doors'] = df_car['num-of-doors'].replace(door_mapper)


In [59]:
df_car.head() 

Unnamed: 0,aspiration,num-of-doors,drive-wheels,num-of-cylinders,doors
0,std,two,rwd,four,2.0
1,std,two,rwd,four,2.0
2,std,two,rwd,six,2.0
3,std,four,fwd,four,4.0
4,std,four,4wd,five,4.0


In [60]:
df_car['num-of-cylinders'].value_counts() # to get the count of each value in the column num-of-cylinders ] 

num-of-cylinders
four      159
six        24
five       11
eight       5
two         4
twelve      1
three       1
Name: count, dtype: int64

In [61]:
cylinder_mapper = {"two": 2,  "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}  # create a mapper for the num-of-cylinders column

In [62]:
df_car ['cylinders'] = df_car['num-of-cylinders'].replace(cylinder_mapper) # to create a new column that contains the numerical values of the num-of-cylinders column

  df_car ['cylinders'] = df_car['num-of-cylinders'].replace(cylinder_mapper) # to create a new column that contains the numerical values of the num-of-cylinders column


In [63]:
df_car.head(20) # to get the first 20 rows of the dataset

Unnamed: 0,aspiration,num-of-doors,drive-wheels,num-of-cylinders,doors,cylinders
0,std,two,rwd,four,2.0,4
1,std,two,rwd,four,2.0,4
2,std,two,rwd,six,2.0,6
3,std,four,fwd,four,4.0,4
4,std,four,4wd,five,4.0,5
5,std,two,fwd,five,2.0,5
6,std,four,fwd,five,4.0,5
7,std,four,fwd,five,4.0,5
8,turbo,four,fwd,five,4.0,5
9,turbo,two,4wd,five,2.0,5


### Encoding non-ordinal categorical data

ou will encode non-ordinal data by using the get_dummies method from pandas.

The two remaining features are not ordinal.

According to the attribute description, the following values are possible:

aspiration: std, turbo.
drive-wheels: 4wd, fwd, rwd.
You might think that the correct strategy is to convert these values into numerical values. For example, consider the drive-wheels feature. You could use 4wd = 1, fwd = 2, and rwd = 3. However, fwd isn't less than rwd. These values don't have an order, but you just introduced an order to them by assigning these numerical values.


In [64]:
#pandas provides a get_dummies method, which converts the data into binary features
df_car['drive-wheels'].value_counts()

drive-wheels
fwd    120
rwd     76
4wd      9
Name: count, dtype: int64

In [65]:
#Use the `get_dummies` method to add new binary features to the DataFrame.
#These binary features enable you to express the information in a numerical way, without implying any order. 
df_car = pd.get_dummies(df_car,columns=['drive-wheels']) 

In [66]:
df_car.head(5) # to get the first 5 rows of the dataset

Unnamed: 0,aspiration,num-of-doors,num-of-cylinders,doors,cylinders,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd
0,std,two,four,2.0,4,False,False,True
1,std,two,four,2.0,4,False,False,True
2,std,two,six,2.0,6,False,False,True
3,std,four,four,4.0,4,False,True,False
4,std,four,five,4.0,5,True,False,False


The data in the aspiration column only has two values: std and turbo. You could encode this column into two binary features. However, you could also ignore the std value and record whether it's turbo or not. To do this, you would still use the get_dummies method, but specify drop_first as True. 

In [69]:
df_car['aspiration'].value_counts() 

aspiration
std      168
turbo     37
Name: count, dtype: int64

In [70]:
df_car = pd.get_dummies(df_car,columns=['aspiration']) # to create binary features for the aspiration column 

In [73]:
df_car.head(10) # to get the first 5 rows of the dataset 

Unnamed: 0,num-of-doors,num-of-cylinders,doors,cylinders,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd,aspiration_std,aspiration_turbo
0,two,four,2.0,4,False,False,True,True,False
1,two,four,2.0,4,False,False,True,True,False
2,two,six,2.0,6,False,False,True,True,False
3,four,four,4.0,4,False,True,False,True,False
4,four,five,4.0,5,True,False,False,True,False
5,two,five,2.0,5,False,True,False,True,False
6,four,five,4.0,5,False,True,False,True,False
7,four,five,4.0,5,False,True,False,True,False
8,four,five,4.0,5,False,True,False,False,True
9,two,five,2.0,5,True,False,False,False,True
