# Data Analysis: lecture 2

## Data Preprocessing

<font color='green'>We are done with the missing values. let's quickly do that again</font>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('imports-85.data', header=None)
headers = ["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]
df.columns = headers

# replace ? with nan
df.replace('?', np.nan, inplace=True)

#function to fill the missing value

def fillmissing_numeric(table, col_name):
    table[col_name] = table[col_name].astype('float64')
    table[col_name].fillna(table[col_name].mean(), inplace=True)
    
def fillmissing_object(table, col_name):
    table[col_name] = table[col_name].astype('object')
    table[col_name].fillna(table[col_name].value_counts().idxmax(), inplace=True)
    
# fill the numeric values
fillmissing_numeric(df, 'normalized-losses')
fillmissing_numeric(df, 'bore')
fillmissing_numeric(df, 'stroke')
fillmissing_numeric(df, 'horsepower')
fillmissing_numeric(df, 'peak-rpm')
fillmissing_numeric(df, 'stroke')

#fill the categorical values
fillmissing_object(df, 'num-of-doors')

df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450


**Step 1: Do the necessary data conversion**

In [4]:
df.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

Change the data types as required

In [5]:
df['price'] = df['price'].astype('float64')

**Step 2: Change the unit conversion, where necessary** [Not relevant here]

**Step 3: Data Scaling**

Reson: If your data has very different ranges, then data with higher numerical value can be unfairly prioritized by the machine learning algorithm

For example, the "curb-weight", "stroke" and "length" has very different ranges of values

In [6]:
df[["curb-weight", "stroke", "length"]]

Unnamed: 0,curb-weight,stroke,length
0,2548,2.68,168.8
1,2548,2.68,168.8
2,2823,3.47,171.2
3,2337,3.40,176.6
4,2824,3.40,176.6
...,...,...,...
196,2952,3.15,188.8
197,3049,3.15,188.8
198,3012,2.87,188.8
199,3217,3.40,188.8


<font color = 'red'>So, based on these, the machine learning algorithm may impose curb-weight highest weight, then length and then the stroke. But this is not necessarily true!</font>

There are 3 ways how we can do the data scaling:

1. Simple feature scaling. Divide all the data of a column with the maximum value of the column. Scale the data between 0 to 1
2. Min-max scaling. Subtract the minumum value from the old value and divide it by the range. Scale the data between 0 to 1
3. Standard Z-normalization. Subtract mean from old value and divide it by standard deviation. Makes the average 0 and standard deviation 1. 

![](normlize.PNG)

In [8]:
col_dtypes = df.dtypes.tolist()
numeric_cols = [df.columns[idx] for idx, col_type in enumerate(col_dtypes) if col_type=='int64' or col_type=='float64']

In [9]:
df_numerics = df[numeric_cols]
df_numerics.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


**We donot want any of the scaling for the price column**

In [10]:
df_numerics.drop('price', axis=1, inplace=True)
df_numerics.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27
1,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27
2,1,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22


In [11]:
# Simple feature scaling
df_numeric_scaled_1 = df_numerics/df_numerics.max()
df_numeric_scaled_1.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,1.0,0.476562,0.732837,0.811148,0.890278,0.816054,0.62666,0.398773,0.880711,0.642686,0.391304,0.423664,0.757576,0.428571,0.5
1,1.0,0.476562,0.732837,0.811148,0.890278,0.816054,0.62666,0.398773,0.880711,0.642686,0.391304,0.423664,0.757576,0.428571,0.5
2,0.333333,0.476562,0.781638,0.822681,0.909722,0.876254,0.694294,0.466258,0.680203,0.832134,0.391304,0.587786,0.757576,0.387755,0.481481
3,0.666667,0.640625,0.825476,0.84863,0.919444,0.908027,0.574766,0.334356,0.809645,0.815348,0.434783,0.389313,0.833333,0.489796,0.555556
4,0.666667,0.640625,0.822167,0.84863,0.922222,0.908027,0.69454,0.417178,0.809645,0.815348,0.347826,0.438931,0.833333,0.367347,0.407407


In [12]:
# Min-max scaling
df_numeric_scaled_2 = (df_numerics-df_numerics.min())/(df_numerics.max() - df_numerics.min())
df_numeric_scaled_2.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,1.0,0.298429,0.058309,0.413433,0.324786,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,0.294393,0.346939,0.222222,0.289474
1,1.0,0.298429,0.058309,0.413433,0.324786,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,0.294393,0.346939,0.222222,0.289474
2,0.6,0.298429,0.230321,0.449254,0.444444,0.383333,0.517843,0.343396,0.1,0.666667,0.125,0.495327,0.346939,0.166667,0.263158
3,0.8,0.518325,0.38484,0.529851,0.504274,0.541667,0.329325,0.181132,0.464286,0.633333,0.1875,0.252336,0.55102,0.305556,0.368421
4,0.8,0.518325,0.373178,0.529851,0.521368,0.541667,0.518231,0.283019,0.464286,0.633333,0.0625,0.313084,0.55102,0.138889,0.157895


In [13]:
# Z-score scaling
df_numeric_scaled_3 = (df_numerics-df_numerics.mean())/(df_numerics.std())
df_numeric_scaled_3.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,1.720753,0.0,-1.68091,-0.438315,-0.851335,-2.029015,-0.014821,0.075201,0.519668,-1.825276,-0.290709,0.203247,-0.246103,-0.650624,-0.540937
1,1.720753,0.0,-1.68091,-0.438315,-0.851335,-2.029015,-0.014821,0.075201,0.519668,-1.825276,-0.290709,0.203247,-0.246103,-0.650624,-0.540937
2,0.126876,0.0,-0.708334,-0.243544,-0.185134,-0.558319,0.516789,0.604724,-2.427303,0.674347,-0.290709,1.354035,-0.246103,-0.961995,-0.687669
3,0.923814,1.312654,0.165335,0.19469,0.147966,0.217881,-0.42271,-0.430252,-0.524828,0.452862,-0.041019,-0.037616,0.799673,-0.183569,-0.100741
4,0.923814,1.312654,0.099398,0.19469,0.243137,0.217881,0.518722,0.219617,-0.524828,0.452862,-0.540399,0.310297,0.799673,-1.11768,-1.274597


In [14]:
df_numeric_scaled_3.mean()

symboling            1.701237e-16
normalized-losses   -2.540809e-17
wheel-base          -1.144911e-14
length              -1.634072e-14
width                1.436772e-14
height              -1.983930e-14
curb-weight          2.640232e-16
engine-size         -5.468263e-17
bore                -3.031296e-15
stroke               1.928502e-14
compression-ratio   -4.536172e-16
horsepower           3.327907e-16
peak-rpm            -8.224488e-16
city-mpg             4.032153e-17
highway-mpg         -1.673620e-16
dtype: float64

In [15]:
df_numeric_scaled_3.std()

symboling            1.0
normalized-losses    1.0
wheel-base           1.0
length               1.0
width                1.0
height               1.0
curb-weight          1.0
engine-size          1.0
bore                 1.0
stroke               1.0
compression-ratio    1.0
horsepower           1.0
peak-rpm             1.0
city-mpg             1.0
highway-mpg          1.0
dtype: float64

<font color = 'red'>The easier or more efficient way to do that is using the sklearn library</font>

In [None]:
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

In [None]:
## simple feature scaling
scale = MaxAbsScaler()
scaled_fitted = scale.fit_transform(df_numerics)
df_numeric_scaled_1_library = pd.DataFrame(scaled_fitted, columns=df_numerics.columns)

## Min-mx feature scaling
scale = MinMaxScaler()
scaled_fitted = scale.fit_transform(df_numerics)
df_numeric_scaled_2_library = pd.DataFrame(scaled_fitted, columns=df_numerics.columns)

## z-normalized feature scaling
scale = StandardScaler()
scaled_fitted = scale.fit_transform(df_numerics)
df_numeric_scaled_3_library = pd.DataFrame(scaled_fitted, columns=df_numerics.columns)

**Step 4: Work with categorical variables**

The way to convert categorical to numerical variable is to use dummy variable

In [16]:
categorical_cols = [df.columns[idx] for idx, col_type in enumerate(col_dtypes) if col_type=='O']
categorical_cols

['make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'engine-type',
 'num-of-cylinders',
 'fuel-system']

In [17]:
pd.get_dummies(df['make'])

Unnamed: 0,alfa-romero,audi,bmw,chevrolet,dodge,honda,isuzu,jaguar,mazda,mercedes-benz,...,nissan,peugot,plymouth,porsche,renault,saab,subaru,toyota,volkswagen,volvo
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
## One-hot-encoded

In [18]:
pd.get_dummies(df[categorical_cols])

Unnamed: 0,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_jaguar,make_mazda,make_mercedes-benz,...,num-of-cylinders_twelve,num-of-cylinders_two,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
1 4 7 14 99
0 0 0 1 0
1 0 0 0 0
1 0 0 0 0
0 0 0 0 1