# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(color_codes=True)

In [None]:
# Read the raw CSV into a DataFrame; every later cell works from `df`.
df = pd.read_csv(filepath_or_buffer="/content/data_machinera.csv")

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


# Handling Missing and Duplicate Data

In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats.
df = df.drop_duplicates(subset=None, keep="first")

In [None]:
# Discard every row that has a missing value in at least one column.
# NOTE(review): this also drops rows whose only gap is in "Market Category" or
# "Engine Fuel Type", which a later cell removes anyway - confirm that is intended.
df = df.dropna(axis=0, how="any")

In [None]:
# Summarise what is left after cleaning: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7735 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               7735 non-null   object 
 1   Model              7735 non-null   object 
 2   Year               7735 non-null   int64  
 3   Engine Fuel Type   7735 non-null   object 
 4   Engine HP          7735 non-null   float64
 5   Engine Cylinders   7735 non-null   float64
 6   Transmission Type  7735 non-null   object 
 7   Driven_Wheels      7735 non-null   object 
 8   Number of Doors    7735 non-null   float64
 9   Market Category    7735 non-null   object 
 10  Vehicle Size       7735 non-null   object 
 11  Vehicle Style      7735 non-null   object 
 12  highway MPG        7735 non-null   int64  
 13  city mpg           7735 non-null   int64  
 14  Popularity         7735 non-null   int64  
 15  MSRP               7735 non-null   int64  
dtypes: float64(3), int64(5)

# Extract Dependent and Independent variables

>Independent variables (also referred to as features) are the inputs to the process that is being analyzed. Dependent variables are the outputs of that process.

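>For example, in a data set of customer purchases, the independent variables are the inputs of the purchasing process being analyzed, and the result (whether a user purchased or not) is the dependent variable. In this notebook, the vehicle attributes are the independent variables and `MSRP` (the last column) is the dependent variable.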

* [Splitting data in dependent and independent variables](https://www.pluralsight.com/guides/importing-and-splitting-data-into-dependent-and-independent-features-for-ml)

In [None]:
# How many distinct values does "Market Category" take?
len(df["Market Category"].unique())

70

In [None]:
# How many distinct values does "Engine Fuel Type" take?
len(df["Engine Fuel Type"].unique())

8

In [None]:
# Look at the first five rows again before removing columns.
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [None]:
# Remove the two categorical columns that are not used as features.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it no longer raises KeyError for already-removed columns.
df = df.drop(columns=["Market Category", "Engine Fuel Type"], errors="ignore")

In [None]:
# Extract the independent variables (inputs) into X: every column except the
# first and the last one.
# NOTE(review): starting at position 1 leaves out column 0 ("Make" per df.info())
# - confirm that excluding it is intended.
# .copy() makes X an independent frame, so the column assignments in the
# encoding cells below do not write into a slice of df (SettingWithCopyWarning).
X = df.iloc[:, 1:-1].copy()

In [None]:
# The dependent variable (output) is the right-most column of df.
y = df.iloc[:, df.shape[1] - 1]

# Encoding Categorical Data
---
### Label Encoding

A model needs purely numerical input, so every string-typed (categorical) column in the dataset has to be converted to numbers before training. Label encoding is one way to do this.

In label encoding, all the categories of a column are listed in a numpy array, and each category in that column is replaced by its index in that array.

`preprocessing.LabelEncoder()` of sklearn is used for label encoding

---
### One Hot Encoding

The main drawback of label encoding is that it introduces an artificial ordering (hierarchy) among the categories, which is unwanted most of the time.

To overcome this drawback one hot encoding is used.

One hot encoding takes a column of categorical data (it does not need to be label encoded first) and splits it into multiple columns, one per category. Each row gets a 1 in the column of its own category and 0s in the others.

Further reading -
* [Feature Engineering](https://www.youtube.com/watch?v=6WDFfaYtN6s)
* [Encoding Data](https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/)

In [None]:
# List the distinct categories found in the 'Vehicle Style' feature.
X.loc[:, "Vehicle Style"].unique()

array(['Coupe', 'Convertible', 'Sedan', 'Wagon', '4dr Hatchback',
       '2dr Hatchback', '4dr SUV', 'Crew Cab Pickup',
       'Extended Cab Pickup', 'Regular Cab Pickup', 'Passenger Minivan',
       'Cargo Minivan', '2dr SUV', 'Convertible SUV', 'Cargo Van',
       'Passenger Van'], dtype=object)

In [None]:
# List the distinct categories found in the 'Driven_Wheels' feature.
X.loc[:, "Driven_Wheels"].unique()

array(['rear wheel drive', 'front wheel drive', 'all wheel drive',
       'four wheel drive'], dtype=object)

In [None]:
# List the distinct categories found in the 'Transmission Type' feature.
X.loc[:, "Transmission Type"].unique()

array(['MANUAL', 'AUTOMATIC', 'AUTOMATED_MANUAL', 'DIRECT_DRIVE',
       'UNKNOWN'], dtype=object)

In [None]:
# List the distinct categories found in the 'Vehicle Size' feature.
X.loc[:, "Vehicle Size"].unique()

array(['Compact', 'Midsize', 'Large'], dtype=object)

In [None]:
# Number of distinct categories in 'Vehicle Size', shown as a 1-tuple like an array shape.
(len(X["Vehicle Size"].unique()),)

(3,)

In [None]:
# Number of distinct categories in 'Model', shown as a 1-tuple like an array shape.
(len(X["Model"].unique()),)

(713,)

In [None]:
from sklearn.preprocessing import LabelEncoder

# A LabelEncoder remembers only the categories of its most recent fit, so each
# column gets its own encoder. Re-fitting one shared encoder would throw away
# the 'Vehicle Style' mapping, making it impossible to inverse_transform that
# column or to encode new data consistently.
vehicleStyleEncoder = LabelEncoder()
X["Vehicle Style"] = vehicleStyleEncoder.fit_transform(X["Vehicle Style"])

# encoderOne ends up fitted on 'Model', the same final state it had before.
encoderOne = LabelEncoder()
X["Model"] = encoderOne.fit_transform(X["Model"])

In [None]:
# Check the label-encoded 'Model' and 'Vehicle Style' columns on the first five rows.
X.head(n=5)

Unnamed: 0,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity
0,1,2011,335.0,6.0,MANUAL,rear wheel drive,2.0,Compact,8,26,19,3916
1,0,2011,300.0,6.0,MANUAL,rear wheel drive,2.0,Compact,6,28,19,3916
2,0,2011,300.0,6.0,MANUAL,rear wheel drive,2.0,Compact,8,28,20,3916
3,0,2011,230.0,6.0,MANUAL,rear wheel drive,2.0,Compact,8,28,18,3916
4,0,2011,230.0,6.0,MANUAL,rear wheel drive,2.0,Compact,6,28,18,3916


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense array. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the current name first and fall back only on older releases.
try:
    encoderTwo = OneHotEncoder(sparse_output=False)
except TypeError:
    encoderTwo = OneHotEncoder(sparse=False)

# One indicator column per category of the two features.
# Reusing X's index keeps each encoded row aligned with its source row; X's
# index has gaps after the duplicate/NaN removal, so a default 0..n-1 index
# would not line up if the two frames are later joined.
encodedTT = pd.DataFrame(
    encoderTwo.fit_transform(X[["Transmission Type", "Vehicle Size"]]),
    index=X.index,
)

In [None]:
# Label the indicator columns as "<feature>_<category>".
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one only on releases that predate it.
if hasattr(encoderTwo, "get_feature_names_out"):
    encodedTT.columns = encoderTwo.get_feature_names_out(["Transmission Type", "Vehicle Size"])
else:
    encodedTT.columns = encoderTwo.get_feature_names(["Transmission Type", "Vehicle Size"])

In [None]:
# Preview the first five rows of the one-hot encoded frame.
encodedTT.head(n=5)

Unnamed: 0,Transmission Type_AUTOMATED_MANUAL,Transmission Type_AUTOMATIC,Transmission Type_DIRECT_DRIVE,Transmission Type_MANUAL,Transmission Type_UNKNOWN,Vehicle Size_Compact,Vehicle Size_Large,Vehicle Size_Midsize
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [None]:
# Replace the three remaining string columns with indicator (dummy) columns.
# drop_first=True omits the first category of each feature, so k categories
# become k-1 columns.
X = pd.get_dummies(
    data=X,
    columns=["Driven_Wheels", "Transmission Type", "Vehicle Size"],
    drop_first=True,
)

In [None]:
# Confirm that every remaining column is numeric after dummy encoding.
X.head(n=5)

Unnamed: 0,Model,Year,Engine HP,Engine Cylinders,Number of Doors,Vehicle Style,highway MPG,city mpg,Popularity,Driven_Wheels_four wheel drive,Driven_Wheels_front wheel drive,Driven_Wheels_rear wheel drive,Transmission Type_AUTOMATIC,Transmission Type_DIRECT_DRIVE,Transmission Type_MANUAL,Transmission Type_UNKNOWN,Vehicle Size_Large,Vehicle Size_Midsize
0,1,2011,335.0,6.0,2.0,8,26,19,3916,0,0,1,0,0,1,0,0,0
1,0,2011,300.0,6.0,2.0,6,28,19,3916,0,0,1,0,0,1,0,0,0
2,0,2011,300.0,6.0,2.0,8,28,20,3916,0,0,1,0,0,1,0,0,0
3,0,2011,230.0,6.0,2.0,8,28,18,3916,0,0,1,0,0,1,0,0,0
4,0,2011,230.0,6.0,2.0,6,28,18,3916,0,0,1,0,0,1,0,0,0


# Splitting dataset into training and test set

We use the `train_test_split` function from sklearn to divide our dataset into training and testing parts. This lets the model learn from the larger chunk of data, after which we can test its accuracy on the smaller chunk that it has not seen during training.

We import `train_test_split` with the following command -

`from sklearn.model_selection import train_test_split`

When splitting a dataset there are some competing concerns:
* If you have less training data, your parameter estimates have greater variance.
* And if you have less testing data, your performance statistic will have greater variance.
* The data should be divided in such a way that neither of them is too high, which is more dependent on the amount of data you have. 
* If your data is too small then no split will give you satisfactory variance so you will have to do cross-validation. 
* If your data is huge then it doesn’t really matter whether you choose an 80:20 split or a 90:10 split 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Print the shape of each of the four pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(6188, 18) (1547, 18) (6188,) (1547,)



# Feature scaling

Feature scaling refers to putting the values in the same range or same scale so that no variable is dominated by the other.

Numerical data in the dataset can have widely varying ranges, i.e. one parameter may lie between 1 and 10 for all records whereas another may lie between 1000 and 5000. Although the data is logically correct, many algorithms will treat the features with higher magnitude as the key parameters simply because of their scale.

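To avoid such situations, feature scaling is performed using statistical techniques such as Min-Max scaling and Mean normalization. This creates a common range for all the parameters and thus removes this algorithmic bias. The scaling parameters should be learned from the training set only and then applied unchanged to the test set.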

References -

* [Standardization and Normalization](https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/)

* [Difference between Standardization and Normalization](https://www.youtube.com/watch?v=mnKm3YP56PY)

### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Standardise each feature to zero mean and unit variance.
scaler = StandardScaler()

# Learn the per-feature mean and standard deviation from the training set only.
X_train_scaler = scaler.fit_transform(X_train)

# Apply those same training statistics to the test set. Calling fit_transform
# here would re-fit the scaler on the test rows, so train and test would be
# scaled differently and test-set information would leak into preprocessing.
X_test_scaler = scaler.transform(X_test)

In [None]:
# Wrap the standardised training array in a DataFrame for a readable display.
pd.DataFrame(data=X_train_scaler)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-0.822678,0.620794,1.900154,1.132787,-1.573338,0.096090,-0.216847,-0.615204,-0.694116,-0.301936,-0.718537,-0.659595,0.622950,-0.0422,-0.508168,-0.012713,-0.559432,1.244849
1,-1.628071,0.779914,0.188755,0.086487,0.651737,1.296103,0.508320,-0.067884,-0.350775,-0.301936,-0.718537,1.516083,0.622950,-0.0422,-0.508168,-0.012713,1.787527,-0.803310
2,-0.554214,-0.174806,-0.453020,0.086487,0.651737,-0.903921,-0.796980,-0.615204,-0.752617,-0.301936,1.391717,-0.659595,0.622950,-0.0422,-0.508168,-0.012713,-0.559432,1.244849
3,-1.642201,0.461674,-0.803857,-0.959812,0.651737,1.296103,1.813620,1.163585,-0.648152,-0.301936,1.391717,-0.659595,0.622950,-0.0422,-0.508168,-0.012713,-0.559432,-0.803310
4,-0.087934,-0.015686,-0.957883,-0.959812,0.651737,-0.903921,-0.216847,-0.067884,-0.721277,-0.301936,-0.718537,1.516083,0.622950,-0.0422,-0.508168,-0.012713,-0.559432,-0.803310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6183,0.533773,0.461674,1.215594,0.086487,0.651737,1.296103,0.073220,-0.341544,0.138122,-0.301936,-0.718537,1.516083,-1.605266,-0.0422,-0.508168,-0.012713,1.787527,-0.803310
6184,-0.318719,0.620794,1.181366,1.132787,0.651737,-0.903921,-1.812214,-0.888864,-0.626562,3.311956,-0.718537,-0.659595,0.622950,-0.0422,-0.508168,-0.012713,-0.559432,1.244849
6185,-0.883907,-1.288645,-0.838085,-0.959812,-1.573338,-1.503927,0.508320,0.205776,0.358195,-0.301936,1.391717,-0.659595,-1.605266,-0.0422,1.967852,-0.012713,-0.559432,-0.803310
6186,-0.558924,0.620794,-1.197479,-0.959812,0.651737,-0.903921,0.943420,0.753095,-0.948315,-0.301936,1.391717,-0.659595,0.622950,-0.0422,-0.508168,-0.012713,-0.559432,-0.803310


In [None]:
# Wrap the standardised test array in a DataFrame for a readable display.
pd.DataFrame(data=X_test_scaler)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-0.358920,0.773240,-0.239977,-0.977660,0.625928,-0.951344,0.031223,0.526783,-0.612749,-0.288675,-0.709508,-0.674770,0.578594,-0.050915,-0.482358,-0.035979,-0.560677,1.229375
1,-0.817886,0.606823,2.223958,1.208737,-1.649773,0.064903,-0.157088,-0.630587,-0.698368,-0.288675,-0.709508,-0.674770,0.578594,-0.050915,-0.482358,-0.035979,-0.560677,1.229375
2,0.170295,0.107574,-1.167363,-0.977660,0.625928,-1.154594,0.502001,0.671454,-0.758302,-0.288675,1.409428,-0.674770,0.578594,-0.050915,-0.482358,-0.035979,-0.560677,-0.813421
3,-1.309634,0.606823,2.942227,1.208737,0.625928,1.284400,-0.251244,-0.630587,1.724650,-0.288675,-0.709508,-0.674770,0.578594,-0.050915,-0.482358,-0.035979,1.783559,-0.813421
4,1.411375,0.440407,1.396585,1.208737,-1.649773,0.064903,-0.533711,-0.775258,-0.884590,-0.288675,-0.709508,1.481985,-1.728327,-0.050915,-0.482358,-0.035979,-0.560677,-0.813421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,-0.761686,0.440407,1.623885,1.208737,-1.649773,-0.341596,0.219534,-0.341245,-0.081198,-0.288675,-0.709508,1.481985,-1.728327,-0.050915,2.073149,-0.035979,-0.560677,-0.813421
1543,-1.112935,0.440407,-0.603658,-0.977660,-1.649773,-0.341596,0.407845,0.526783,-0.446506,-0.288675,1.409428,-0.674770,-1.728327,-0.050915,2.073149,-0.035979,-0.560677,-0.813421
1544,1.102275,0.773240,0.078243,0.115538,-1.649773,1.081150,-0.439555,-0.341245,-0.081198,3.464102,-0.709508,-0.674770,0.578594,-0.050915,-0.482358,-0.035979,1.783559,-0.813421
1545,-1.609367,-2.222257,-1.049167,0.115538,-1.649773,-1.561092,-0.439555,-0.341245,-0.758302,-0.288675,1.409428,-0.674770,-1.728327,-0.050915,2.073149,-0.035979,-0.560677,-0.813421


### Normalization

In [None]:
# Rescale each feature using the minimum and maximum seen in the training set.
min_max_scaler = MinMaxScaler()

# Learn the per-feature min and max from the training set only.
X_train_minmax = min_max_scaler.fit_transform(X_train)

# Reuse the training min/max for the test set. Re-fitting on the test rows
# would map train and test onto different scales and leak test-set
# information. Test values outside the training range may therefore fall
# slightly outside [0, 1], which is expected.
X_test_minmax = min_max_scaler.transform(X_test)

In [None]:
# Wrap the min-max scaled training array in a DataFrame for a readable display.
pd.DataFrame(data=X_train_minmax)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.259831,0.962963,0.470402,0.500,0.0,0.533333,0.131313,0.061538,0.091600,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.019663,1.000000,0.258985,0.375,1.0,0.933333,0.181818,0.092308,0.178780,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.339888,0.777778,0.179704,0.375,1.0,0.200000,0.090909,0.061538,0.076746,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.015449,0.925926,0.136364,0.250,1.0,0.933333,0.272727,0.161538,0.103271,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.478933,0.814815,0.117336,0.250,1.0,0.200000,0.131313,0.092308,0.084704,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6183,0.664326,0.925926,0.385835,0.375,1.0,0.933333,0.151515,0.076923,0.302918,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6184,0.410112,0.962963,0.381607,0.500,1.0,0.200000,0.020202,0.046154,0.108753,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
6185,0.241573,0.518519,0.132135,0.250,0.0,0.000000,0.181818,0.107692,0.358798,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6186,0.338483,0.962963,0.087738,0.250,1.0,0.200000,0.212121,0.138462,0.027056,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Wrap the min-max scaled test array in a DataFrame for a readable display.
pd.DataFrame(data=X_test_minmax)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.398031,1.000000,0.200213,0.250,1.0,0.200000,0.043860,0.125000,0.103467,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.260197,0.962963,0.488818,0.500,0.0,0.533333,0.038012,0.058333,0.082023,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.556962,0.851852,0.091587,0.250,1.0,0.133333,0.058480,0.133333,0.067012,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.112518,0.962963,0.572950,0.500,1.0,0.933333,0.035088,0.058333,0.688885,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.929677,0.925926,0.391906,0.500,0.0,0.533333,0.026316,0.050000,0.035382,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,0.277075,0.925926,0.418530,0.500,0.0,0.400000,0.049708,0.075000,0.236598,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1543,0.171589,0.925926,0.157614,0.250,0.0,0.400000,0.055556,0.125000,0.145104,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1544,0.836850,1.000000,0.237487,0.375,0.0,0.866667,0.029240,0.075000,0.236598,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1545,0.022504,0.333333,0.105431,0.375,0.0,0.000000,0.029240,0.075000,0.067012,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
