# **Data Preparation**

In [None]:
#importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Load the Auto MPG data; '?' in the raw file is read as a missing value.
df = pd.read_csv('auto-mpg.csv', na_values='?', skipinitialspace=True)
# The 'car name' column is dropped up front and not used below.
df = df.drop(columns='car name')
data = df.copy()

# One stratified 80/20 split, stratified on 'cylinders'.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row numbers, so rows are selected with .iloc;
# .loc only gives the same rows while the frame keeps its default RangeIndex.
train_index, test_index = next(split.split(data, data['cylinders']))
strat_train_data = data.iloc[train_index]
strat_test_data = data.iloc[test_index]

In [None]:
# Target first, then the predictors: 'mpg' is the label, everything else
# in the training split is a feature.
labels = strat_train_data['mpg'].copy()
data = strat_train_data.drop(columns='mpg')
data

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
145,4,83.0,61.0,2003,19.0,74,3
151,4,79.0,67.0,2000,16.0,74,2
388,4,156.0,92.0,2585,14.5,82,1
48,6,250.0,88.0,3139,14.5,71,1
114,4,98.0,90.0,2265,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108,15.5,74,2
156,8,400.0,170.0,4668,11.5,75,1
395,4,135.0,84.0,2295,11.6,82,1
14,4,113.0,95.0,2372,15.0,70,3


**Mapping origin column to country names**

In [None]:
# Labels for the numeric origin codes.
# NOTE(review): these names look like placeholders — the UCI auto-mpg
# description is usually read as 1=USA, 2=Europe, 3=Japan; confirm before
# interpreting the encoded columns.
origin_names = {1: 'India', 2: 'USA', 3: 'Germany'}

# Values that are not keys of origin_names (including labels written by an
# earlier run of this cell) are passed through unchanged, so re-running the
# cell no longer turns the whole column into NaN as a bare .map(dict) would.
data['origin'] = data['origin'].map(lambda code: origin_names.get(code, code))
data_transform = data.copy()
data_transform.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
145,4,83.0,61.0,2003,19.0,74,Germany
151,4,79.0,67.0,2000,16.0,74,USA
388,4,156.0,92.0,2585,14.5,82,India
48,6,250.0,88.0,3139,14.5,71,India
114,4,98.0,90.0,2265,15.5,73,USA


In [None]:
# Pull out the categorical column; selecting with a list keeps the result
# a one-column DataFrame rather than a Series.
categorical_data = data_transform.loc[:, ['origin']]
categorical_data.head()

Unnamed: 0,origin
145,Germany
151,USA
388,India
48,India
114,USA


**One Hot Encoding of origin**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the categories of 'origin', then encode the same rows.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(categorical_data)
data_cat_one_hot = one_hot_encoder.transform(categorical_data)
# The encoder returns a sparse matrix; densify only the first five rows for display.
data_cat_one_hot[:5].toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

**Handling Missing values using SimpleImputer**

In [None]:
# Select the numeric features by dropping the categorical column by name,
# instead of relying on 'origin' happening to be the last column.
numerical_data = data.drop(columns=['origin'])
numerical_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     318 non-null    int64  
 1   displacement  318 non-null    float64
 2   horsepower    314 non-null    float64
 3   weight        318 non-null    int64  
 4   acceleration  318 non-null    float64
 5   model year    318 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 17.4 KB


In [None]:
from sklearn.impute import SimpleImputer

# Build the median imputer and fit it on the numeric columns in one step;
# fit() returns the estimator itself, which is then displayed.
imputer = SimpleImputer(strategy='median').fit(numerical_data)
imputer

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [None]:
# Per-column fill values learned by fit(); with strategy='median' these are
# the column medians, listed in the column order of numerical_data.
imputer.statistics_

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [None]:
# Fill the missing entries with the medians learned above. The result is a
# plain NumPy array (see the output), so column names and the row index are
# not carried over.
X = imputer.transform(numerical_data)
X

array([[   4. ,   83. ,   61. , 2003. ,   19. ,   74. ],
       [   4. ,   79. ,   67. , 2000. ,   16. ,   74. ],
       [   4. ,  156. ,   92. , 2585. ,   14.5,   82. ],
       ...,
       [   4. ,  135. ,   84. , 2295. ,   11.6,   82. ],
       [   4. ,  113. ,   95. , 2372. ,   15. ,   70. ],
       [   6. ,  146. ,  120. , 2930. ,   13.8,   81. ]])

In [None]:
# Wrap the imputed array in a DataFrame again, restoring the column labels
# and row index of the frame it was computed from.
data_transform = pd.DataFrame(
    data=X,
    index=numerical_data.index,
    columns=numerical_data.columns,
)
data_transform.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     318 non-null    float64
 1   displacement  318 non-null    float64
 2   horsepower    318 non-null    float64
 3   weight        318 non-null    float64
 4   acceleration  318 non-null    float64
 5   model year    318 non-null    float64
dtypes: float64(6)
memory usage: 17.4 KB


**Adding new attributes**

In [None]:
# Look at the numeric columns and their order: the positions seen here are
# what the hard-coded indices acc_ix, hpower_ix and cyl_ix in the next cell refer to.
numerical_data.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year
145,4,83.0,61.0,2003,19.0,74
151,4,79.0,67.0,2000,16.0,74
388,4,156.0,92.0,2585,14.5,82
48,6,250.0,88.0,3139,14.5,71
114,4,98.0,90.0,2265,15.5,73


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions inside the numeric feature matrix, following the column
# order of numerical_data: cylinders=0, horsepower=2, acceleration=4.
acc_ix, hpower_ix, cyl_ix = 4, 2, 0


class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Append acceleration-based ratio columns to a numeric feature matrix.

    Always appends acceleration / cylinders as the last column. When
    ``acc_on_power`` is true, acceleration / horsepower is appended as well,
    placed before the acceleration / cylinders column.

    Columns are looked up by the module-level positions ``acc_ix``,
    ``hpower_ix`` and ``cyl_ix``, so the input must keep that column order.

    Parameters
    ----------
    acc_on_power : bool, default=True
        Whether to also add the acceleration / horsepower column.
    """

    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Numeric feature matrix; a NumPy array or anything
            ``np.asarray`` can convert (for example a DataFrame).

        Returns
        -------
        numpy.ndarray
            ``X`` plus one extra column (two when ``acc_on_power`` is true).
        """
        # Positional X[:, i] indexing below needs an ndarray; converting here
        # lets DataFrame input work too and is a no-op for arrays.
        X = np.asarray(X)
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]

        return np.c_[X, acc_on_cyl]
    
# Add both ratio columns to the imputed numeric features.
attr_adder = CustomAttrAdder(acc_on_power=True)
data_tr_extra_attrs = attr_adder.transform(data_transform.values)
# Show the first row: six original values followed by the two added ratios.
# Without this expression the cell ends in an assignment and displays nothing.
data_tr_extra_attrs[0]

array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

**Creating pipeline for numerical attributes**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Restrict the frame to its float64/int64 columns.
numerics = ['float64', 'int64']
numerical_data = data_transform.select_dtypes(include=numerics)

# Numeric preprocessing, applied in order:
# median imputation -> extra ratio attributes -> standardisation.
pipeline_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attrs_adder', CustomAttrAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=pipeline_steps)

# Fit every step on the numeric data and show the first transformed row.
num_data_tr = num_pipeline.fit_transform(numerical_data)
num_data_tr[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

**Transforming Numerical and Categorical Attributes**

In [None]:
##Transform different columns or subsets using ColumnTransformer
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the combined transformer.
cat_attrs = ["origin"]
num_attrs = numerical_data.columns.tolist()

# One transformer for the whole frame: numeric columns go through
# num_pipeline, the categorical column is one-hot encoded.
column_branches = [
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs),
]
full_pipeline = ColumnTransformer(transformers=column_branches)

# Fit on the training features and show the first prepared row.
prepared_data = full_pipeline.fit_transform(data)
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])