# End To End Machine Learning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Preprocessing

In [2]:
data = pd.read_csv('..\data\car-sales-extended-missing-data.csv')
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


check data type and missing value

In [3]:
print(data.dtypes) 
print(data.isna().sum())

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object
Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64


Let's drop Price(Lebels) 50 rows that missing 

In [4]:
data.dropna(subset=['Price'], axis=0, inplace=True)
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

> some data have overlaping then some rows droped.

Before we handle with missing value or scaling the data we must split to Train and Test set because **we dont' want estimators see the data before predict i.e. data leaked**.

In [5]:
from sklearn.model_selection import train_test_split

X = data.drop('Price', axis=1)
y = data['Price'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

### Feature Scaling
* Normalization (also called min-max scaling) - This rescales all the numerical values to between 0 and 1
* Standardization - This subtracts the mean value from all of the features (so the resulting features have 0 mean)

making sure all of your numerical data is on the same scale,  A machine learning algorithm may have trouble finding patterns in these wide-ranging variables.

> Feature scaling usually isn't required for your target variable.

### Handling text and Categorical attributes
convert from text to number ScikitLearn's OrdinalEncoder or OneHotEncoder

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

"""
fill missing cat_artribs with most_frequent 
num artrib with mean
I want to change 'Make' column from OrdinalEncoder
and other columns with OneHotEncoder
and scaling numerical data
"""

make_artrib = ['Make']
cat_artribs = ['Colour', 'Doors']
num_artribs = ['Odometer (KM)']

make_pipe = Pipeline([ 
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder',OrdinalEncoder())
])

cat_pipe = Pipeline([ 
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

num_pipe = Pipeline([ 
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

full_pipeline = ColumnTransformer([ 
    ('make_artribs',make_pipe,make_artrib),
    ('cat_artribs',cat_pipe,cat_artribs),
    ('num_artribs',num_pipe,num_artribs),
], remainder='passthrough')

X_train_transformed = full_pipeline.fit_transform(X_train)
X_test_transformed = full_pipeline.transform(X_test)

In [7]:
X_train[:2]

Unnamed: 0,Make,Colour,Odometer (KM),Doors
986,Honda,White,71934.0,4.0
297,Toyota,Red,162665.0,4.0


In [8]:
X_train_transformed[:2]

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        , 0.        , 0.25775097],
       [3.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        , 0.        , 0.636251  ]])

> alrigth we transform the data, **fit_transform** is we fit the scale from training set and transfrom that scale to testing set.

## Select and Train Model

In [9]:
from sklearn.ensemble import RandomForestRegressor

rdf = RandomForestRegressor()
rdf.fit(X_train_transformed, y_train)
print('R^2 of training set = {:.4f}'.format(rdf.score(X_train_transformed, y_train)))
print('R^2 of testing set = {:.4f}'.format(rdf.score(X_test_transformed, y_test)))

R^2 of training set = 0.8818
R^2 of testing set = 0.2078


> our model kinda overfitting : **overfit with training set and perform bad with testing set**

> the score is R^2 (coefficient determination) on Regression model : with range -inf to 1.0 , if model predict the mean of the target score would be 0.

In [10]:
from sklearn.metrics import mean_absolute_error

y_preds = rdf.predict(X_test_transformed)
print('mae of testing set = {}'.format(mean_absolute_error(y_test, y_preds)))

mae of testing set = 5818.1002340152545


In [11]:
np.sum(np.abs(y_test - y_preds)) / len(y_test)

5818.1002340152545

> mean_absolute_error : sum of | Ytrue - Ypred | / len(y_true)

In [12]:
from sklearn.model_selection import cross_val_score

np.random.seed(42)
rdf = RandomForestRegressor()
score = cross_val_score(rdf, X_train_transformed, y_train, cv=5, scoring='neg_mean_absolute_error')
print(-score)
print(np.mean(-score))

[6101.20436431 6060.62646583 6452.71631988 5169.95304393 6051.76696006]
5967.253430801107


> *cross val score* : seperate data to n (cv=n) and train the others, ex. cv=5, 1st train 1-4 predict on 5, 2nd 2-5 predict on 1 ...., score is R^2 on regression and mean accurracy on classifier.

> The fitting will be done inside the cross_val_score function, you don't need to worry about this beforehand.

### Hyperparameter tuning
* RandomizedSearchCV : Random hyperparams with n_iter estimators
* GridSearchCV : fit all hyperparams that we set

In [13]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'n_estimators':[10,50,100,150],
    'max_depth':[None, 10,30],
    'max_features':['auto', 'sqrt']
}

np.random.seed(42)
clf = RandomForestRegressor(n_jobs=-1) # -1 = use all of CPU
Rs_clf = RandomizedSearchCV(clf, params,n_iter=20, cv=5,verbose=0) # fit 20 hyparams with each 5 times
Rs_clf.fit(X_train_transformed, y_train);

In [14]:
Rs_clf.best_params_, Rs_clf.best_score_

({'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 10},
 0.28835162740476894)

We can get score or make prediction.

In [15]:
Rs_clf.score(X_test_transformed, y_test)

0.18336812161358862

In [16]:
from sklearn.model_selection import GridSearchCV

np.random.seed(42)
clf = RandomForestRegressor(n_jobs=-1)
Gs_clf = GridSearchCV(clf,params,cv=5,verbose=0)
Gs_clf.fit(X_train_transformed, y_train);

In [17]:
Gs_clf.best_params_, Gs_clf.best_score_

({'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 50},
 0.28819863533008794)

## Saving and Loading models
* pickle
* joblib

when handle with a large data **Joblib** is perform better.

In [18]:
import pickle

# save model and pipeline
pickle.dump((rdf,full_pipeline), open('model_1.pkl', 'wb')) # write binary
rdf.score(X_test_transformed, y_test)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
loaded_model = pickle.load(open('model_1.pkl', 'rb')) # read binary
loaded_model[0].score(X_test_transformed, y_test)

0.19986344207016027

In [None]:
X_test.iloc[0]

Make              Toyota
Colour              Blue
Odometer (KM)    99761.0
Doors                4.0
Name: 203, dtype: object

In [None]:
X_test_transformed[2]

array([1.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.78225537])

In [None]:
loaded_model[1].transform(X_test)[2]

array([1.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.78225537])

In [None]:
from joblib import dump, load

dump((rdf, full_pipeline),'model_1.joblib')

['model_1.joblib']

In [None]:
loaded_model_2 = load('model_1.joblib')
loaded_model_2[0].score(X_test_transformed, y_test)

0.19986344207016027

In [None]:
loaded_model_2[1].transform(X_test)[2]

array([1.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.78225537])