<a href="https://colab.research.google.com/github/NetoRibeiro/DATA8001Assignment1/blob/main/Linear_Regression_Model_01042021_005_Pickle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Import and Load File

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
import pandas as pd
from datetime import datetime as dt

import calendar
import re
import string

import math
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns

In [2]:
df_processed = pd.read_csv('/content/drive/MyDrive/Data Science and Analytics/R00206995/data/R00206995_processed.csv')

In [3]:
df_processed.head()

Unnamed: 0,car_reg,purchase_date,year,month,county,make,model,type,colour,tax_band,price
0,202-C-2315,2020-07-01,2020,7,CORK,AUDI,A4,SALOON,RED,B,55287.0
1,191-C-3750,2019-01-20,2019,1,CORK,MAZDA,CX-30,SUV,SILVER,B,41690.0
2,191-L-3155,2019-03-21,2019,3,LIMERICK,BMW,3 SERIES,SALOON,WHITE,C,40381.0
3,191-D-2645,2019-01-26,2019,1,DUBLIN,AUDI,Q3,SUV,ORANGE,C,44836.0
4,181-W-341,2018-01-27,2018,1,WATERFORD,OPEL,ASTRA,HATCHBACK,RED,A,32188.0


###Transformation and Split

In [4]:
def get_feature_stats(df, list_columns):
    lis_of_string_features = []
    for feature in list_columns:
      if df[feature].dtype not in ['uint8', 'int16','int32','int64','float16','float32','float64']:
        lis_of_string_features.append(feature)
    return lis_of_string_features

In [5]:
# Create a list of columns to apply Dummies
list_of_best_features = ['make', 'model', 'county', 'type', 'tax_band']

# Apply Lower case before dummies
df_processed['county'] = df_processed['county'].str.lower()
df_processed['type'] = df_processed['type'].str.lower()
df_processed['tax_band'] = df_processed['tax_band'].str.lower()
df_processed['make'] = df_processed['make'].str.lower()
df_processed['model'] = df_processed['model'].str.lower()

# Apply Dammies
df_dummies = pd.get_dummies(df_processed, columns=list_of_best_features)
df_dummies.drop(['car_reg', 'purchase_date', 'month'], axis=1, inplace=True)
df_dummies.head(3)

Unnamed: 0,year,colour,price,make_audi,make_bmw,make_ford,make_mazda,make_mercedes,make_opel,make_toyota,model_2 series,model_3 series,model_5 series,model_7 series,model_a-class,model_a4,model_a6,model_amg-gt,model_astra,model_c-class,model_camery,model_corolla,model_corsa,model_crossland x,model_cx-30,model_cx-5,model_fiesta,model_focus,model_glc-class,model_grandland x,model_insignia,model_kuga,model_mazda3,model_mazda6,model_mondeo,model_mustang,model_prius,model_q3,model_q7,model_rav4,model_x3,model_yaris,county_cork,county_dublin,county_galway,county_limerick,county_waterford,type_coupe,type_estate,type_hatchback,type_saloon,type_suv,tax_band_a,tax_band_b,tax_band_c,tax_band_d,tax_band_e
0,2020,RED,55287.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,2019,SILVER,41690.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0
2,2019,WHITE,40381.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0


In [6]:
list_columns = df_dummies.columns
list_feature = get_feature_stats(df_dummies, list_columns)

In [7]:
df_train, df_test = train_test_split(df_dummies.copy() ,test_size=0.2, random_state=8001)

In [8]:
#Convert all string features to a int features
for feature in list_feature:
  label_condition = LabelEncoder()
  df_train[feature] = label_condition.fit_transform(df_train[feature])
  df_test[feature] = label_condition.fit_transform(df_test[feature])

In [9]:
df_train.shape, df_test.shape

((3040, 57), (760, 57))

In [10]:
# X variables (independent)
output_feature_dummies = 'price'
input_features_dummies = [item for item in df_train.columns if item not in output_feature_dummies]
print(input_features_dummies)

['year', 'colour', 'make_audi', 'make_bmw', 'make_ford', 'make_mazda', 'make_mercedes', 'make_opel', 'make_toyota', 'model_2 series', 'model_3 series', 'model_5 series', 'model_7 series', 'model_a-class', 'model_a4', 'model_a6', 'model_amg-gt', 'model_astra', 'model_c-class', 'model_camery', 'model_corolla', 'model_corsa', 'model_crossland x', 'model_cx-30', 'model_cx-5', 'model_fiesta', 'model_focus', 'model_glc-class', 'model_grandland x', 'model_insignia', 'model_kuga', 'model_mazda3', 'model_mazda6', 'model_mondeo', 'model_mustang', 'model_prius', 'model_q3', 'model_q7', 'model_rav4', 'model_x3', 'model_yaris', 'county_cork', 'county_dublin', 'county_galway', 'county_limerick', 'county_waterford', 'type_coupe', 'type_estate', 'type_hatchback', 'type_saloon', 'type_suv', 'tax_band_a', 'tax_band_b', 'tax_band_c', 'tax_band_d', 'tax_band_e']


In [11]:
# Apply Standar Scaler for a list of features
list_of_features_standard = ['year', 'colour']
for column in df_train.columns:

  if column in list_of_features_standard :
    scaler_column = StandardScaler()
    scaler_column.fit(df_train[column].values.reshape(-1, 1))
    df_train[column] = scaler_column.transform(df_train[column].values.reshape(-1, 1))
    df_test[column] = scaler_column.transform(df_test[column].values.reshape(-1, 1))
  
  else:
    print(f'skip:\t{column}')

skip:	price
skip:	make_audi
skip:	make_bmw
skip:	make_ford
skip:	make_mazda
skip:	make_mercedes
skip:	make_opel
skip:	make_toyota
skip:	model_2 series
skip:	model_3 series
skip:	model_5 series
skip:	model_7 series
skip:	model_a-class
skip:	model_a4
skip:	model_a6
skip:	model_amg-gt
skip:	model_astra
skip:	model_c-class
skip:	model_camery
skip:	model_corolla
skip:	model_corsa
skip:	model_crossland x
skip:	model_cx-30
skip:	model_cx-5
skip:	model_fiesta
skip:	model_focus
skip:	model_glc-class
skip:	model_grandland x
skip:	model_insignia
skip:	model_kuga
skip:	model_mazda3
skip:	model_mazda6
skip:	model_mondeo
skip:	model_mustang
skip:	model_prius
skip:	model_q3
skip:	model_q7
skip:	model_rav4
skip:	model_x3
skip:	model_yaris
skip:	county_cork
skip:	county_dublin
skip:	county_galway
skip:	county_limerick
skip:	county_waterford
skip:	type_coupe
skip:	type_estate
skip:	type_hatchback
skip:	type_saloon
skip:	type_suv
skip:	tax_band_a
skip:	tax_band_b
skip:	tax_band_c
skip:	tax_band_d
skip:	t

###Linear Regressor Model
####Standard Scaler plus Best Features Dummies

In [13]:
X_train = df_train[input_features_dummies].values
X_test = df_test[input_features_dummies].values

y_train = df_train[[output_feature_dummies]].values
y_test = df_test[[output_feature_dummies]].values

In [14]:
lr_scaler_model = LinearRegression()
lr_scaler_model.fit(X=X_train, y=y_train)
#lr_scaler_model_prodict = lr_scaler_model.predict(X=X_test)
#lr_scaler_model_mse = mean_squared_error(y_pred=lr_scaler_model_prodict, y_true=y_test)
#lr_scaler_model_r_sq = r2_score(y_pred=lr_scaler_model_prodict, y_true=y_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

###Valitation Test:: Linear Regressor Model Standard Scaler and Best Features Dummies

In [15]:
#print(f'Linear Model StandardScaler Features Accuracy:\nRMSE={math.sqrt(lr_scaler_model_mse):,.0f}\nRSq={lr_scaler_model_r_sq:.2f} or {lr_scaler_model_r_sq*100:.0f}% of the variability in Y can be explained using X')

In [16]:
lr_scaler_model.coef_

array([[ 3.72835543e+02, -2.93646160e+02,  1.31884398e+15,
         2.82498341e+15,  2.68480241e+15,  3.90547838e+15,
         3.04557804e+15,  2.01660707e+15,  3.58547798e+14,
        -1.02337670e+14, -5.16377192e+14, -5.16377192e+14,
        -5.16377192e+14, -7.36971820e+14,  9.89762236e+14,
         9.89762236e+14, -7.36971820e+14,  2.91999154e+14,
        -7.36971820e+14,  1.95005842e+15,  1.95005842e+15,
         2.91999154e+14, -4.25199448e+14, -2.31407076e+15,
        -2.31407076e+15, -3.76196193e+14, -3.76196193e+14,
        -1.45417042e+15, -4.25199448e+14,  2.91999154e+14,
        -1.09339480e+15, -1.59687215e+15, -1.59687215e+15,
        -3.76196193e+14,  3.78433296e+13,  1.95005842e+15,
         2.72563634e+14,  2.72563634e+14,  1.23285982e+15,
        -1.23357579e+15,  1.95005842e+15,  1.78488761e+15,
         1.78488761e+15,  1.78488761e+15,  1.78488761e+15,
         1.78488761e+15,  7.51043177e+14,  1.16508270e+15,
         1.16508270e+15,  1.16508270e+15,  1.88228130e+1

In [17]:
lr_scaler_model.score

<bound method RegressorMixin.score of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)>

In [19]:
lr_scaler_model.intercept_, lr_scaler_model.rank_

(array([-6.09548039e+15]), 44)

In [20]:
pickle.dump(lr_scaler_model, open('/content/drive/MyDrive/Data Science and Analytics/R00206995/model/model.pkl', 'wb') )