In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from IPython.display import display
from sklearn import tree
from sklearn.manifold import TSNE
from sklearn import svm
from sklearn.svm import SVC 
from sklearn import linear_model
from sklearn.externals import joblib 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import RandomizedSearchCV



In [3]:
final_df = pd.read_pickle('drive/My Drive/caseStudy1.pkl')

As per the hyper parameter tuning already performed, KNN Model worked the best after which Random forest showed a decent amount of MAPE and MAE errors and Decision Tree model worked fine too.
Reference: hyperparameterTuning.ipynb (Already Submitted)

Therefore further tuning the hyper params of these models and stacking them for giving the final predictions.

In [4]:
#Breaking down data into train and test 
y = final_df['FIRE_SIZE_CLASS'].values
x = final_df.drop(['FIRE_SIZE_CLASS'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify=y)

In [5]:
x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.33, stratify=y_train)

In [7]:
#Hyperparam tuning 2 for KNN
#Defining range for parameters
#Since neighbors = 20 and leafsize = 30 were the best hyperparams, going around these values to check for even better parameters
neighbors = [20, 25, 30]
leafSize = [30,40]

knn_model = KNeighborsClassifier(weights='distance')
parameters = {'n_neighbors': neighbors, 'leaf_size': leafSize}

hyperParam_Clf1 = RandomizedSearchCV(knn_model, parameters, cv = 3, scoring = 'accuracy',return_train_score= True)
hyperParam_Clf1.fit(x_cv, y_cv)



RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='distance'),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'leaf_size': [30, 40],
                                        'n_neighbors': [20, 25, 30]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=True, scoring='accuracy', verbose=0)

In [8]:
print(hyperParam_Clf1.best_params_)

{'n_neighbors': 30, 'leaf_size': 40}


In [9]:
knnModel = KNeighborsClassifier(n_neighbors = 30, leaf_size = 40).fit(x_train, y_train) 

In [12]:
#Saving this Model for final Analysis
joblib.dump(knnModel, 'drive/My Drive/CaseStudy1/knnpredictModel.pkl')

['drive/My Drive/CaseStudy1/knnpredictModel.pkl']

In [10]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [11]:
knn_predictions = knnModel.predict(x_test)  

#Fining accuracy of the Model
accuracy_train = knnModel.score(x_train, y_train) 
accuracy_test = knnModel.score(x_test, y_test) 

print('Train Data Accuracy is :',accuracy_train)
print('Test Data Accuracy is :',accuracy_test)

#Finding MAE
knn_MAE = mean_absolute_error(y_test, knn_predictions)
print('MAE value is: ', knn_MAE)

#Finding MAPE
knn_MAPE = mean_absolute_percentage_error(y_test, knn_predictions)
print('MAPE value is: ', knn_MAPE)

Train Data Accuracy is : 0.6507368445992371
Test Data Accuracy is : 0.6302384643399285
MAE value is:  0.43888203121726715
MAPE value is:  23.953104038986986


After tuning the hyper parameters further, there wasn't much change in the MAE and MAPE values, the values decreased by 0.01% only. Therefore further tuning isn't required.

In [36]:
#Trying more tuning for random Forest Model
maxdepth = [20, 40, 60]
estimators = [90, 100, 110]

rf_model = RandomForestClassifier(class_weight='balanced', verbose = 50)
parameters = {'max_depth': maxdepth, 'n_estimators':estimators}

hyperParam_Clf2 = RandomizedSearchCV(rf_model, parameters, cv = 3, scoring = 'accuracy',return_train_score= True)
hyperParam_Clf2.fit(x_cv, y_cv)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
building tree 9 of 110
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    3.1s remaining:    0.0s
building tree 10 of 110
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.4s remaining:    0.0s
building tree 11 of 110
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    3.8s remaining:    0.0s
building tree 12 of 110
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    4.2s remaining:    0.0s
building tree 13 of 110
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    4.5s remaining:    0.0s
building tree 14 of 110
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    4.9s remaining:    0.0s
building tree 15 of 110
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    5.3s remaining:    0.0s
building tree 16 of 110
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    5.7s remaining:    0.0s
building tree 17 of 110
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    6.0s remaining:    0.0s


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight='balanced',
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
         

In [37]:
#Getting the best hyperparams
print(hyperParam_Clf2.best_params_)

{'n_estimators': 110, 'max_depth': 40}


In [5]:
rfModel = RandomForestClassifier(n_estimators = 110, max_depth= 40, class_weight='balanced', verbose = 50).fit(x_train, y_train) 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
building tree 1 of 110
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s
building tree 2 of 110
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.5s remaining:    0.0s
building tree 3 of 110
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.6s remaining:    0.0s
building tree 4 of 110
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.7s remaining:    0.0s
building tree 5 of 110
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.6s remaining:    0.0s
building tree 6 of 110
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.7s remaining:    0.0s
building tree 7 of 110
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   14.9s remaining:    0.0s
building tree 8 of 110
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   17.0s remaining:    0.0s
building tree 9 of 110
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   19.1s remaining: 

In [7]:
joblib.dump(rfModel, 'drive/My Drive/CaseStudy1/rfpredictModel.pkl')

['drive/My Drive/CaseStudy1/rfpredictModel.pkl']

In [6]:
rf_predictions = rfModel.predict(x_test)  

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    
#Finding MAE
rf_MAE = mean_absolute_error(y_test, rf_predictions)
print('MAE value is: ', rf_MAE)

#Finding MAPE
rf_MAPE = mean_absolute_percentage_error(y_test, rf_predictions)
print('MAPE value is: ', rf_MAPE)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

After tuning random forest further, the MAPE value has reduced by 4%. We will be using these parameters for final stacked model.

In [32]:
#Hyper param tuning for decision tree too
min_samples_split = [30,40,50,60]
max_features = ['sqrt', 'log2']

dt_model = DecisionTreeClassifier()
parameters = {'min_samples_split': min_samples_split, 'max_features':max_features }

hyperParam_Clf3 = RandomizedSearchCV(dt_model, parameters, cv = 3, scoring = 'accuracy',return_train_score= True)
hyperParam_Clf3.fit(x_cv, y_cv)



RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=None,
          

In [33]:
print(hyperParam_Clf3.best_params_)

{'min_samples_split': 60, 'max_features': 'sqrt'}


In [9]:
dtModel = DecisionTreeClassifier(min_samples_split = 40, max_features = 'sqrt').fit(x_train, y_train) 

In [10]:
#Saving decision tree model to drive

joblib.dump(dtModel, 'drive/My Drive/CaseStudy1/dtpredictModel.pkl')

['drive/My Drive/CaseStudy1/dtpredictModel.pkl']

In [35]:
dt_predictions = dtModel.predict(x_test)  

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    
#Finding MAE
dt_MAE = mean_absolute_error(y_test, dt_predictions)
print('MAE value is: ', dt_MAE)

#Finding MAPE
dt_MAPE = mean_absolute_percentage_error(y_test, dt_predictions)
print('MAPE value is: ', dt_MAPE)

MAE value is:  0.45498860695443105
MAPE value is:  24.996455548091664


Now after extensive hyer parameter tuning, the 3 models:
1. KNN (MAE: 0.438, MAPE: 23.953)
2. Decision Tree (MAE: 0.464, MAPE: 35.902)
3. Random Forest (MAE: 0.455, MAPE: 24.996)

are performing almost equally well, we can us these 3 model for our samples and can take the final result as per the majority vote.


**Part** **2**

Now using ensemble model to get the final predictions.

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, stratify=y)

In [7]:
x_D1, x_D2, y_D1, y_D2 = train_test_split(x_train, y_train, test_size=0.50, stratify=y_train)

In [None]:

for i in range(3):
  id_val = np.random.choice(np.arange(len(x_D1)), 50000, replace=True)
  sample_x = x_D1.iloc[id_val]
  sample_y = y_D1[id_val]
  #Perforing decision tree on sample 1
  if(i == 0):
    dtSampleModel = DecisionTreeClassifier(min_samples_split = 40, max_features = 'sqrt').fit(sample_x, sample_y) 
    joblib.dump(dtSampleModel, 'drive/My Drive/CaseStudy1/SampleModel_'+ str(i) + '.pkl')
  elif(i == 1):
    knnSampleModel = KNeighborsClassifier(n_neighbors = 30, leaf_size = 40).fit(sample_x, sample_y)
    joblib.dump(knnSampleModel, 'drive/My Drive/CaseStudy1/SampleModel_'+ str(i) + '.pkl')
  else:
    rfSampleModel = RandomForestClassifier(n_estimators = 110, max_depth= 40, class_weight='balanced', verbose = 50).fit(sample_x, sample_y)
    joblib.dump(rfSampleModel, 'drive/My Drive/CaseStudy1/SampleModel_'+ str(i) + '.pkl')



In [9]:
D2_df = pd.DataFrame() 

In [10]:
#Pass D2 through each of the model and get the predictions for it
for i in range(3):
  dt2SampleModel = joblib.load('drive/My Drive/CaseStudy1/SampleModel_'+ str(i) + '.pkl')
  predictedValues = dt2SampleModel.predict(x_D2)
  columnName = 'predict' + str(i)
  D2_df[columnName] = predictedValues


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    2.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

In [21]:
D2_df.head()

Unnamed: 0,predict0,predict1,predict2
0,2,1,1
1,2,2,2
2,2,2,1
3,2,2,1
4,2,2,2


In [14]:
#Deciding the final value from model using majority count.
finalPrediction = []
for i in range(len(D2_df)):
  row_list = D2_df.iloc[i].values.tolist()
  majority_count = max(set(row_list) , key=row_list.count)
  finalPrediction.append(majority_count)

In [15]:
finalPrediction = np.array(finalPrediction)

In [16]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [17]:
#Checking error values for train data now
#Finding MAE
train_MAE = mean_absolute_error(y_D2, finalPrediction)
print('Train MAE value is: ', train_MAE)

#Finding MAPE
train_MAPE = mean_absolute_percentage_error(y_D2, finalPrediction)
print('MAPE value is: ', train_MAPE)

Train MAE value is:  0.4515944194653982
MAPE value is:  24.302818401173063


After using different models in stacking, the MAE error has reduced by 0.005 and MAPE reduced by 0.2%

In [18]:
#Putting in test data now
def testDataPrediction(x_test):
  test_df = pd.DataFrame() 
  for i in range(3):
    SampleModel = joblib.load('drive/My Drive/CaseStudy1/SampleModel_'+ str(i) + '.pkl')
    predictedValues = SampleModel.predict(x_test)
    columnName = 'predict' + str(i)
    test_df[columnName] = predictedValues

  test_finalPrediction = []
  for j in range(len(test_df)):
    row_list = test_df.iloc[j].values.tolist()
    majority_count = max(set(row_list) , key=row_list.count)
    test_finalPrediction.append(majority_count)

  test_finalPrediction = np.array(test_finalPrediction)
  return(test_finalPrediction)

In [20]:
test_MAE = mean_absolute_error(y_test, testDataPrediction(x_test))
print('Test MAE value is: ', test_MAE)

#Finding MAPE
test_MAPE = mean_absolute_percentage_error(y_test, testDataPrediction(x_test))
print('Test MAPE value is: ', test_MAPE)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

Final values of MAE and MAPE for test data are:

MAE: 0.45 

MAPE: 24.3