In [31]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostClassifier as ABC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC
from sklearn.ensemble import VotingClassifier

import joblib

In [32]:
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [33]:
# X_train_origin = X_train_origin.to_numpy()
# y_train_origin = y_train_origin.to_numpy()
# y_train_origin = y_train_origin.ravel()

In [34]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.25, random_state=1) 

In [35]:
X_train_origin.shape,y_train_origin.shape,X_train.shape, X_val.shape, y_train.shape, y_val.shape

((12096, 50), (12096, 1), (9072, 50), (3024, 50), (9072, 1), (3024, 1))

In [36]:
def modeling(model_name):
    model=model_name()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    
# Some algorithms (such as SGD classifiers, Random Forest classifiers, and naive Bayes classifiers)
# are capable of handling multiple classes natively. 
# Others (such as Logistic Regression or Support Vector Machine classifiers) are strictly binary classifiers.

### Baseline

In [37]:
modeling(KNN)
# accept

  This is separate from the ipykernel package so we can avoid doing imports until


Confusion_Matrix:
[[270  93   2   0  13   4  38]
 [ 98 228  12   0  63  19  13]
 [  0   2 301  35  14  87   1]
 [  0   0  14 402   0   9   0]
 [ 12  28   5   0 369  10   0]
 [  3   7  59  20  13 336   0]
 [ 30   8   0   0   0   0 406]]
              precision    recall  f1-score   support

           1       0.65      0.64      0.65       420
           2       0.62      0.53      0.57       433
           3       0.77      0.68      0.72       440
           4       0.88      0.95      0.91       425
           5       0.78      0.87      0.82       424
           6       0.72      0.77      0.74       438
           7       0.89      0.91      0.90       444

    accuracy                           0.76      3024
   macro avg       0.76      0.76      0.76      3024
weighted avg       0.76      0.76      0.76      3024

Accuracy:0.7645502645502645


In [38]:
lr = LR(multi_class='multinomial', solver='newton-cg') # only one model will be trained, 
                                                        # activition func:softmax
                                                        # actually, this is softmax regression
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
print("Confusion_Matrix:")
print(str(confusion_matrix(y_val, y_pred)))
print(classification_report(y_val, y_pred))
print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
# softmax is better than ovr
# reject

  y = column_or_1d(y, warn=True)


Confusion_Matrix:
[[273  86   0   0  20   0  41]
 [107 223   5   0  79  15   4]
 [  0   3 252  45  13 127   0]
 [  0   0  33 371   0  21   0]
 [  7  66   3   0 325  23   0]
 [  0   3 108  33  20 274   0]
 [ 60   0   0   0   1   0 383]]
              precision    recall  f1-score   support

           1       0.61      0.65      0.63       420
           2       0.59      0.52      0.55       433
           3       0.63      0.57      0.60       440
           4       0.83      0.87      0.85       425
           5       0.71      0.77      0.74       424
           6       0.60      0.63      0.61       438
           7       0.89      0.86      0.88       444

    accuracy                           0.69      3024
   macro avg       0.69      0.70      0.69      3024
weighted avg       0.69      0.69      0.69      3024

Accuracy:0.6947751322751323


In [39]:
lr2 = LR(multi_class='ovr', solver='newton-cg') # one-vs-rest, 7 models will be trained
                                                # activition:sigmoid
lr2.fit(X_train, y_train)
y_pred = lr2.predict(X_val)
print("Confusion_Matrix:")
print(str(confusion_matrix(y_val, y_pred)))
print(classification_report(y_val, y_pred))
print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
# reject

  y = column_or_1d(y, warn=True)


Confusion_Matrix:
[[258  84   1   0  31   1  45]
 [107 206   6   0  92  18   4]
 [  0  10 241  43  16 130   0]
 [  0   0  32 375   0  18   0]
 [ 12  59  21   0 306  26   0]
 [  2   6  96  43  50 241   0]
 [ 63   4   1   0   0   0 376]]
              precision    recall  f1-score   support

           1       0.58      0.61      0.60       420
           2       0.56      0.48      0.51       433
           3       0.61      0.55      0.58       440
           4       0.81      0.88      0.85       425
           5       0.62      0.72      0.67       424
           6       0.56      0.55      0.55       438
           7       0.88      0.85      0.87       444

    accuracy                           0.66      3024
   macro avg       0.66      0.66      0.66      3024
weighted avg       0.66      0.66      0.66      3024

Accuracy:0.6623677248677249


In [40]:
svm = SVC(decision_function_shape='ovo') 
# one-vs-one, C(7,2) models will be trained, then vote, 
# ovo is preferable for SVM                                 
svm.fit(X_train, y_train)
y_pred = svm.predict(X_val)
print("Confusion_Matrix:")
print(str(confusion_matrix(y_val, y_pred)))
print(classification_report(y_val, y_pred))
print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
# reject

  y = column_or_1d(y, warn=True)


Confusion_Matrix:
[[284  91   0   0  14   0  31]
 [ 95 255  10   0  51  17   5]
 [  0   2 288  36   8 106   0]
 [  0   0   9 402   0  14   0]
 [  4  28   9   0 368  15   0]
 [  0   8  67  18   5 340   0]
 [ 41   0   0   0   0   0 403]]
              precision    recall  f1-score   support

           1       0.67      0.68      0.67       420
           2       0.66      0.59      0.62       433
           3       0.75      0.65      0.70       440
           4       0.88      0.95      0.91       425
           5       0.83      0.87      0.85       424
           6       0.69      0.78      0.73       438
           7       0.92      0.91      0.91       444

    accuracy                           0.77      3024
   macro avg       0.77      0.77      0.77      3024
weighted avg       0.77      0.77      0.77      3024

Accuracy:0.7738095238095238


In [41]:
svm2 = SVC(decision_function_shape='ovr') # one-vs-rest, 7 models will be trained                                             
svm2.fit(X_train, y_train)
y_pred = svm2.predict(X_val)
print("Confusion_Matrix:")
print(str(confusion_matrix(y_val, y_pred)))
print(classification_report(y_val, y_pred))
print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
# same result as ovo
# reject

  y = column_or_1d(y, warn=True)


Confusion_Matrix:
[[284  91   0   0  14   0  31]
 [ 95 255  10   0  51  17   5]
 [  0   2 288  36   8 106   0]
 [  0   0   9 402   0  14   0]
 [  4  28   9   0 368  15   0]
 [  0   8  67  18   5 340   0]
 [ 41   0   0   0   0   0 403]]
              precision    recall  f1-score   support

           1       0.67      0.68      0.67       420
           2       0.66      0.59      0.62       433
           3       0.75      0.65      0.70       440
           4       0.88      0.95      0.91       425
           5       0.83      0.87      0.85       424
           6       0.69      0.78      0.73       438
           7       0.92      0.91      0.91       444

    accuracy                           0.77      3024
   macro avg       0.77      0.77      0.77      3024
weighted avg       0.77      0.77      0.77      3024

Accuracy:0.7738095238095238


In [42]:
modeling(NB) # reject

Confusion_Matrix:
[[ 61   1   2   0 126   6 224]
 [ 62   2  28   4 203  11 123]
 [  0   0 181 258   1   0   0]
 [  0   0   0 425   0   0   0]
 [  0   0  92   0 314   8  10]
 [  0   0 128 256  23  28   3]
 [  4   1   1   0  10   0 428]]
              precision    recall  f1-score   support

           1       0.48      0.15      0.22       420
           2       0.50      0.00      0.01       433
           3       0.42      0.41      0.42       440
           4       0.45      1.00      0.62       425
           5       0.46      0.74      0.57       424
           6       0.53      0.06      0.11       438
           7       0.54      0.96      0.69       444

    accuracy                           0.48      3024
   macro avg       0.48      0.48      0.38      3024
weighted avg       0.48      0.48      0.38      3024

Accuracy:0.47585978835978837


  y = column_or_1d(y, warn=True)


In [43]:
modeling(RF) # accept

  This is separate from the ipykernel package so we can avoid doing imports until


Confusion_Matrix:
[[311  71   0   0  11   0  27]
 [ 77 295   8   0  39  11   3]
 [  0   2 347  18   7  66   0]
 [  0   0   9 413   0   3   0]
 [  1  23   2   0 392   6   0]
 [  0   2  39  15   5 377   0]
 [ 22   0   0   0   0   0 422]]
              precision    recall  f1-score   support

           1       0.76      0.74      0.75       420
           2       0.75      0.68      0.71       433
           3       0.86      0.79      0.82       440
           4       0.93      0.97      0.95       425
           5       0.86      0.92      0.89       424
           6       0.81      0.86      0.84       438
           7       0.93      0.95      0.94       444

    accuracy                           0.85      3024
   macro avg       0.84      0.85      0.84      3024
weighted avg       0.84      0.85      0.84      3024

Accuracy:0.845568783068783


In [44]:
modeling(ABC) # reject

  y = column_or_1d(y, warn=True)


Confusion_Matrix:
[[ 89   6   0   0 200   0 125]
 [ 36  21   1   0 307  41  27]
 [  0   1  18   0  22 399   0]
 [  0   0   0   0   0 425   0]
 [  0   9   0   0 370  45   0]
 [  0   0  13   0  30 395   0]
 [ 34   1   0   0   0   0 409]]
              precision    recall  f1-score   support

           1       0.56      0.21      0.31       420
           2       0.55      0.05      0.09       433
           3       0.56      0.04      0.08       440
           4       0.00      0.00      0.00       425
           5       0.40      0.87      0.55       424
           6       0.30      0.90      0.45       438
           7       0.73      0.92      0.81       444

    accuracy                           0.43      3024
   macro avg       0.44      0.43      0.33      3024
weighted avg       0.45      0.43      0.33      3024

Accuracy:0.4305555555555556


  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
modeling(GBC) # reject

  y = column_or_1d(y, warn=True)


Confusion_Matrix:
[[299  68   0   0  17   0  36]
 [108 243   7   0  57  14   4]
 [  0   1 298  26  14 101   0]
 [  0   0  11 406   0   8   0]
 [  3  22   8   0 377  14   0]
 [  0   6  78  12  17 325   0]
 [ 28   1   0   0   0   0 415]]
              precision    recall  f1-score   support

           1       0.68      0.71      0.70       420
           2       0.71      0.56      0.63       433
           3       0.74      0.68      0.71       440
           4       0.91      0.96      0.93       425
           5       0.78      0.89      0.83       424
           6       0.70      0.74      0.72       438
           7       0.91      0.93      0.92       444

    accuracy                           0.78      3024
   macro avg       0.78      0.78      0.78      3024
weighted avg       0.78      0.78      0.78      3024

Accuracy:0.781415343915344


In [46]:
modeling(XGBC) # reject

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Confusion_Matrix:
[[281  59   0   0  25   0  55]
 [121 196   6   0  92  13   5]
 [  0   0 271  36  15 118   0]
 [  0   0   9 411   0   5   0]
 [  3  24  10   0 367  20   0]
 [  0   0  96  26  19 297   0]
 [ 36   0   0   0   0   0 408]]
              precision    recall  f1-score   support

           1       0.64      0.67      0.65       420
           2       0.70      0.45      0.55       433
           3       0.69      0.62      0.65       440
           4       0.87      0.97      0.92       425
           5       0.71      0.87      0.78       424
           6       0.66      0.68      0.67       438
           7       0.87      0.92      0.89       444

    accuracy                           0.74      3024
   macro avg       0.73      0.74      0.73      3024
weighted avg       0.73      0.74      0.73      3024

Accuracy:0.7377645502645502


In [47]:
modeling(LGBC) # reject

Confusion_Matrix:
[[312  72   0   0   9   0  27]
 [ 97 267   8   0  46  12   3]
 [  0   2 330  17   7  84   0]
 [  0   0   7 413   0   5   0]
 [  0  22   3   0 389  10   0]
 [  1   4  48  12   5 368   0]
 [ 16   0   0   0   0   0 428]]
              precision    recall  f1-score   support

           1       0.73      0.74      0.74       420
           2       0.73      0.62      0.67       433
           3       0.83      0.75      0.79       440
           4       0.93      0.97      0.95       425
           5       0.85      0.92      0.88       424
           6       0.77      0.84      0.80       438
           7       0.93      0.96      0.95       444

    accuracy                           0.83      3024
   macro avg       0.83      0.83      0.83      3024
weighted avg       0.83      0.83      0.83      3024

Accuracy:0.8290343915343915


In [None]:
# conlusion: only use RF, no voter

### Final tuning

In [19]:
param_RF = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 40, 80],
    'criterion': ['gini']
}
rf = RF()
clf_rf = GridSearchCV(rf, param_grid=param_RF, cv=3,n_jobs=-1)
clf_rf.fit(X_train_origin, y_train_origin)
clf_rf.best_params_

{'criterion': 'gini',
 'max_depth': 40,
 'max_features': 'auto',
 'n_estimators': 200}

In [22]:
clf_rf.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Train the best RF model

In [52]:
rf_best = RF(criterion='gini', max_depth=40,
             max_features='auto', n_estimators=200,n_jobs=-1,oob_score=True)
rf_best.fit(X_train_origin, y_train_origin)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [53]:
rf_best.oob_score_  # 0.8585 accuract achieved by the best RF model

0.8585482804232805

In [73]:
importance_list = sorted(zip(rf_best.feature_importances_, X_train_origin.columns.to_list()), reverse=True)
importance_list

[(0.2435204902947709, 'Elevation'),
 (0.09649667894132256, 'Horizontal_Distance_To_Roadways'),
 (0.07814696812795978, 'Horizontal_Distance_To_Hydrology'),
 (0.07798322771441787, 'Horizontal_Distance_To_Fire_Points'),
 (0.0636560341664477, 'Aspect'),
 (0.061000487892991256, 'Hillshade_9am'),
 (0.053620444303198955, 'Hillshade_Noon'),
 (0.04745119252263426, 'Slope'),
 (0.045511389479393614, 'Wilderness_Area4'),
 (0.02365282793444987, 'Soil_Type10'),
 (0.02135918282471204, 'Soil_Type38'),
 (0.01884409998156855, 'Soil_Type3'),
 (0.01872305100853074, 'Wilderness_Area1'),
 (0.01811532987714495, 'Soil_Type39'),
 (0.01758593150412376, 'Wilderness_Area3'),
 (0.012871538990281718, 'Soil_Type4'),
 (0.011109389074177285, 'Soil_Type40'),
 (0.009027039164294749, 'Soil_Type30'),
 (0.006764007225663668, 'Soil_Type17'),
 (0.006464705862647276, 'Soil_Type13'),
 (0.006184539742625265, 'Soil_Type22'),
 (0.006046111134299491, 'Soil_Type2'),
 (0.005792126535632175, 'Soil_Type29'),
 (0.005626657701931169, 'S

In [74]:
joblib.dump(rf_best,'../models/rf.m')

['../models/rf.m']