In [4]:
%config Completer.use_jedi = False
# Import Python 3's print function and division
from __future__ import print_function, division

#Loading data
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import MinMaxScaler
#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

#plots
import seaborn as sns
import matplotlib.pylab as plt
import matplotlib.patches as mpatches
from collections import OrderedDict

cmaps = OrderedDict()

#metrics
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.metrics import recall_score,precision_score, cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, \
            classification_report, recall_score, precision_recall_curve

#cross validations
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from osgeo import gdal
from osgeo import gdal_array
from osgeo import osr

# Single seed reused by the CV splitters and the random forests below
random_state = 2019
np.random.seed(random_state)
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# System libraries
import os



# LaTeX-style font parameters applied to all matplotlib figures
font = dict(
    family='serif',
    serif=['Computer Modern Roman'],
    weight='regular',
    size=8,
)

plt.rc('font', **font)

In [4]:
# Slope: constant (CTE)
# Soil type: constant (CTE)
# Land use: variable
# Bearing capacity: simulate scenarios for 2007 starting from the 1999 bearing-capacity map

# Hazard level: 2001 for training and 2007 for validation

<img src="D:/Proyectos_GitHub/PISCO_Peligro/img/idea_1.png">

In [None]:
# Low = 1
# Medium = 2
# High = 3
# Very high = 4

#### Soil types

In [5]:
class Create_ensemble(object):
    """Out-of-fold ensemble of classifiers evaluated with stratified K-fold CV.

    Every base model is refitted on each training fold. Its predictions on the
    held-out fold and on a separate test set are collected together with the
    macro-averaged recall, precision and F1 of each fit.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds.
    base_models : list
        Estimators exposing ``fit``, ``predict`` and ``predict_proba``.

    Notes
    -----
    ``predict`` reads the module-level ``random_state`` to seed the fold
    splitter, so that name must be defined before the method is called.
    """

    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models
        self.metriks = {}
        self.RE = []    # recall of every (model, fold) fit
        self.F1 = []    # F1 of every (model, fold) fit
        self.PRE = []   # precision of every (model, fold) fit

    def predict(self, X, y, T):
        """Cross-validate every base model on (X, y) and predict on T.

        Parameters
        ----------
        X : array-like
            Training features, one row per sample.
        y : array-like
            Training labels.
        T : array-like
            Test features to predict with every fitted model.

        Returns
        -------
        train_proba : ndarray, shape (len(X), n_classes)
            Out-of-fold class probabilities, averaged over the base models.
        test_proba : ndarray, shape (len(T), n_classes)
            Test class probabilities, averaged over all model x fold fits.
        train_pred : ndarray, shape (len(X), n_models)
            Out-of-fold predicted labels, one column per base model.
        test_pred : ndarray, shape (len(T), n_models * n_splits)
            Test predicted labels, one column per fit.
        metriks : dict
            Keys ``'recall'``, ``'f1'`` and ``'precision'``; each maps to the
            list of per-fit scores of this call, in fit order.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        n_classes = len(np.unique(y))
        n_models = len(self.base_models)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                     random_state=random_state).split(X, y))

        train_proba = np.zeros((X.shape[0], n_classes))
        test_proba = np.zeros((T.shape[0], n_classes))

        train_pred = np.zeros((X.shape[0], n_models))
        test_pred = np.zeros((T.shape[0], n_models * self.n_splits))

        # Fresh lists per call: repeated calls must neither pile up scores nor
        # mutate the dictionaries returned by earlier calls.
        self.RE, self.F1, self.PRE = [], [], []

        test_col = 0
        for i, clf in enumerate(self.base_models):

            for j, (train_idx, valid_idx) in enumerate(folds):
                X_valid = X[valid_idx]
                Y_valid = y[valid_idx]

                clf.fit(X[train_idx], y[train_idx])

                valid_pred = clf.predict(X_valid)
                recall = recall_score(Y_valid, valid_pred, average='macro')
                precision = precision_score(Y_valid, valid_pred, average='macro')
                f1 = f1_score(Y_valid, valid_pred, average='macro')

                train_pred[valid_idx, i] = valid_pred
                test_pred[:, test_col] = clf.predict(T)
                test_col += 1

                # Accumulate probabilities; they are normalised once, after
                # all models and folds have contributed.
                train_proba[valid_idx, :] += clf.predict_proba(X_valid)
                test_proba += clf.predict_proba(T)

                self.RE.append(recall)
                self.F1.append(f1)
                self.PRE.append(precision)
                print( "Model- {} and CV- {} recall: {}, precision: {}, f1_score: {}".format(i, j, recall,precision, f1))

        if n_models:
            # Each training row receives one out-of-fold estimate per model,
            # each test row one estimate per (model, fold) fit.
            train_proba /= n_models
            test_proba /= n_models * self.n_splits

        self.metriks = {'recall': self.RE, 'f1': self.F1, 'precision': self.PRE}

        return train_proba, test_proba, train_pred, test_pred,self.metriks

# Loading Data and splitting it into 80% training and 20% test data

In [13]:
# Read the sample table: the first four columns are used as predictors and
# the 'hazard_value' column as the label.
data = pd.read_csv('D:/Proyectos_GitHub/PISCO_Peligro/data/CSV/data.csv')
features = data.iloc[:, :4]
target = data['hazard_value']

# Hold out 20% of the rows for testing.
# NOTE(review): the split is not stratified although the classes are strongly
# imbalanced (see the support column of the classification report) -- confirm
# this is intended.
xtrain, xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.20, random_state=42
)

# Best parameters grid search for RF

In [17]:
# Stratified 3-fold CV shared by every candidate of the search
cv = StratifiedKFold(n_splits = 3, shuffle=True, random_state = random_state)

rdf = RandomForestClassifier(random_state = random_state)

# Macro-averaged scorers: the target has more than two classes, so the default
# average='binary' of recall_score / f1_score would be rejected at scoring time.
scoring = {'Recall': make_scorer(recall_score, average='macro'),
           'f1_score': make_scorer(f1_score, average='macro')
          }
# NOTE(review): `scoring` is not passed to the search below, so candidates are
# still ranked by the estimator's default score. To select on macro F1 pass
# scoring=scoring, refit='f1_score' (this renames the cv_results_ score keys).

params = {'max_depth': [5, 10, 20,30],
              'n_estimators' : [200,250,300,350,400,450,500]
             }
# The grid holds 4 x 7 = 28 combinations, fewer than n_iter, so every
# combination is evaluated. random_state fixes the order in which they are
# drawn, which makes tie-breaking between equal scores reproducible.
grid_clf= RandomizedSearchCV(estimator = rdf, param_distributions = params, n_iter = 50, cv = cv, verbose=4, n_jobs = -1, random_state = random_state)
grid_clf.fit(xtrain, ytrain)

print(grid_clf.best_estimator_)
print(grid_clf.best_params_)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 out of  84 | elapsed:   10.8s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  81 out of  84 | elapsed:   14.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:   14.7s finished


RandomForestClassifier(max_depth=10, n_estimators=350, random_state=2019)
{'n_estimators': 350, 'max_depth': 10}


# CROSS VALIDATION

In [18]:
# Random forest using the max_depth / n_estimators reported by the search.
# Only non-default hyper-parameters are spelled out: the removed keywords
# `min_impurity_split` and `max_features='auto'` were defaults, so dropping
# them keeps the same model and lets the cell run on current scikit-learn.
rdf = RandomForestClassifier(max_depth=10, n_estimators=350,
                             min_samples_leaf=4, min_samples_split=5,
                             n_jobs=-1,
                             random_state=random_state)

# 5-fold out-of-fold evaluation of the single forest, plus test-set predictions
base_models = [rdf]
n_splits = 5
rf_stack = Create_ensemble(n_splits = n_splits, base_models = base_models)
train_proba, test_proba, train_pred, test_pred,indi_rf = rf_stack.predict(xtrain, ytrain, xtest)

Model- 0 and CV- 0 recall: 0.8504330228323599, precision: 0.878799431282086, f1_score: 0.8635543209160872
Model- 0 and CV- 1 recall: 0.8408767944840078, precision: 0.8487704072449835, f1_score: 0.8447276180464401
Model- 0 and CV- 2 recall: 0.8364306962112003, precision: 0.8767947533190457, f1_score: 0.8543920134367721
Model- 0 and CV- 3 recall: 0.8043701994387465, precision: 0.8886374413415353, f1_score: 0.8365417260900726
Model- 0 and CV- 4 recall: 0.80965812156929, precision: 0.8865218245499936, f1_score: 0.8399254279876885


In [24]:
# Score the out-of-fold predictions returned by rf_stack.predict against the
# true training labels; each metric is computed right before it is printed.
macro_f1 = f1_score(ytrain, train_pred, average='macro')
print('1. The F-1 score of the model {}\n'.format(macro_f1))

macro_recall = recall_score(ytrain, train_pred, average='macro')
print('2. The recall score of the model {}\n'.format(macro_recall))

report_text = classification_report(ytrain, train_pred)
print('3. Classification report \n {} \n'.format(report_text))

conf_mat = confusion_matrix(ytrain, train_pred)
print('4. Confusion matrix \n {} \n'.format(conf_mat))

1. The F-1 score of the model 0.8481845352251338

2. The recall score of the model 0.8284159179167331

3. Classification report 
               precision    recall  f1-score   support

           2       0.97      0.98      0.98      9867
           3       0.98      0.98      0.98     15359
           4       0.67      0.53      0.59       342

    accuracy                           0.97     25568
   macro avg       0.87      0.83      0.85     25568
weighted avg       0.97      0.97      0.97     25568
 

4. Confusion matrix 
 [[ 9668   194     5]
 [  237 15038    84]
 [   33   129   180]] 

