# Bagging and ExtraTrees Classifier Wild Fire Model Evaluation

This notebooks will tune and run Bagging and Extra Trees classifier on wildfire dataset in order to make a predecction on fire intensity.


#optional code if warnings become over bearing 
import warnings
warnings.filterwarnings('ignore')

In [25]:
#optional code to ensure everyone is on the right version
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))


The scikit-learn version is 0.23.1.


In [26]:
#optional code if warnings become over bearing 
import warnings
warnings.filterwarnings('ignore')

# Import all the models and libraries needed

In [27]:
%matplotlib inline

import os
import pickle
import pandas as pd
import numpy as np
import collections
import seaborn as sns

from sklearn.utils import resample
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler

from yellowbrick.classifier import ROCAUC
from yellowbrick.model_selection import CVScores
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ClassificationReport

from imblearn.over_sampling import SMOTE
from imblearn import under_sampling, over_sampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as smotePipeline





# Helper function to save estimator
Saves to current working directory

In [28]:
def save_estimator (estimator) :
          
    outpath = estimator.__class__.__name__.lower().replace(" ", "-") + ".pickle"
    with open(outpath, 'wb') as f:
        pickle.dump(estimator, f)
        f.close()
        

In [29]:
def open_estimator(filename) :
    
    infile = open(filename,'rb')
    estimator = pickle.load(infile)
    infile.close()

    print ("Unpickled file", filename)
    
    return estimator

### Read the Fire Data From Disk to Save Time.
During the feature evaluation the data was pulled from the database and stored locally
Read a random sample.

Optimal testing data is 150K - SMOTE will generate additonal data for us

In [30]:
df_fires = pd.read_csv('data/FireIntensity_Model_June12_Clean.csv') #.sample(500000)

print(df_fires.shape)

(1328922, 26)


### Assess data in the file. 
Check data to ensure it's correct.


In [31]:
# Expanding number of columns:
pd.set_option('display.max_columns', 40)

In [32]:
# review datagram
print (df_fires.shape)
df_fires.head(100) 

(1328922, 26)


Unnamed: 0.1,Unnamed: 0,latitude,longitude,doy,month,year,cluster_reference,fire_region,season,covertype,fuelcode,prefire_fuel,fuel_moisture_class,temperature,humidity,precip_intensity,visibility,wind_speed,wind_gust,brightness,fire_intensity,fire_intensity_twocat,fire_intensity_threecat,fire_intensity_fourcat,bright_t31,frp
0,0,34.5954,-78.6218,1,1,2003,2003_4279,6,3,3,1600,6220.097576,3,64.14,0.88,0.011,9.022000,6.42,12.510000,306.5,Low,Moderate,Low,Low,289.2,11.0
1,1,33.4182,-110.8618,1,1,2003,2003_1522,4,3,3,1220,4534.187262,2,32.17,0.37,0.000,9.216293,6.88,18.500000,307.6,Low,Moderate,Low,Low,285.1,10.8
2,2,29.7120,-95.1284,1,1,2003,2003_919,3,3,1,1,277.412850,2,65.97,0.50,0.000,9.997000,10.98,16.900000,307.2,Low,Moderate,Low,Low,294.1,5.6
3,3,28.9161,-98.6293,1,1,2003,2003_777,3,3,1,1,251.296812,2,72.89,0.51,0.000,9.997000,25.06,28.590000,313.3,Medium,Moderate,Low,Low,297.4,12.0
4,4,32.7772,-95.0444,1,1,2003,2003_3100,3,3,1,1,173.172870,2,58.39,0.30,0.000,9.216293,7.99,10.990000,301.3,Low,Moderate,Low,Low,289.9,4.2
5,5,32.2024,-94.6048,1,1,2003,2003_3015,3,3,3,1400,6935.101645,2,58.03,0.36,0.000,9.997000,9.16,10.990000,302.1,Low,Moderate,Low,Low,288.5,5.1
6,6,33.3287,-95.5265,1,1,2003,2003_1290,3,3,1,1,393.589989,2,52.50,0.49,0.000,9.997000,13.82,11.681998,317.5,Medium,Moderate,Low,Low,288.3,15.9
7,7,33.3272,-95.5372,1,1,2003,2003_1290,3,3,1,1,393.589989,2,52.50,0.49,0.000,9.997000,13.82,11.681998,307.8,Low,Moderate,Low,Low,288.8,8.3
8,8,30.7567,-96.6769,1,1,2003,2003_2471,3,3,1,1,244.739781,2,64.50,0.59,0.000,9.997000,12.70,17.250000,305.7,Low,Moderate,Low,Low,291.8,6.2
9,9,34.7806,-95.2530,1,1,2003,2003_1445,3,3,3,1400,6849.442774,2,51.17,0.47,0.000,9.997000,6.74,10.010000,307.3,Low,Moderate,Low,Low,289.5,7.7


### Drop the columns not needed identified during the feature seleciton phase
Recommend Features:  'latitude', 'longitude', 'doy', 'temperature', 'wind_speed','humidity', 'fire_region', 'season', 
                 'fuelcode', 'prefire_fuel', 'fuel_moisture_class',
                 'visibility', 'precip_intensity', 'wind_gust'

In [33]:
df_fires = df_fires.drop(columns={'Unnamed: 0','cluster_reference',  
                                  'month', 'year', 'brightness', 'bright_t31','frp', 'visibility', 'covertype',
                                  'fire_intensity', 'fire_intensity_threecat'})

In [34]:
df_fires.columns

Index(['latitude', 'longitude', 'doy', 'fire_region', 'season', 'fuelcode',
       'prefire_fuel', 'fuel_moisture_class', 'temperature', 'humidity',
       'precip_intensity', 'wind_speed', 'wind_gust', 'fire_intensity_twocat',
       'fire_intensity_fourcat'],
      dtype='object')

In [35]:
#show the shape of the data
print(df_fires['fire_intensity_twocat'].value_counts())

Moderate    1311429
Severe        17493
Name: fire_intensity_twocat, dtype: int64


### Setup the features for the pipeline

In [36]:
def find_X_y(df_fires1):

    #remove lat/lon as  test and added fire_region
    features = ['latitude', 'longitude', 'doy','fuelcode', 'fuel_moisture_class', 'prefire_fuel', 'temperature', 'humidity', 'precip_intensity', 
             'wind_gust', 'wind_speed']


    # four categories using balanced binning
    #target = ['fire_intensity_fourcat']
    target = ['fire_intensity_twocat']
    
    y = df_fires1[target]
    y = np.ravel(y)
    X = df_fires1[features]
    
    # Not really needed already 1-4 numerical
    #y = LabelEncoder().fit_transform(y) # Label-encode targets before modeling
    
    return X, y



In [37]:
X, y = find_X_y(df_fires)
X.shape

(1328922, 11)

### Setup the columns transfomer for the pipeline.

In [None]:
# Setup

''' 
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('imputer', SimpleImputer(strategy='most_frequent'))])

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='median'))])

# Setting the transformation for the pipeline below. 
preprocessor = ColumnTransformer (transformers=[
        ('num_features', numeric_transformer, num_features),
        ('cat_features', categorical_transformer, cat_features)])
        
'''

model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

# Define the models to test
<b>Down to our best 2 models</b>

In [38]:
models = [     
            ExtraTreesClassifier(n_estimators=5),
            #RandomForestClassifier(n_estimators=5)
            #RandomForestClassifier(n_estimators=2)
            #BaggingClassifier(n_estimators=3),            
        ]    


In [39]:
def smote_data(X, y):
    
    
    #over_range = {'Severe':20000}
    
    # define pipeline
    #over = SMOTE(sampling_strategy=.1)
    #over = SMOTE(sampling_strategy=over_range)
    #X, y = over.fit_resample(X, y)
    
    #under = RandomUnderSampler(sampling_strategy=.5)
    
    under_range = {'Moderate' :250000}
    under = RandomUnderSampler(sampling_strategy=under_range)
    X, y = under.fit_resample(X,y)
    
    #model_dt = DecisionTreeClassifier()
    #steps = [('over_sample', over), ('under_sample', under), ('model', model_dt)]
    #s_pipeline =  smotePipeline(steps=steps)
    # transform the dataset
    #X, y = s_pipeline.fit_resample(X, y)
    
    #smote models for balance bins for classifier 
    #oversample = SMOTE()
    #X, y = oversample.fit_resample(X, y)
    return X, y

In [40]:
collections.Counter(y)


Counter({'Moderate': 1311429, 'Severe': 17493})

In [41]:
X, y = smote_data(X, y)

In [42]:
collections.Counter(y)


Counter({'Moderate': 250000, 'Severe': 17493})

In [43]:
X.shape

(267493, 11)

# Function to Fit and predict the scores of the model. 
Give us our final scores

In [45]:
def score_model(X, y, estimator, **kwargs):
   

    #define category features to be one hot encoded
    cat_features = ['fuelcode', 'fuel_moisture_class']
     
    #define numeric features for scaling
    num_features = ['longitude', 'latitude', 'doy', 'prefire_fuel', 'temperature', 'humidity', 'precip_intensity', 'wind_gust', 'wind_speed']
    
    #setup simple one hot encoder
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('imputer', SimpleImputer(strategy='most_frequent'))])

    #setup basic
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='median'))])


    # Setting the transformation for the pipeline below. 
    preprocessor = ColumnTransformer (transformers=[
        ('num_features', numeric_transformer, num_features),
        ('cat_features', categorical_transformer, cat_features)])
    
   
    #setup the pipeline
    model = Pipeline(steps=[
          ('preprocessor', preprocessor),
          ('estimator', estimator)
      ])
    
    # Perform 80/20 training/test split
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.20, random_state=42
    )
    

    #Create a cross-validation strategy
    cv = StratifiedKFold(n_splits=5, random_state=42)

    '''
    # Instantiate the classification model and visualizer
    visualizer = CVScores(model, cv=cv, scoring='f1_weighted')

    visualizer.fit(X_train, y_train)        # Fit the data to the visualizer
    visualizer.show()           # Finalize and render the figure
    
    '''
    
    # Train the model on the smote data
    model.fit(X, y, **kwargs)  
    
  
    #testing against smote dataset
    expected = y_test
    predicted = model.predict(X_test)
    
   
    ''' 
    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {}".format(estimator.__class__.__name__ + " : Smote data F1 Micro Score ", f1_score(expected, predicted, average='micro')))
    print("{}: {}".format(estimator.__class__.__name__ + " : Smote data F2 Macro Score ", f1_score(expected, predicted, average='macro')))
   
    print(classification_report(expected, predicted))
    
    '''

    #Get new non smote data to test the model
    df_fires1 = pd.read_csv('data/FireIntensity_Model_June12_Clean.csv').sample(50000)
    X1, y1 = find_X_y(df_fires1)
    
    #testing against note smote dataset
    expected = y1
    predicted = model.predict(X1)
    
     # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {}".format(estimator.__class__.__name__ + " : Original data F1 Micro Score ", f1_score(expected, predicted, average='micro')))
    print("{}: {}".format(estimator.__class__.__name__ + " : Original data F2 Macro Score ", f1_score(expected, predicted, average='macro')))
    
    print(classification_report(expected, predicted))

    #save estimator to disk
    save_estimator(model)
    



# Iterate through the different models

In [46]:
for model in models:
    score_model(X, y, model)

ExtraTreesClassifier : Original data F1 Micro Score : 0.97144
ExtraTreesClassifier : Original data F2 Macro Score : 0.7303530838294194
              precision    recall  f1-score   support

    Moderate       1.00      0.97      0.99     49353
      Severe       0.31      1.00      0.48       647

    accuracy                           0.97     50000
   macro avg       0.66      0.99      0.73     50000
weighted avg       0.99      0.97      0.98     50000



# Function to visualize the model out using yellow brick classification report.


In [None]:
def visualize_model(X, y, estimator):
        
    
    # Processing y.
    y = LabelEncoder().fit_transform(y) # Label-encode targets before modeling
    
   
    #smote models for balance bins for classifier 
    oversample = SMOTE()
    X, y = oversample.fit_resample(X, y)

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('estimator', estimator)
    ])
   

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.33)

    # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(
        model,
        classes=['Low', 'Medium', 'High', 'Severe'], # Classes for equal balanced bins per quartiles
        cmap="Reds", 
        support=True,
        size=(800, 660)
    )

    model.fit(X_train, y_train)

    visualizer.fit(X_train, y_train)  
        
    visualizer.score(X_test, y_test)
 
    
    #optional
    #visualizer.show(outpath=estimator.__class__.__name__ + ".png")  
    
    visualizer.show()
    



# Iterate through the models to visualize

In [None]:
for model in models:
    visualize_model(X, y, model)
    

In [None]:
def confusion_matrix_report (X, y, estimator, **kwargs):
    
    
    # Processing y.
    y = LabelEncoder().fit_transform(y) # Label-encode targets before modeling
    
   
    #smote models for balance bins for classifier 
    oversample = SMOTE()
    X, y = oversample.fit_resample(X, y)
   
    model = Pipeline(steps=[
          ('preprocessor', preprocessor),
          ('estimator', estimator)
      ])
    
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.33)
    
    # Instantiate the classification model and visualizer
    model.fit(X_train, y_train, **kwargs)  
    
    expected =  y_test
    predicted = model.predict(X_test)
    
    print(confusion_matrix(expected, predicted))
    print(classification_report(expected, predicted))

    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {}".format(estimator.__class__.__name__, f1_score(expected, predicted, average='micro')))

   
    return save_estimator(estimator)

In [None]:
for model in models:
    confusion_matrix_report(X, y, model)

---
#### ROCAUC For Bagging Classifier 

In [None]:

# Instantiate the classification model and visualizer
visualizer = ROCAUC(BaggingClassifier(), size=(1080, 720))
                                           
# Create the train and test data
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3)


visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()                 # Draw the data

### ROCAUC Curve for Extra Trees

In [None]:
# Instantiate the classification model and visualizer
visualizer = ROCAUC(ExtraTreesClassifier(n_estimators=10), classes=classes, size=(1080, 720))
                                           
# Create the train and test data
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3)


visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()                 # Draw the data


### Examine Class Prediction Error

In [None]:

def class_pred_error (X,y, estimator):


    classes=['Low', 'Medium', 'High', 'Severe']

    
    oversample = SMOTE()
    X, y = oversample.fit_resample(X, y)
   
    model = Pipeline(steps=[
          ('preprocessor', preprocessor),
          ('estimator', estimator)
      ])
    
    # Perform 80/20 training/test split
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.20, random_state=42
    )

    # Instantiate the classification model and visualizer
    '''
    visualizer = ClassPredictionError(
    RandomForestClassifier(n_estimators=10), 
    classes=classes, size=(1080, 720)
    )
    '''
    
    visualizer = ClassPredictionError(
        model, 
        classes=classes, size=(1080, 720)
    )
    model.fit(X,y)

    # Fit the training data to the visualizer
    visualizer.fit(X_train, y_train)

    # Evaluate the model on the test data
    visualizer.score(X_test, y_test)

    # Draw visualization
    visualizer.show()



In [None]:
for model in models:
    class_pred_error(X, y, model)

### Final F1 Cross Validated Scores


In [None]:
def F1_Cross_Validated_Score (X, y, estimator):


    classes=['Low', 'Medium', 'High', 'Severe']

    
    oversample = SMOTE()
    X, y = oversample.fit_resample(X, y)
   
    model = Pipeline(steps=[
          ('preprocessor', preprocessor),
          ('estimator', estimator)
      ])
    
    
    visualizer = ClassPredictionError(
        model, 
        classes=classes, size=(1080, 720)
    )
    model.fit(X,y)

    # Create a cross-validation strategy
    cv = StratifiedKFold(n_splits=5, random_state=42)

    visualizer = CVScores(
        model, cv=cv, size=(1080, 720)
    )

    visualizer.fit(X, y)
    visualizer.show()
    
    
    # Perform 80/20 training/test split
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.20, random_state=42
    )
    
    expected =  y_test
    predicted = model.predict(X_test)

    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {}".format(estimator.__class__.__name__, f1_score(expected, predicted, average='micro')))

       
    return save_estimator(estimator)

In [None]:
for model in models:
    F1_Cross_Validated_Score(X, y, model)

### Final Summary
ExtraTrees Classifier yeilds a slight better F1 score and is noteably faster to run over bagging.
The precision and recall are slightly better for ExtraTrees.  Med to High fires score the lowest.
Educated guess is the more extreme high and low fires are easier to predict than the ones clumped in the middle.
The report recommends using ExtraTrees as our final Estimator.




### Open the lastest estimator and test

In [None]:
est = open_estimator('pipeline.pickle')