# Wild Fire Model Evaluation

This notebook evaluates several models on the wildfire dataset in order to predict fire intensity and CO2 emissions. A down-selection will then be made to 2-3 models for further hyper-parameter tuning.

In [1]:
#optional code if warnings become over bearing 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Optional: report the installed scikit-learn release so everyone can confirm
# they are running the same version.
import sklearn

version_message = 'The scikit-learn version is {}.'
print(version_message.format(sklearn.__version__))

The scikit-learn version is 0.20.3.


# Import all the models we will likely use for classification

In [3]:
%matplotlib inline

import os
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

from yellowbrick.classifier import ClassificationReport

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

import os
import pickle


# Helper function to save estimator
Saves to current working directory

In [4]:
def save_estimator(estimator):
    """Pickle ``estimator`` into the current working directory.

    The file name is the estimator's class name, lower-cased, with spaces
    replaced by hyphens and a ``.pickle`` suffix (for example ``dict.pickle``
    for a ``dict``). An existing file with the same name is overwritten.

    Parameters
    ----------
    estimator : object
        Any picklable object.

    Returns
    -------
    str
        Relative path of the file that was written.
    """
    outpath = estimator.__class__.__name__.lower().replace(" ", "-") + ".pickle"

    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open(outpath, 'wb') as f:
        pickle.dump(estimator, f)

    return outpath

In [5]:
def open_estimator(filename):
    """Load and return the object pickled in ``filename``.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only open files that this project wrote itself.
    # The ``with`` block guarantees the handle is closed even if loading fails.
    with open(filename, 'rb') as infile:
        estimator = pickle.load(infile)

    print("Unpickled file", filename)

    return estimator

# Read the Fire Data From Disk to Save Time.
During the feature evaluation the data was pulled from the database and stored locally

In [7]:
# Read the cleaned fire data from the local CSV instead of the database.

# Share of rows to keep; a sample is used to save time.
# Set to 1.0 to keep every row (the rows are still shuffled by sample()).
SAMPLE_FRACTION = 0.20
# Fixed seed so the same rows are drawn on every Restart & Run All.
SAMPLE_SEED = 42

df_fires = (
    pd.read_csv('data/FireIntensity_Model_June7_Clean.csv')
    .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
)




# Show the shape of the file. 
Check data to ensure it's correct.


In [8]:
# Sanity-check the loaded frame: first rows, dimensions, summary statistics
# and column names, printed in that order.
for summary in (df_fires.head(3), df_fires.shape, df_fires.describe(), df_fires.columns):
    print(summary)

   latitude  longitude  doy  month  year cluster_reference  fire_region  \
0   26.5985   -81.4620    2      1  2003          2003_811            2   
1   32.4349  -109.9679    2      1  2003         2003_3004            4   
2   30.9114   -92.7975    3      1  2003         2003_1092            0   

   season  covertype  fuelcode  ...  temperature  humidity  precip_intensity  \
0       3          1         1  ...        75.18      0.67               0.0   
1       3          3      1920  ...        52.58      0.20               0.0   
2       3          3      1160  ...        43.47      0.51               0.0   

   visibility  wind_speed  wind_gust  brightness  fire_intensity  bright_t31  \
0       9.997        3.90  11.681998       305.5             Low       287.0   
1      10.000        9.33  13.040000       326.7            High       291.8   
2       9.446        5.10  10.770000       310.0          Medium       286.9   

    frp  
0   5.2  
1  28.7  
2  10.9  

[3 rows x 22 col

# Store the key features.
Eliminated the variables related directly to fire_intensity due to overfit. 
Consider eliminating doy

In [9]:
# Predictor columns used for modelling.
features = ['fire_region', 'season', 'fuelcode', 'prefire_fuel',
            'fuel_moisture_class', 'temperature', 'humidity',
            'wind_speed']

# Name of the label column, kept as a one-element list.
target = ['fire_intensity']

# Fail early with a readable message if the CSV schema has drifted.
missing_columns = [col for col in features + target if col not in df_fires.columns]
assert not missing_columns, "missing expected columns: {}".format(missing_columns)

# Select the label as a 1-D Series rather than a one-column DataFrame:
# LabelEncoder and the scikit-learn classifiers expect a 1-D y.
y = df_fires[target[0]]
X = df_fires[features]

In [10]:
from sklearn.compose import make_column_transformer

# One-hot encode the categorical columns; every other column passes through
# unchanged. handle_unknown='ignore' encodes a category that was not present
# during fit as all zeros instead of raising, which matters as soon as the
# data is split into train and test rows.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     ['fire_region', 'season', 'fuelcode', 'fuel_moisture_class']),
    remainder='passthrough')

# The transformer is not fitted here: each Pipeline that embeds it fits it
# as part of its own fit() call.

# Encode the string labels as integers. The encoder is kept so the mapping can
# be inspected or inverted later: label_encoder.classes_ lists the original
# names in the order of their integer codes (sorted order).
# The labels are read from df_fires each time so re-running this cell gives
# the same result, and flattened because LabelEncoder expects a 1-D array.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_fires[target].values.ravel())


# Define the models to test
<b>Commented out slower performing models and limited the iterations/estimators for speed</b>

In [11]:
# Full candidate list considered for this comparison (kept for reference):
#   SVC(gamma='auto'), NuSVC(gamma='auto'), LinearSVC(),
#   SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
#   LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3),
#   BaggingClassifier(), ExtraTreesClassifier(n_estimators=100),
#   RandomForestClassifier(n_estimators=100)

# Seed for the randomized ensembles so repeated runs build the same models.
MODEL_SEED = 42

models = [
    SVC(gamma='auto'),
    NuSVC(gamma='auto'),
    # Disabled for speed:
    # LinearSVC(),
    # SGDClassifier(max_iter=25, tol=1e-3),
    # LogisticRegression(solver='lbfgs'),
    BaggingClassifier(random_state=MODEL_SEED),
    ExtraTreesClassifier(n_estimators=20, random_state=MODEL_SEED),
    RandomForestClassifier(n_estimators=20, random_state=MODEL_SEED),
]



# Function to Fit and predict the scores of the model. 
Helps in the initial down selection

In [12]:
def score_model(X, y, estimator, test_size=None, **kwargs):
    """Fit ``estimator`` behind the shared column transformer and report macro F1.

    The estimator is wrapped in a ``Pipeline`` whose first step is the
    notebook-level ``column_trans``. After scoring, the estimator is pickled
    with ``save_estimator``.

    Parameters
    ----------
    X : DataFrame
        Feature columns accepted by ``column_trans``.
    y : array-like
        Target labels.
    estimator : scikit-learn classifier
        Model placed as the last pipeline step.
    test_size : float or int, optional
        When given, it is passed to ``train_test_split``: the pipeline is
        fitted on the training part and F1 is computed on the held-out part.
        This requires the encoder inside ``column_trans`` to tolerate
        categories that are missing from the training rows.
        When ``None`` (default) the pipeline is fitted and scored on the same
        rows. That resubstitution score is optimistic, especially for tree
        ensembles, and should not be read as generalization performance.
    **kwargs
        Forwarded to ``Pipeline.fit``.

    Returns
    -------
    float
        The macro-averaged F1 score that was printed.
    """
    model = Pipeline([
        ('trans', column_trans),
        ('estimator', estimator)
    ])

    if test_size is None:
        # Original behaviour: evaluate on the rows used for fitting.
        X_fit, y_fit = X, y
        X_eval, y_eval = X, y
    else:
        X_fit, X_eval, y_fit, y_eval = tts(X, y, test_size=test_size)

    model.fit(X_fit, y_fit, **kwargs)
    predicted = model.predict(X_eval)

    # F1 is the harmonic mean of precision and recall; 'macro' averages the
    # per-class scores without weighting by class frequency.
    score = f1_score(y_eval, predicted, average='macro')
    print("{}: {}".format(estimator.__class__.__name__, score))

    # NOTE(review): only the final estimator is pickled, not the pipeline, so
    # the saved file expects input already transformed by column_trans.
    save_estimator(estimator)

    return score
    
    


# Iterate through the different models

In [15]:
# Fit, score and pickle every candidate classifier in turn.
for candidate in models:
    score_model(X, y, candidate)

SVC: 0.49312927908581494
NuSVC: 0.6393926867621572
BaggingClassifier: 0.894489237381965
ExtraTreesClassifier: 0.9089920815952419
RandomForestClassifier: 0.9066119596829675


# Function to visualize the model output using the Yellowbrick classification report.
<b>An error is thrown here when using the LabelEncoder and one_hot_encoder.</b>

In [14]:
def visualize_model(X, y, estimator, classes=None):
    """Draw a Yellowbrick classification report for ``estimator``.

    The estimator is wrapped in a ``Pipeline`` whose first step is the
    notebook-level ``column_trans``. The data is split 80/20, the pipeline is
    fitted on the training part and the report is scored on the held-out part.

    Parameters
    ----------
    X : DataFrame
        Feature columns accepted by ``column_trans``.
    y : array-like
        Integer-encoded target labels.
    estimator : scikit-learn classifier
        Model placed as the last pipeline step.
    classes : list of str, optional
        Display names for the classes, ordered by their integer code.
        Defaults to the four intensity labels in sorted order.
    """
    if classes is None:
        # LabelEncoder assigns codes in sorted label order, so the display
        # names must be listed the same way (High=0, Low=1, Medium=2,
        # Severe=3). Listing them by severity instead would mislabel the rows.
        # Assumes y was encoded from exactly these four labels.
        classes = ['High', 'Low', 'Medium', 'Severe']

    model = Pipeline([
        ('trans', column_trans),
        ('estimator', estimator)
    ])

    # Instantiate the visualizer around the pipeline.
    visualizer = ClassificationReport(
        model,
        classes=classes,
        cmap="Reds",
        support=True,
        size=(800, 660)
    )

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20)

    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)

    # Optional: to also write the figure to disk, use
    # visualizer.show(outpath=estimator.__class__.__name__ + ".png")
    visualizer.show()
 

    

# Iterate through the models to visualize

In [None]:
# Render a classification report for every candidate classifier in turn.
for candidate in models:
    visualize_model(X, y, candidate)
    