# Wild Fire Model Evaluation

This notebooks will evaluate severals model on the wildfire dataset in order to make a predecction on fire intensity and CO2 emissions. A down selection will be made to 2-3 models for further hyper-paramater tuning. 

In [1]:
#optional code if warnings become over bearing 
import warnings
warnings.filterwarnings('ignore')

# Import all the models we will likely use for classificaiton

In [11]:
%matplotlib inline

import os
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

from yellowbrick.classifier import ClassificationReport

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

# Read the Fire Data From Disk to Save Time.
During the feature evaluation the data was pulled from the database and stored locally

In [3]:
#This helper function allows for a random sample instead of reading everything

import random

def read_random_sample(sample_size, file_name):


    total_records = 1328922 #number of records in file
    skip = sorted(random.sample(range(total_records), total_records-sample_size))
    
    #don't skip the header line of the file
    if (skip[0] == 0):
        skip[0] = 2

        
    df = pd.read_csv(file_name, skiprows=skip)
    return df
    

In [9]:
#Read the data from disk for faster access

#get a sample of the data to save time - comment out if you want all the data
df_fires = read_random_sample(50000, 'data/FireIntensity_Model_June7_Clean.csv')

#use this line to get all the data
#df_fires = pd.read_csv('data/FireIntensity_Model_June7_Clean.csv')



# Show the shape of the file. 
Check data to ensure it's correct.


In [None]:
#show the shape of the data

print(df_fires.head(5))
print (df_fires.shape)
print (df_fires.describe())
print(df_fires.columns)

# Store the key features.
Eliminated the variables related to directly to fire_intensity due to overfit. 

In [10]:
features = ['doy', 'fire_region', 'season', 'covertype', 'prefire_fuel', 
            'fuel_moisture_class', 'temperature', 'humidity', 'precip_intensity', 
            'visibility', 'wind_speed', 'wind_gust']

target = ['fire_intensity']


y = df_fires[target]
X = df_fires[features]

# Define the models to test
<b>Need to Add a NB Classifier to the list</b>

In [12]:

models = [
    SVC(gamma='auto'), NuSVC(gamma='auto'), LinearSVC(), 
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(), 
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3), 
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=100), 
    RandomForestClassifier(n_estimators=100)
]



# Function to Fit and predict the scores of the model. 
Helps in the initial down selection

In [13]:
def score_model(X, y, estimator, **kwargs):
    """
    Test various estimators.
    """ 
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
         ('one_hot_encoder', OneHotEncoder(categories='auto')), 
         ('estimator', estimator)
    ])

    # Instantiate the classification model and visualizer
    model.fit(X, y, **kwargs)  
    
    expected  = y
    predicted = model.predict(X)
    
    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {}".format(estimator.__class__.__name__, f1_score(expected, predicted, average='macro')))


# Iterate through the different models

In [14]:
for model in models:
    score_model(X, y, model)

SVC: 0.22493002047808716
NuSVC: 0.5317737276953297
LinearSVC: 0.7555194265950275
SGDClassifier: 0.6310169478072348
KNeighborsClassifier: 0.5554706619306066
LogisticRegression: 0.6582424535751203
LogisticRegressionCV: 0.41568845628998474
BaggingClassifier: 0.8822001937726779


KeyboardInterrupt: 

# Function to visualize the model out using yellow brick classification report.
<b> Error is thrown here using the LabelEncoder and one_hot_encoder <b/>

In [None]:
def visualize_model(X, y, estimator):
    """
    Test various estimators.
    """ 
   # y = LabelEncoder().fit_transform(y)
    model = Pipeline([
        # ('one_hot_encoder', OneHotEncoder(categories='auto')), 
         ('estimator', estimator)
    ])
    
   
    # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(
        model, 
        classes=['Low', 'Medium', 'High', 'Severe'], 
        cmap="Reds", 
        support=True,
        size=(800, 660)
    )
    
    
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20)
    
    
    visualizer.fit(X_train, y_train)  
    
 
    
    visualizer.score(X_test, y_test)
    
    
    visualizer.show()  

    

In [None]:
# Iterate through the models to visualize

In [None]:
for model in models:
    visualize_model(X, y, model)