## Still to do: Print/graph evaluation in digestible format, run on PCA, QA

In [1]:
#import packages
import pandas as pd  # provides interface for interacting with tabular data
import geopandas as gpd  # combines the capabilities of pandas and shapely for geospatial operations
import rtree  # supports geospatial join
import os
import fnmatch
import numpy as np
import matplotlib.pyplot as plt
import sys
import pickle
from shapely.ops import nearest_points
from datetime import datetime as dt, date
sys.path.append('/Users/jackepstein/Documents/GitHub/wildfires-1001/code/functions/')
from modeling_functions import *
data_dir = '/Users/jackepstein/Documents/GitHub/wildfires-1001/data'
code_dir = '/Users/jackepstein/Documents/GitHub/wildfires-1001/code'
model_dir = '/Users/jackepstein/Documents/GitHub/wildfires-1001/models'

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, multilabel_confusion_matrix
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import scale, label_binarize
from sklearn.multiclass import OneVsRestClassifier

# Pull in main data frame

In [3]:
#pull in the target data frame and weather dictionary 
#make sure to change the pkl file name if needed
target_dict = {}
target_df = gpd.GeoDataFrame()
for i in np.arange(1, 3):
    target_dict[i] = pd.read_pickle(os.path.join(data_dir, f'clean_data/target_df_final_1123_newtargets_{i}.pkl')) 
    target_df = target_df.append(target_dict[i])


weather_dict_path = os.path.join(data_dir, 'clean_data/ERA_weather-data/ERA_rename_dictionary.pkl')

In [4]:
#load the naming dictionary
with open(weather_dict_path, 'rb') as handle:
    rename_dict = pickle.load(handle)

In [5]:
#rename the columns based on this dictionary
target_df.rename(columns = rename_dict, inplace = True)

In [6]:
#create lists of columns to drop and what our targets are
non_mod_cols = ['GRID_ID','month_id','MONTH','COUNTYFP','NAME','GRID_AREA','COUNTY_ARE','COUNTY_AREA',
                'geometry', 'Fire_area','Index','index']
bad_features = ['hist_p_time_1m', 'total_fire_days', 'hist_p_time_1y','month_id_old']
Y_cols = ['Y_bin', 'Y_fire_count', 'Y_fire_area_prop', 'Y_fire_class_size','Y_bin_new_fire_month',
          'Y_max_new_fire_size_month','Y_count_new_fires_month']

In [7]:
#convert floats from 64 to 32 for model
for col in target_df.columns:
    if target_df[col].dtypes == 'float64':
        target_df[col] = target_df[col].astype(np.float32)

# Pull in Models and Feature Lists

In [8]:
#pull in models

#list of models
model_list = ['LR_15PCA_1990_2015.pkl', 'LR_30entropy_1990_2015.pkl', 'linSVC_25PCA_1990_2015.pkl', 
              'LR_15PCA_1990_2005.pkl', 'LR_20gini_1990_2005.pkl', 'linSVC_15PCA_1990_2005.pkl', 
              'linSVC_30gini_1990_2005.pkl', 'linSVC_35entropy_1990_2015.pkl']


#get all paths for loading
model_path_list = []
for m in model_list:
    mod_path = os.path.join(model_dir, m)
    model_path_list.append(mod_path)

#load the models into a dictionary with the file as the key and the model as the value
models = {}
for m in range(len(model_list)):
    with open(model_path_list[m], 'rb') as handle:
        models[model_list[m]] = pickle.load(handle)

In [9]:
#pull in feature lists

#w ill need 30e
feat_list = ['RF_entropy_top30_features.pkl', 'RF_gini_top20_features_1990_2005.pkl', 
             'RF_gini_top30_features_1990_2005.pkl', 'RF_entropy_top35_features.pkl']

#get paths for loading
feat_path_list = []
for f in feat_list:
    feat_path = os.path.join(model_dir,'feature_lists',f)
    feat_path_list.append(feat_path)

features = {}
for f in range(len(feat_list)):
    with open(feat_path_list[f], 'rb') as handle:
        features[feat_list[f]] = pickle.load(handle)

# Split training and testing data

### This is the split where using the 1990-2015 data all as training

In [10]:
#generate training data set
#pre 2016
train_data = target_df[target_df['YEAR']<2016]
X_train = train_data.drop('YEAR', axis = 1)
#drop columns not used for modeling - dont drop Ys here
for y in Y_cols + non_mod_cols + bad_features:
    try:
        X_train.drop(y, inplace = True, axis =1)
    except:
        pass
#set up target variable
Y_train_cl = train_data[['Y_bin_new_fire_month']]
Y_train_cl_size = train_data[['Y_max_new_fire_size_month']]
Y_train_cl_arr = Y_train_cl.to_numpy().ravel()
Y_train_size_arr = Y_train_cl_size.to_numpy().ravel()

#generate testing data set - same logic as above
test_data = target_df[target_df['YEAR']>=2016]
X_test = test_data.drop('YEAR', axis = 1)
for y in Y_cols + non_mod_cols + bad_features:
    try:
        X_test.drop(y, inplace = True, axis =1)
    except:
        pass
Y_test_cl = test_data[['Y_bin_new_fire_month']]
Y_test_cl_size = test_data[['Y_max_new_fire_size_month']]
Y_test_cl_arr = Y_test_cl.to_numpy().ravel()
Y_test_size_arr = Y_test_cl_size.to_numpy().ravel()

### This is the split where using the 1990-2005 on initial test, 2006-2015 on 2nd stage and 2016-2019 on final testing

In [11]:
#generate phase1 data set
#pre 2006
phase1_data = target_df[target_df['YEAR']<2006]
X_phase1 = phase1_data.drop('YEAR', axis = 1)
#drop columns not used for modeling - dont drop Ys here
for y in Y_cols + non_mod_cols + bad_features:
    try:
        X_phase1.drop(y, inplace = True, axis =1)
    except:
        pass
#set up target variable
Y_ph1_cl = phase1_data[['Y_bin_new_fire_month']]
Y_ph1_cl_size = phase1_data[['Y_max_new_fire_size_month']]
Y_ph1_cl_arr = Y_ph1_cl.to_numpy().ravel()
Y_ph1_cl_size_arr = Y_ph1_cl_size.to_numpy().ravel()

#generate phase2 data set - same logic as above
phase2_data = target_df[(target_df['YEAR']>=2006)&(target_df['YEAR']<2016)]
X_phase2 = phase2_data.drop('YEAR', axis = 1)
for y in Y_cols + non_mod_cols + bad_features:
    try:
        X_phase2.drop(y, inplace = True, axis =1)
    except:
        pass
Y_ph2_cl = phase2_data[['Y_bin_new_fire_month']]
Y_ph2_cl_size = phase2_data[['Y_max_new_fire_size_month']]
Y_ph2_cl_arr = Y_ph2_cl.to_numpy().ravel()
Y_ph2_cl_size_arr = Y_ph2_cl_size.to_numpy().ravel()

#generate phase3 (test) data set
phase3_data = target_df[target_df['YEAR']>=2016]
X_phase3 = phase3_data.drop('YEAR', axis = 1)
for y in Y_cols + non_mod_cols + bad_features:
    try:
        X_phase3.drop(y, inplace = True, axis =1)
    except:
        pass
Y_ph3_cl = phase3_data[['Y_bin_new_fire_month']]
Y_ph3_cl_size = phase3_data[['Y_max_new_fire_size_month']]
Y_ph3_cl_arr = Y_ph3_cl.to_numpy().ravel()
Y_ph3_cl_size_arr = Y_ph3_cl_size.to_numpy().ravel()

In [12]:
#scale all data sets
X_train_scaled = pd.DataFrame(scale(X_train), columns = X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scale(X_test), columns = X_test.columns, index=X_test.index)
X_phase1_scaled = pd.DataFrame(scale(X_phase1), columns = X_test.columns, index=X_phase1.index)
X_phase2_scaled = pd.DataFrame(scale(X_phase2), columns = X_test.columns, index=X_phase2.index)
X_phase3_scaled = pd.DataFrame(scale(X_phase3), columns = X_test.columns, index=X_phase3.index)

# Test 1: Taking LR model, 30entropy features, 1990-2015

### 1A. Take all instances in the training set that are predicted binary positive and test size classification on the testing set

In [13]:
#1. slim down features
X_tr_sc_30featentr = X_train_scaled[features['RF_entropy_top30_features.pkl']]
X_test_sc_30featentr = X_test_scaled[features['RF_entropy_top30_features.pkl']]

In [14]:
#2. run model on train set to get predictions
y_preds_test1 = models['LR_30entropy_1990_2015.pkl'].predict(X_tr_sc_30featentr)

In [15]:
#3. store predictions and filter

#append the these predictions to the dataframe
preds = pd.DataFrame(y_preds_test1, columns=['preds'], index=X_tr_sc_30featentr.index)
X_tr_sc_30featentr_preds = X_tr_sc_30featentr.merge(preds, how='outer', left_index=True, right_index=True)


#rejoin with the y-size column
X_tr_sc_30featentr_ysize = X_tr_sc_30featentr_preds.merge(Y_train_cl_size, how='outer', left_index=True, right_index=True)


#filter for the positive predicted instances
X_tr_sc_30featentr_cut = X_tr_sc_30featentr_ysize.loc[X_tr_sc_30featentr_ysize['preds']==1]


#drop preds and class_size columns, while saving class size as a new training y
Y_train_cl_size_cut1 = X_tr_sc_30featentr_cut['Y_max_new_fire_size_month']
X_tr_sc_30featentr_ready = X_tr_sc_30featentr_cut.drop(columns=['preds','Y_max_new_fire_size_month'])

In [16]:
#regularization hyperparam options
cs = [10**i for i in range(-4, 2)] 

In [17]:
#4. run LR on all Cs and store all confusion matrices and AUCs
conf_mats_lr_t1 = {}
aucs_lr_t1 = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_tr_sc_30featentr_ready, 
                                                                               Y_train_cl_size_cut1.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, lr.predict(X_test_sc_30featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_test_sc_30featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t1[c] = np.mean(aucs_sub)
    conf_mats_lr_t1[c] = cm

In [18]:
aucs_lr_t1

{0.0001: 0.796473482371128,
 0.001: 0.8314413758249598,
 0.01: 0.8428852785540163,
 0.1: 0.8459346572328914,
 1: 0.846694214876033,
 10: 0.8468562340210476}

In [19]:
#5. run SVM on all Cs and store all confusion matrices and AUCs
conf_mats_svm_t1 = {}
aucs_svm_t1 = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_tr_sc_30featentr_ready, 
                                                                               Y_train_cl_size_cut1.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, svm.predict(X_test_sc_30featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_test_sc_30featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t1[c] = np.mean(aucs_sub)
    conf_mats_svm_t1[c] = cm

In [20]:
aucs_svm_t1

{0.0001: 0.34535569891194484,
 0.001: 0.7812971936500387,
 0.01: 0.8308326892205243,
 0.1: 0.8658377430287175,
 1: 0.8685274094773768,
 10: 0.868768208573637}

### 1B. Same test data set and same features, only predicting on the actual positive instances

In [21]:
#filter for only the postitve instances (not positive preds)
X_tr_sc_30featentr_cut2 = X_tr_sc_30featentr_ysize.loc[X_tr_sc_30featentr_ysize['Y_max_new_fire_size_month']>0]


#drop preds and class_size columns, while saving class size as a new training y
Y_train_cl_size_cut2 = X_tr_sc_30featentr_cut2['Y_max_new_fire_size_month']
X_tr_sc_30featentr_ready2 = X_tr_sc_30featentr_cut2.drop(columns=['preds','Y_max_new_fire_size_month'])

In [22]:
#run LR on all Cs and store all confusion matrices and AUCs
#still need to store AUCs - HAVING ISSUES BECAUSE WE DONT HAVE ANY 0'S (MAYBE DO N=3)
conf_mats_lr_t1b = {}
aucs_lr_t1b = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_tr_sc_30featentr_ready2, 
                                                                               Y_train_cl_size_cut2.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, lr.predict(X_test_sc_30featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_test_sc_30featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t1b[c] = np.mean(aucs_sub)
    conf_mats_lr_t1b[c] = cm

In [23]:
aucs_lr_t1b

{0.0001: 0.736455353049966,
 0.001: 0.7168862526279184,
 0.01: 0.7091852662693833,
 0.1: 0.7122557814204195,
 1: 0.7129216524666867,
 10: 0.7129690735501004}

In [24]:
#run SVM on all Cs and store all confusion matrices and AUCs
conf_mats_svm_t1b = {}
aucs_svm_t1b = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_tr_sc_30featentr_ready2, 
                                                                               Y_train_cl_size_cut2.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, svm.predict(X_test_sc_30featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_test_sc_30featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t1b[c] = np.mean(aucs_sub)
    conf_mats_svm_t1b[c] = cm

In [25]:
aucs_svm_t1b

{0.0001: 0.271604255251885,
 0.001: 0.5739205775887959,
 0.01: 0.695472669648926,
 0.1: 0.7131666613976574,
 1: 0.7149736022635664,
 10: 0.7151563710225565}

# Test 2: Taking LR model, 20gini features, train on 2006-2015

### 2A. Use predictive positives in 2006-2015 as training data

In [26]:
#use the 'LR_20gini_1990_2005.pkl' model
#use the 'RF_gini_top20_features_1990_2005.pkl' feature list
#key -- use 1990-2005 to get predictions of positive instances in 2006-2015 using imported model
#then use these positive predictions to train LR/SVM models for class size

In [27]:
#1. slim down features
X_ph1_sc_20gi = X_phase1_scaled[features['RF_gini_top20_features_1990_2005.pkl']]
X_ph2_sc_20gi = X_phase2_scaled[features['RF_gini_top20_features_1990_2005.pkl']]
X_ph3_sc_20gi = X_phase3_scaled[features['RF_gini_top20_features_1990_2005.pkl']]

In [28]:
#2. run model on train set to get predictions
y_preds_test2 = models['LR_20gini_1990_2005.pkl'].predict(X_ph2_sc_20gi)

In [29]:
#3. store predictions and filter

#append the these predictions to the dataframe
preds2 = pd.DataFrame(y_preds_test2, columns=['preds'], index=X_ph2_sc_20gi.index)
X_ph2_sc_20gi_preds = X_ph2_sc_20gi.merge(preds2, how='outer', left_index=True, right_index=True)


#rejoin with the y-size column
X_ph2_sc_20gi_ysize = X_ph2_sc_20gi_preds.merge(Y_ph2_cl_size, how='outer', left_index=True, right_index=True)


#filter for the positive predicted instances
X_ph2_sc_20gi_cut = X_ph2_sc_20gi_ysize.loc[X_ph2_sc_20gi_ysize['preds']==1]


#drop preds and class_size columns, while saving class size as a new training y
Y_ph2_cl_size_cut1 = X_ph2_sc_20gi_cut['Y_max_new_fire_size_month']
X_ph2_sc_20gi_ready = X_ph2_sc_20gi_cut.drop(columns=['preds','Y_max_new_fire_size_month'])

In [30]:
#4. run LR on all Cs and store all confusion matrices and AUCs
conf_mats_lr_t2 = {}
aucs_lr_t2 = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_ph2_sc_20gi_ready, 
                                                                               Y_ph2_cl_size_cut1.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, lr.predict(X_ph3_sc_20gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_ph3_sc_20gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t2[c] = np.mean(aucs_sub)
    conf_mats_lr_t2[c] = cm

In [31]:
aucs_lr_t2

{0.0001: 0.7056238480290148,
 0.001: 0.7542489149176527,
 0.01: 0.8331031868719903,
 0.1: 0.8477993638147333,
 1: 0.8496573815327902,
 10: 0.8498513585825554}

In [32]:
#5. run SVM on all Cs and store all confusion matrices and AUCs
conf_mats_svm_t2 = {}
aucs_svm_t2 = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_ph2_sc_20gi_ready, 
                                                                               Y_ph2_cl_size_cut1.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, svm.predict(X_ph3_sc_20gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_ph3_sc_20gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t2[c] = np.mean(aucs_sub)
    conf_mats_svm_t2[c] = cm

In [33]:
aucs_svm_t2

{0.0001: 0.19141670134966407,
 0.001: 0.4899481241453118,
 0.01: 0.8541106783994291,
 0.1: 0.8672884832629764,
 1: 0.8682397883346217,
 10: 0.8683445805339199}

### 2B. Same test data set and same features, only predicting on the actual positive instances

In [34]:
#filter for only the postitve instances in the phase 2 set(not positive preds)
X_ph2_sc_20gi_cut2 = X_ph2_sc_20gi_ysize.loc[X_ph2_sc_20gi_ysize['Y_max_new_fire_size_month']>0]


#drop preds and class_size columns, while saving class size as a new training y
Y_ph2_cl_size_cut2 = X_ph2_sc_20gi_cut2['Y_max_new_fire_size_month']
X_ph2_sc_20gi_ready2 = X_ph2_sc_20gi_cut2.drop(columns=['preds','Y_max_new_fire_size_month'])

In [35]:
#run LR on all Cs and store all confusion matrices and AUCs
conf_mats_lr_t2b = {}
aucs_lr_t2b = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_ph2_sc_20gi_ready2, 
                                                                               Y_ph2_cl_size_cut2.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, lr.predict(X_ph3_sc_20gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_ph3_sc_20gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t2b[c] = np.mean(aucs_sub)
    conf_mats_lr_t2b[c] = cm

In [36]:
aucs_lr_t2b

{0.0001: 0.716391295069788,
 0.001: 0.7132585397467713,
 0.01: 0.7113715758026018,
 0.1: 0.69361534388189,
 1: 0.6854233517221757,
 10: 0.6842220176090289}

In [37]:
#run SVM on all Cs and store all confusion matrices and AUCs
conf_mats_svm_t2b = {}
aucs_svm_t2b = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_ph2_sc_20gi_ready2, 
                                                                               Y_ph2_cl_size_cut2.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, svm.predict(X_ph3_sc_20gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_ph3_sc_20gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t2b[c] = np.mean(aucs_sub)
    conf_mats_svm_t2b[c] = cm

In [38]:
aucs_svm_t2b

{0.0001: 0.24192557260958222,
 0.001: 0.4725817223337496,
 0.01: 0.6686797575201935,
 0.1: 0.6892585318432574,
 1: 0.6908323190490492,
 10: 0.690975570238528}

# Test 3: Taking SVM model, 30entropy features, 1990-2015

In [39]:
# use 'linSVC_35entropy_1990_2015.pkl' model
# use 'RF_entropy_top35_features.pkl' features

### 3A. Take all instances in the training set that are predicted binary positive and test size classification on the testing set

In [40]:
#1. slim down features
X_tr_sc_35featentr = X_train_scaled[features['RF_entropy_top35_features.pkl']]
X_test_sc_35featentr = X_test_scaled[features['RF_entropy_top35_features.pkl']]

In [41]:
#2. run model on train set to get predictions
y_preds_test3 = models['linSVC_35entropy_1990_2015.pkl'].predict(X_tr_sc_35featentr)

In [42]:
#3. store predictions and filter

#append the these predictions to the dataframe
preds3 = pd.DataFrame(y_preds_test3, columns=['preds'], index=X_tr_sc_35featentr.index)
X_tr_sc_35featentr_preds = X_tr_sc_35featentr.merge(preds3, how='outer', left_index=True, right_index=True)


#rejoin with the y-size column
X_tr_sc_35featentr_ysize = X_tr_sc_35featentr_preds.merge(Y_train_cl_size, how='outer', left_index=True, right_index=True)


#filter for the positive predicted instances
X_tr_sc_35featentr_cut = X_tr_sc_35featentr_ysize.loc[X_tr_sc_35featentr_ysize['preds']==1]


#drop preds and class_size columns, while saving class size as a new training y
Y_train_cl_size_cut3 = X_tr_sc_35featentr_cut['Y_max_new_fire_size_month']
X_tr_sc_35featentr_ready = X_tr_sc_35featentr_cut.drop(columns=['preds','Y_max_new_fire_size_month'])

In [43]:
#4. run LR on all Cs and store all confusion matrices and AUCs
conf_mats_lr_t3 = {}
aucs_lr_t3 = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_tr_sc_35featentr_ready, 
                                                                               Y_train_cl_size_cut3.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, lr.predict(X_test_sc_35featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_test_sc_35featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t3[c] = np.mean(aucs_sub)
    conf_mats_lr_t3[c] = cm

In [44]:
aucs_lr_t3

{0.0001: 0.7810006540222367,
 0.001: 0.8390874903383079,
 0.01: 0.8555978357809619,
 0.1: 0.8600399845412925,
 1: 0.8611064867114572,
 10: 0.8612142517391046}

In [45]:
#5. run SVM on all Cs and store all confusion matrices and AUCs
conf_mats_svm_t3 = {}
aucs_svm_t3 = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_tr_sc_35featentr_ready, 
                                                                               Y_train_cl_size_cut3.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, svm.predict(X_test_sc_35featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_test_sc_35featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t3[c] = np.mean(aucs_sub)
    conf_mats_svm_t3[c] = cm

In [46]:
aucs_svm_t3

{0.0001: 0.4024258279326952,
 0.001: 0.7917191866341639,
 0.01: 0.8618534098341162,
 0.1: 0.8765889767524823,
 1: 0.8773195493192224,
 10: 0.877375289850764}

### 3B. Same test data set and same features, only predicting on the actual positive instances

In [47]:
#filter for only the postitve instances (not positive preds)
X_tr_sc_35featentr_cut3 = X_tr_sc_35featentr_ysize.loc[X_tr_sc_35featentr_ysize['Y_max_new_fire_size_month']>0]


#drop preds and class_size columns, while saving class size as a new training y
Y_train_cl_size_cut3b = X_tr_sc_35featentr_cut3['Y_max_new_fire_size_month']
X_tr_sc_35featentr_ready2 = X_tr_sc_35featentr_cut3.drop(columns=['preds','Y_max_new_fire_size_month'])

In [48]:
#run LR on all Cs and store all confusion matrices and AUCs
conf_mats_lr_t3b = {}
aucs_lr_t3b = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_tr_sc_35featentr_ready2, 
                                                                               Y_train_cl_size_cut3b.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, lr.predict(X_test_sc_35featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_test_sc_35featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t3b[c] = np.mean(aucs_sub)
    conf_mats_lr_t3b[c] = cm

In [49]:
aucs_lr_t3b

{0.0001: 0.682612664590677,
 0.001: 0.6941231446501115,
 0.01: 0.7092623255299307,
 0.1: 0.7170433349667262,
 1: 0.718073755591736,
 10: 0.7181774892117035}

In [50]:
#run SVM on all Cs and store all confusion matrices and AUCs
conf_mats_svm_t3b = {}
aucs_svm_t3b = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_tr_sc_35featentr_ready2, 
                                                                               Y_train_cl_size_cut3b.to_numpy().ravel())

    cm = confusion_matrix(Y_test_size_arr, svm.predict(X_test_sc_35featentr), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_test_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_test_sc_35featentr)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t3b[c] = np.mean(aucs_sub)
    conf_mats_svm_t3b[c] = cm

In [51]:
aucs_svm_t3b

{0.0001: 0.28688471144270744,
 0.001: 0.5606930591340911,
 0.01: 0.697494981268672,
 0.1: 0.7172033811232474,
 1: 0.7191565369963486,
 10: 0.7193659801147589}

# Test 4: Taking SVM model, 30entropy features, 2005-2015 as testing

In [52]:
# use 'linSVC_30gini_1990_2005.pkl' model
# use 'RF_gini_top30_features_1990_2005.pkl' features
features.keys()

dict_keys(['RF_entropy_top30_features.pkl', 'RF_gini_top20_features_1990_2005.pkl', 'RF_gini_top30_features_1990_2005.pkl', 'RF_entropy_top35_features.pkl'])

In [53]:
#1. slim down features
X_ph1_sc_30gi = X_phase1_scaled[features['RF_gini_top30_features_1990_2005.pkl']]
X_ph2_sc_30gi = X_phase2_scaled[features['RF_gini_top30_features_1990_2005.pkl']]
X_ph3_sc_30gi = X_phase3_scaled[features['RF_gini_top30_features_1990_2005.pkl']]

In [54]:
#2. run model on train set to get predictions
y_preds_test4 = models['linSVC_30gini_1990_2005.pkl'].predict(X_ph2_sc_30gi)

In [55]:
#3. store predictions and filter

#append the these predictions to the dataframe
preds4 = pd.DataFrame(y_preds_test4, columns=['preds'], index=X_ph2_sc_30gi.index)
X_ph2_sc_30gi_preds = X_ph2_sc_30gi.merge(preds4, how='outer', left_index=True, right_index=True)


#rejoin with the y-size column
X_ph2_sc_30gi_ysize = X_ph2_sc_30gi_preds.merge(Y_ph2_cl_size, how='outer', left_index=True, right_index=True)


#filter for the positive predicted instances
X_ph2_sc_30gi_cut = X_ph2_sc_30gi_ysize.loc[X_ph2_sc_30gi_ysize['preds']==1]


#drop preds and class_size columns, while saving class size as a new training y
Y_ph2_cl_size_cut4 = X_ph2_sc_30gi_cut['Y_max_new_fire_size_month']
X_ph2_sc_30gi_ready = X_ph2_sc_30gi_cut.drop(columns=['preds','Y_max_new_fire_size_month'])

In [56]:
#4. run LR on all Cs and store all confusion matrices and AUCs
aucs_lr_t4 = {}
conf_mats_lr_t4 = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_ph2_sc_30gi_ready, 
                                                                               Y_ph2_cl_size_cut4.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, lr.predict(X_ph3_sc_30gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_ph3_sc_30gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t4[c] = np.mean(aucs_sub)
    conf_mats_lr_t4[c] = cm

In [57]:
aucs_lr_t4

{0.0001: 0.6418737737083061,
 0.001: 0.7645557108032581,
 0.01: 0.8519561210535704,
 0.1: 0.8653189844818361,
 1: 0.867006807776919,
 10: 0.8671836910636781}

In [58]:
#5. run SVM on all Cs and store all confusion matrices and AUCs
aucs_svm_t4 = {}
conf_mats_svm_t4 = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_ph2_sc_30gi_ready, 
                                                                               Y_ph2_cl_size_cut4.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, svm.predict(X_ph3_sc_30gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_ph3_sc_30gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t4[c] = np.mean(aucs_sub)
    conf_mats_svm_t4[c] = cm

In [59]:
aucs_svm_t4

{0.0001: 0.22068196682323563,
 0.001: 0.6584339140258042,
 0.01: 0.8714400380522028,
 0.1: 0.8767316725132291,
 1: 0.8771404364112017,
 10: 0.8771731375230395}

### 4B. Same test data set and same features, only predicting on the actual positive instances

In [60]:
#filter for only the postitve instances in the phase 2 set(not positive preds)
X_ph2_sc_30gi_cut4 = X_ph2_sc_30gi_ysize.loc[X_ph2_sc_30gi_ysize['Y_max_new_fire_size_month']>0]


#drop preds and class_size columns, while saving class size as a new training y
Y_ph2_cl_size_cut4 = X_ph2_sc_30gi_cut4['Y_max_new_fire_size_month']
X_ph2_sc_30gi_ready4 = X_ph2_sc_30gi_cut4.drop(columns=['preds','Y_max_new_fire_size_month'])

In [61]:
#run LR on all Cs and store all confusion matrices and AUCs
conf_mats_lr_t4b = {}
aucs_lr_t4b = {}
for c in cs:
    lr = LogisticRegression(C=c, max_iter=1500, class_weight = 'balanced').fit(X_ph2_sc_30gi_ready4, 
                                                                               Y_ph2_cl_size_cut4.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, lr.predict(X_ph3_sc_30gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], lr.predict_proba(X_ph3_sc_30gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_lr_t4b[c] = np.mean(aucs_sub)
    conf_mats_lr_t4b[c] = cm

In [62]:
aucs_lr_t4b

{0.0001: 0.5664725036751339,
 0.001: 0.6385545263424117,
 0.01: 0.7148896274283546,
 0.1: 0.7291159524524604,
 1: 0.7295279231146167,
 10: 0.7294261653731249}

In [63]:
#run SVM on all Cs and store all confusion matrices and AUCs
conf_mats_svm_t4b = {}
aucs_svm_t4b = {}
for c in cs:
    svm = LinearSVC(C=c, class_weight = 'balanced', dual=False).fit(X_ph2_sc_30gi_ready4, 
                                                                               Y_ph2_cl_size_cut4.to_numpy().ravel())

    cm = confusion_matrix(Y_ph3_cl_size_arr, svm.predict(X_ph3_sc_30gi), normalize='true')
    
    #store aucs -- will aggregate after taking an auc for each class -- simple mean
    y = label_binarize(Y_ph3_cl_size_arr, classes=[0,1,2,3])
    n_classes = y.shape[1]
    aucs_sub = np.zeros(n_classes)
    for n in range(n_classes-1):
        fpr, tpr, thresholds = roc_curve(y[:,n], svm.decision_function(X_ph3_sc_30gi)[:,n])
        aucs_sub = auc(fpr,tpr)
    
    aucs_svm_t4b[c] = np.mean(aucs_sub)
    conf_mats_svm_t4b[c] = cm

In [64]:
aucs_svm_t4b

{0.0001: 0.30706831007065744,
 0.001: 0.4863348244629562,
 0.01: 0.6987308932551412,
 0.1: 0.7301187107788123,
 1: 0.7330884561275943,
 10: 0.7333957052305455}