In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, multilabel_confusion_matrix
from sklearn.metrics import auc, roc_curve, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import scale, label_binarize

In [3]:
import pandas as pd  # provides interface for interacting with tabular data
import geopandas as gpd  # combines the capabilities of pandas and shapely for geospatial operations
from shapely.geometry import Point, Polygon, MultiPolygon  # for manipulating text data into geospatial shapes
from shapely import wkt  # stands for "well known text," allows for interchange across GIS programs
import rtree  # supports geospatial join
import os
import fnmatch
import numpy as np
import matplotlib.pyplot as plt
import descartes
import sys
import sklearn
from shapely.ops import nearest_points
from datetime import datetime as dt, date
sys.path.append('C:/Users/jades/1001 Intro to Data Science Notebooks/Project/wildfires-1001/code/functions/')
from gis_processing import *
import pickle

In [4]:
git_dir = 'C:/Users/jades/1001 Intro to Data Science Notebooks/Project/wildfires-1001/'

In [5]:
weather_rename_dict = pd.read_pickle(os.path.join(git_dir, 'data/clean_data/ERA_weather-data/ERA_rename_dictionary.pkl'))
mod_lr = pd.read_pickle(os.path.join(git_dir, 'models/LR_30entropy_1990_2015.pkl'))
mod_svm = pd.read_pickle(os.path.join(git_dir, 'models/linSVC_30entropy_1990_2015_a.pkl'))
feat_list = pd.read_pickle(os.path.join(git_dir, 'models/feature_lists/RF_entropy_top30_features.pkl'))

In [6]:
target_df1 = pd.read_pickle(os.path.join(git_dir, 'data/clean_data/target_df_final_1123_newtargets_1.pkl'))
target_df2 = pd.read_pickle(os.path.join(git_dir, 'data/clean_data/target_df_final_1123_newtargets_2.pkl'))
target_df = target_df1.append(target_df2)
target_df.rename(columns = weather_rename_dict, inplace = True)

In [7]:
non_mod_cols = ['GRID_ID','month_id','MONTH','COUNTYFP','COUNTY_AREA', 'NAME','GRID_AREA','COUNTY_ARE','month_id_old_x','month_id_old_y',
                'geometry','Fire_area','total_fire_days','hist_p_time_1y','total_fire_days','hist_p_time_1y', 
                'hist_p_time_1m', 'month_id_old', 'YEAR']
Y_cols = ['Y_bin', 'Y_fire_count', 'Y_fire_area_prop', 'Y_fire_class_size','Y_bin_new_fire_month',
          'Y_max_new_fire_size_month','Y_count_new_fires_month']

## Binary Target Variable

In [8]:
train_stage1_data = target_df[(target_df['YEAR']>1989) & (target_df['YEAR']<=2005)]
X_train_stage1 = train_stage1_data.drop('YEAR', axis = 1)
for y in Y_cols + non_mod_cols:
    try:
        X_train_stage1.drop(y, inplace = True, axis =1)
    except:
        pass
Y_train_stage1_bin = train_stage1_data['Y_bin_new_fire_month'].to_frame()
Y_train_stage1_area = train_stage1_data['Y_fire_area_prop'].to_frame()

In [9]:
train_stage2_data = target_df[(target_df['YEAR']>2005) & (target_df['YEAR']<=2016)]
X_train_stage2 = train_stage2_data.drop('YEAR', axis = 1)
for y in Y_cols + non_mod_cols:
    try:
        X_train_stage2.drop(y, inplace = True, axis =1)
    except:
        pass
Y_train_stage2_bin = train_stage2_data['Y_bin_new_fire_month'].to_frame()
Y_train_stage2_area = train_stage2_data['Y_fire_area_prop'].to_frame() 

In [10]:
test_data = target_df[target_df['YEAR']>2016]
X_test = test_data.drop('YEAR', axis = 1)
for y in Y_cols + non_mod_cols:
    try:
        X_test.drop(y, inplace = True, axis =1)
    except:
        pass
Y_test_bin = test_data['Y_bin_new_fire_month'].to_frame()
Y_test_area = test_data['Y_fire_area_prop'].to_frame()

In [11]:
X_train_stage1_scaled = pd.DataFrame(scale(X_train_stage1), columns = X_train_stage1.columns)
X_train_stage2_scaled = pd.DataFrame(scale(X_train_stage2), columns = X_train_stage2.columns)
X_test_scaled = pd.DataFrame(scale(X_test), columns = X_test.columns)

## Create Classification Model Using First Training Set

In [12]:
mod_lr = LogisticRegression(C=0.001, class_weight='balanced')
mod_lr.fit(X_train_stage1_scaled[feat_list], Y_train_stage1_bin['Y_bin_new_fire_month'])

LogisticRegression(C=0.001, class_weight='balanced')

In [14]:
#mod_lr = LinearSVC(C=0.0001, class_weight='balanced', dual=False)
#mod_lr.fit(X_train_stage1_scaled[feat_list], Y_train_stage1_bin['Y_bin_new_fire_month'])

LinearSVC(C=0.0001, class_weight='balanced', dual=False)

In [16]:
with open(os.path.join(git_dir, 'models/linSVC_30entropy_1990_2005.pkl'), 'wb') as handle:
    pickle.dump(mod_lr, handle)

## Generate Input for Regression Model Using Second Training Set

In [91]:
preds_stage1 = mod_lr.predict(X_train_stage2_scaled[feat_list])

In [92]:
X_train_stage2['preds_stage1'] = preds_stage1
Y_train_stage2_area['preds_stage1'] = preds_stage1

X_train_stage2 = X_train_stage2[X_train_stage2['preds_stage1'] == 1]
Y_train_stage2_area = Y_train_stage2_area[Y_train_stage2_area['preds_stage1'] == 1]

## Create Regression Model Using Filtered Second Training Set

In [93]:
# Fit model
mod_linr = LinearRegression()
mod_linr.fit(X_train_stage2[feat_list], Y_train_stage2_area['Y_fire_area_prop'])

LinearRegression()

## Generate Input for Regression Model Using Test Set

In [94]:
# Predict using stage 1 model to identify positive instances
preds_stage1 = mod_lr.predict(X_test_scaled[feat_list])

In [95]:
# Create a filtered test df of just the instances flagged as positive from the stage 1 model
X_test['preds_stage1'] = preds_stage1
Y_test_area['preds_stage1'] = preds_stage1

X_test_filtered = X_test[X_test['preds_stage1'] == 1]
Y_test_filtered_area = Y_test_area[Y_test_area['preds_stage1'] == 1]

## Score Regression Model on Filtered Test and Full Test

In [98]:
# Score on the instances flagged as positive from the stage 1 model
preds_stage2 = mod_linr.predict(X_test_filtered[feat_list])
r2_score(Y_test_filtered_area['Y_fire_area_prop'], preds_stage2)

0.3132056335984441

In [83]:
# Join our predictions back on the full test df and null fill with 0s (we predict an area of 0 if the stage 1 model predicts 0)
X_test_filtered['preds_stage2'] = preds_stage2

X_test = X_test.merge(X_test_filtered['preds_stage2'], how='left', left_index=True, right_index=True)
X_test['preds_stage2'] = X_test['preds_stage2'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_filtered['preds_stage2'] = preds_stage2


In [84]:
# Score on all test instances
r2_score(Y_test_area['Y_fire_area_prop'], X_test['preds_stage2'])

0.23220533515415043

In [None]:
# Iterate
# 1. Use different train data set
# 2. Use random forest
# 3. Iterate on linear regression parameters
# 4. Iterate on features used

## Results

In [None]:
# Stage 1: LR, Stage 2: Linear Reg, Train: 1990-2015
# Filtered test preds: 0.3499442581817819
# Full test preds: 0.2542595355564937

# Stage 1: LR, Stage 2: Linear Reg, Train: 1990-2005
# Filtered test preds: 0.2845267658849393
# Full test preds: 0.17590153864987512

# Stage 1: SVM, Stage 2: Linear Reg, Train: 1990-2015
# Filtered test preds: 0.43705572076979404
# Full test preds: 0.3723310731845757

# Stage 1: SVM, Stage 2: Linear Reg, Train: 1990-2005
# Filtered test preds: 0.26957720030158305
# Full test preds: 0.16976467996687516