In [119]:
from sklearn import tree
import pandas as pd
import os

# read in csv file and make dataframe

In [120]:
df = pd.read_csv('/Users/marlonschieber/Desktop/weather_api/model_input.csv')
df.head()

Unnamed: 0,date,aqi,precip,humidity,windspeed,winddir,acres_burned,temp_f,cloud_cover
0,1/1/17,18,0.0,75,10,269,0.0,51,51
1,1/2/17,5,0.0,78,7,198,0.0,49,49
2,1/3/17,0,2.4,89,13,149,0.0,54,54
3,1/4/17,10,1.1,90,7,271,0.0,55,55
4,1/5/17,10,0.0,65,5,200,0.0,48,48


# set aqi as target, remove unwanted columns, and set data variable

In [121]:
target = df["aqi"]


In [122]:
data = df.drop("date", axis=1)
data = data.drop("aqi", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,precip,humidity,windspeed,winddir,acres_burned,temp_f,cloud_cover
0,0.0,75,10,269,0.0,51,51
1,0.0,78,7,198,0.0,49,49
2,2.4,89,13,149,0.0,54,54
3,1.1,90,7,271,0.0,55,55
4,0.0,65,5,200,0.0,48,48


# split data and set x and y variables

In [123]:
 from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

# find score of decision tree, and random forest 

In [124]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.03825136612021858

In [125]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.03278688524590164

# list order of importance of variables for tree and forest classifiers

In [126]:
 sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.23739808662108158, 'winddir'),
 (0.20839354025969728, 'humidity'),
 (0.13521720679028848, 'windspeed'),
 (0.1297198437151778, 'cloud_cover'),
 (0.12472005918976264, 'temp_f'),
 (0.10842942354531834, 'acres_burned'),
 (0.05612183987867385, 'precip')]

In [127]:
 sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.21652901436594674, 'winddir'),
 (0.20161107914147688, 'humidity'),
 (0.1470681934247492, 'temp_f'),
 (0.1460900136349943, 'cloud_cover'),
 (0.1296620801456762, 'windspeed'),
 (0.09315167765510049, 'acres_burned'),
 (0.06588794163205605, 'precip')]

# re-attempt model with a single variable to achieve a higher score

In [128]:
data = pd.DataFrame({
    'winddir': df['winddir']
})

feature_names = data.columns 
data.head()

Unnamed: 0,winddir
0,269
1,198
2,149
3,271
4,200


In [129]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [130]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.02185792349726776

# apply filter on dates, to run models on fire seasons only for both years 

In [131]:
#  convert to datetime
df['date'] = pd.to_datetime(df['date'])

start_date = '06/30/2017'
end_date = '12/31/2017'

mask1 = (df['date'] > start_date) & (df['date'] <= end_date)


df1 = df.loc[mask1]
df1


df['date'] = pd.to_datetime(df['date'])

start_date = '06/30/2018'
end_date = '12/31/2018'

mask2 = (df['date'] > start_date) & (df['date'] <= end_date)


df2 = df.loc[mask2]
df2


                                       

Unnamed: 0,date,aqi,precip,humidity,windspeed,winddir,acres_burned,temp_f,cloud_cover
546,2018-07-01,17,0.0,70,6,244,6449.142857,77,77
547,2018-07-02,24,0.0,76,6,244,6449.142857,74,74
548,2018-07-03,54,0.1,73,7,224,6449.142857,73,73
549,2018-07-04,49,0.1,75,7,233,6449.142857,67,67
550,2018-07-05,32,0.0,67,5,258,9904.415584,75,75
551,2018-07-06,40,0.0,55,4,263,9904.415584,81,81
552,2018-07-07,42,0.0,54,6,262,9904.415584,82,82
553,2018-07-08,46,0.0,59,6,247,9904.415584,79,79
554,2018-07-09,44,0.0,53,6,257,9904.415584,83,83
555,2018-07-10,47,0.0,58,6,251,9904.415584,82,82


In [132]:
final_df = pd.concat([df1, df2], ignore_index=True)
final_df

Unnamed: 0,date,aqi,precip,humidity,windspeed,winddir,acres_burned,temp_f,cloud_cover
0,2017-07-01,41,0.1,77,9,240,396.915663,70,70
1,2017-07-02,20,0.1,78,6,225,396.915663,73,73
2,2017-07-03,18,0.0,76,6,248,396.915663,75,75
3,2017-07-04,22,0.0,77,6,248,396.915663,74,74
4,2017-07-05,9,0.0,76,6,252,396.915663,74,74
5,2017-07-06,25,0.0,64,6,252,396.915663,81,81
6,2017-07-07,45,0.0,52,6,251,396.915663,89,89
7,2017-07-08,54,0.0,47,6,243,396.915663,88,88
8,2017-07-09,50,0.0,68,6,241,396.915663,79,79
9,2017-07-10,45,0.0,74,7,237,396.915663,76,76


In [133]:
target = final_df["aqi"]

data = final_df.drop("date", axis=1)
data = data.drop("aqi", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,precip,humidity,windspeed,winddir,acres_burned,temp_f,cloud_cover
0,0.1,77,9,240,396.915663,70,70
1,0.1,78,6,225,396.915663,73,73
2,0.0,76,6,248,396.915663,75,75
3,0.0,77,6,248,396.915663,74,74
4,0.0,76,6,252,396.915663,74,74


# re-attempt models on fire seasons of both years to achieve higher score

In [134]:
 from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [135]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.05434782608695652

In [139]:
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.24814703554835882, 'winddir'),
 (0.16726012813787416, 'cloud_cover'),
 (0.14576822580118948, 'humidity'),
 (0.1380113067207116, 'acres_burned'),
 (0.13104030963122604, 'temp_f'),
 (0.11579890933489169, 'windspeed'),
 (0.0539740848257483, 'precip')]

In [136]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.06521739130434782

In [137]:
 sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.21652890249992165, 'winddir'),
 (0.19176324385485097, 'humidity'),
 (0.14200388819264892, 'cloud_cover'),
 (0.14037224013265023, 'temp_f'),
 (0.12858214438504578, 'acres_burned'),
 (0.1183288660325556, 'windspeed'),
 (0.062420714902326785, 'precip')]