In [29]:
# For now, clear outputs (Cell > All Output > Clear) before committing to Git
# There might be a better way

import sqlite3
import pandas as pd
cnx = sqlite3.connect('FPA_FOD_20170508.sqlite')

import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree, preprocessing
import sklearn.ensemble as ske
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [106]:
df = pd.read_sql_query("SELECT FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE,FIRE_SIZE_CLASS FROM 'Fires'", cnx)
print(df.head())

   FIRE_YEAR DISCOVERY_TIME STAT_CAUSE_DESCR  CONT_DATE CONT_TIME   LATITUDE  \
0       2005           1300    Miscellaneous  2453403.5      1730  40.036944   
1       2004           0845        Lightning  2453137.5      1530  38.933056   
2       2004           1921   Debris Burning  2453156.5      2024  38.984167   
3       2004           1600        Lightning  2453189.5      1400  38.559167   
4       2004           1600        Lightning  2453189.5      1200  38.559167   

    LONGITUDE STATE  DISCOVERY_DATE  FIRE_SIZE FIRE_SIZE_CLASS  
0 -121.005833    CA       2453403.5       0.10               A  
1 -120.404444    CA       2453137.5       0.25               A  
2 -120.735556    CA       2453156.5       0.10               A  
3 -119.913333    CA       2453184.5       0.10               A  
4 -119.933056    CA       2453184.5       0.10               A  


In [31]:
df['DATE'] = pd.to_datetime(df['DISCOVERY_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
print(df.head())

   FIRE_YEAR STAT_CAUSE_DESCR   LATITUDE   LONGITUDE STATE  DISCOVERY_DATE  \
0       2005    Miscellaneous  40.036944 -121.005833    CA       2453403.5   
1       2004        Lightning  38.933056 -120.404444    CA       2453137.5   
2       2004   Debris Burning  38.984167 -120.735556    CA       2453156.5   
3       2004        Lightning  38.559167 -119.913333    CA       2453184.5   
4       2004        Lightning  38.559167 -119.933056    CA       2453184.5   

   FIRE_SIZE       DATE  
0       0.10 2005-02-02  
1       0.25 2004-05-12  
2       0.10 2004-05-31  
3       0.10 2004-06-28  
4       0.10 2004-06-28  


In [32]:
df['MONTH'] = pd.DatetimeIndex(df['DATE']).month
df['DAY'] = pd.DatetimeIndex(df['DATE']).day
df['DAY_OF_WEEK'] = df['DATE'].dt.weekday_name
print(df.head())

   FIRE_YEAR STAT_CAUSE_DESCR   LATITUDE   LONGITUDE STATE  DISCOVERY_DATE  \
0       2005    Miscellaneous  40.036944 -121.005833    CA       2453403.5   
1       2004        Lightning  38.933056 -120.404444    CA       2453137.5   
2       2004   Debris Burning  38.984167 -120.735556    CA       2453156.5   
3       2004        Lightning  38.559167 -119.913333    CA       2453184.5   
4       2004        Lightning  38.559167 -119.933056    CA       2453184.5   

   FIRE_SIZE       DATE  MONTH  DAY DAY_OF_WEEK  
0       0.10 2005-02-02      2    2   Wednesday  
1       0.25 2004-05-12      5   12   Wednesday  
2       0.10 2004-05-31      5   31      Monday  
3       0.10 2004-06-28      6   28      Monday  
4       0.10 2004-06-28      6   28      Monday  


In [33]:
le = preprocessing.LabelEncoder()
df['STAT_CAUSE_DESCR'] = le.fit_transform(df['STAT_CAUSE_DESCR'])
df['STATE'] = le.fit_transform(df['STATE'])
df['DAY_OF_WEEK'] = le.fit_transform(df['DAY_OF_WEEK'])
print(df.head())

   FIRE_YEAR  STAT_CAUSE_DESCR   LATITUDE   LONGITUDE  STATE  DISCOVERY_DATE  \
0       2005                 7  40.036944 -121.005833      4       2453403.5   
1       2004                 6  38.933056 -120.404444      4       2453137.5   
2       2004                 3  38.984167 -120.735556      4       2453156.5   
3       2004                 6  38.559167 -119.913333      4       2453184.5   
4       2004                 6  38.559167 -119.933056      4       2453184.5   

   FIRE_SIZE       DATE  MONTH  DAY  DAY_OF_WEEK  
0       0.10 2005-02-02      2    2            6  
1       0.25 2004-05-12      5   12            6  
2       0.10 2004-05-31      5   31            1  
3       0.10 2004-06-28      6   28            1  
4       0.10 2004-06-28      6   28            1  


In [34]:
labels=df['FIRE_SIZE']
labels.head()

0    0.10
1    0.25
2    0.10
3    0.10
4    0.10
Name: FIRE_SIZE, dtype: float64

In [35]:
logits=df.drop(['FIRE_SIZE','DATE','DISCOVERY_DATE'],axis=1)
logits.head()

Unnamed: 0,FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,MONTH,DAY,DAY_OF_WEEK
0,2005,7,40.036944,-121.005833,4,2,2,6
1,2004,6,38.933056,-120.404444,4,5,12,6
2,2004,3,38.984167,-120.735556,4,5,31,1
3,2004,6,38.559167,-119.913333,4,6,28,1
4,2004,6,38.559167,-119.933056,4,6,28,1


In [36]:
x_train, x_test, y_train, y_test = train_test_split(logits,labels,test_size=0.2)

In [55]:
reg = LinearRegression().fit(x_train, y_train)

In [56]:
reg.score(x_test,y_test)

0.0026156352077382206

In [41]:
import tensorflow
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation

In [64]:
from keras.activations import relu

In [101]:
model=Sequential()
model.add(Dense(50,input_shape=[8]))
model.add(Activation('relu'))
model.add(Dense(1))

In [102]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 50)                450       
_________________________________________________________________
activation_6 (Activation)    (None, 50)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 51        
Total params: 501
Trainable params: 501
Non-trainable params: 0
_________________________________________________________________


In [103]:
model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

In [104]:
model.fit(x_train,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a31b50908>

In [97]:
model.evaluate(x_test,y_test)



[5554605.522218565, 0.0]

In [98]:
a=model.predict(x_test)

In [100]:
a[100]

array([0.], dtype=float32)

In [None]:
df['STAT_CAUSE_DESCR'].value_counts().plot(kind='barh',color='coral')
plt.show()

In [None]:
df_arson = df[df['STAT_CAUSE_DESCR']=='Arson']
df_arson['DAY_OF_WEEK'].value_counts().plot(kind='barh',color='coral')
plt.show()

In [None]:
def plot_corr(df,size=10):
    corr = df.corr()  #the default method is pearson
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr,cmap=plt.cm.Oranges)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
    for tick in ax.get_xticklabels():
        tick.set_rotation(45)    
    plt.show()
    

    
plot_corr(df)

In [None]:
df.plot(kind='scatter',x='LONGITUDE',y='LATITUDE',color='coral',alpha=0.3)
plt.show()

In [None]:
df_lightning = df[df['STAT_CAUSE_DESCR']=='Lightning']
df_lightning['DAY_OF_WEEK'].value_counts().plot(kind='barh',color='coral')
plt.show()

In [None]:
df['DAY_OF_WEEK'].value_counts().plot(kind='barh',color='coral')
plt.show()

In [None]:
df['STATE'].value_counts().head(n=10).plot(kind='barh',color='coral')
plt.show()

In [None]:
df_CA = df[df['STATE']=='CA']
df_CA['STAT_CAUSE_DESCR'].value_counts().plot(kind='barh',color='coral',title='causes of fires for CA')
plt.show()