# About this notebook

This notebook explores classifiers for fire occurrence.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

In [2]:
%%time
# Load the integrated dataset and normalise the two columns the models need
# in numeric form: `date` becomes a plain number and the `fire_occurred`
# flag becomes an integer label.
df = pd.read_parquet('integratedData.parquet.gz').assign(
    date=lambda frame: pd.to_numeric(frame["date"]),
    fire_occurred=lambda frame: frame["fire_occurred"].astype(int),
)

Wall time: 692 ms


In [3]:
# Dataset dimensions as (rows, columns), followed by a preview of the first rows.
print(f"{df.shape}")
df.head(n=5)

(6501450, 20)


Unnamed: 0,date,precipitation_amount_mm,relative_humidity_%,specific_humidity_kg/kg,surface_downwelling_shortwave_flux_in_air_W m-2,wind_from_direction_Degrees Clockwise from north,wind_speed_m/s,max_air_temperature_K,min_air_temperature_K,burning_index_g_Unitless,dead_fuel_moisture_100hr_Percent,dead_fuel_moisture_1000hr_Percent,energy_release_component-g_Unitless,potential_evapotranspiration_mm,mean_vapor_pressure_deficit_kPa,fire_occurred,acres_burned,fire_name,longitude,latitude
0,946684800000000000,0.0,40.5,0.006,139.7,222.0,2.1,292.0,282.2,31.0,12.3,12.1,48.0,1.8,0.69,0,,,-117.975,33.566667
1,946684800000000000,0.0,40.9,0.00593,136.6,222.0,2.1,291.9,282.2,32.0,12.2,12.0,48.0,1.8,0.7,0,,,-117.933333,33.566667
2,946684800000000000,0.0,41.2,0.0058,133.2,222.0,2.3,292.1,282.1,33.0,11.8,11.5,50.0,1.9,0.72,0,,,-117.891667,33.566667
3,946684800000000000,0.0,40.8,0.00577,129.8,218.0,2.4,292.0,281.4,34.0,11.8,11.4,50.0,1.9,0.69,0,,,-117.85,33.566667
4,946684800000000000,0.0,38.2,0.00547,126.5,218.0,2.5,292.6,281.0,37.0,10.7,10.5,55.0,2.1,0.78,0,,,-117.808333,33.566667


In [4]:
# Class balance of the target: how many rows have a fire (1) vs. none (0).
df["fire_occurred"].value_counts()

0    6499707
1       1743
Name: fire_occurred, dtype: int64

## Baseline

Fire occurrence is very rare: only 1,743 of the 6,501,450 rows are positive. Overall accuracy would therefore be misleading, so we'll evaluate using class-wise precision and recall. The baseline will be to never predict that a fire occurs.

In [9]:
def pretty_print_results(actual, predicted):
    """Print per-class precision, recall, F-score and support as a table.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels, encoded as 0 (no fire) / 1 (fire).
    predicted : array-like
        Predicted labels, using the same encoding as ``actual``.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    # Pin the label order so there are always exactly two metric rows, in the
    # same order as the "No fire" / "Fire" index below. Without this, inputs
    # in which only one class appears yield length-1 arrays and building the
    # DataFrame fails on the index length mismatch.
    precision, recall, fscore, support = precision_recall_fscore_support(
        actual,
        predicted,
        labels=[0, 1],
        # Report 0 instead of warning when a class is never predicted.
        zero_division=0.)
    res_df = pd.DataFrame(data={
        "precision": precision,
        "recall": recall,
        "fscore": fscore,
        "support": support,
    }, index=["No fire", "Fire"])
    print(res_df)

In [6]:
# Baseline classifier: predict "no fire" (label 0) for every single row.
shape = df.fire_occurred.shape
pred = np.zeros(shape)
pretty_print_results(df.fire_occurred, pred)

         precision  recall    fscore  support
No fire   0.999732     1.0  0.999866  6499707
Fire      0.000000     0.0  0.000000     1743


  _warn_prf(average, modifier, msg_start, len(result))


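## SVM

The following section explores support vector machine (SVM) classifiers. Note: `SVC` and `LinearSVC` are imported below, but the only model fitted so far is a logistic regression.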

In [10]:
%%time
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

# Predictor columns handed to the classifier, in the order they appear in X.
# Left out on purpose: the target 'fire_occurred', plus 'acres_burned' and
# 'fire_name' (presumably only populated for rows with a fire -- TODO confirm).
# NOTE(review): this list is repeated verbatim in every model cell; consider
# hoisting it into a single shared constant.
features = [
    # time
    "date",
    # weather measurements
    "precipitation_amount_mm",
    "relative_humidity_%",
    "specific_humidity_kg/kg",
    "surface_downwelling_shortwave_flux_in_air_W m-2",
    "wind_from_direction_Degrees Clockwise from north",
    "wind_speed_m/s",
    "max_air_temperature_K",
    "min_air_temperature_K",
    # burning index, fuel moisture and energy release
    "burning_index_g_Unitless",
    "dead_fuel_moisture_100hr_Percent",
    "dead_fuel_moisture_1000hr_Percent",
    "energy_release_component-g_Unitless",
    # evapotranspiration and vapor pressure deficit
    "potential_evapotranspiration_mm",
    "mean_vapor_pressure_deficit_kPa",
    # location
    "longitude",
    "latitude",
]

# Feature matrix and binary target (0 = no fire, 1 = fire).
X, y = df[features], df["fire_occurred"]

# Stratified folds keep the (very small) share of fire rows equal across splits.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

# skf.split() is a generator without a length, so the fold count is passed
# explicitly; otherwise tqdm cannot show progress against a known total.
for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits()):
    print("Fitting split...")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # NOTE(review): the features go in unscaled (`date` is on the order of
    # 1e18 while most other columns are below 1e3). scikit-learn documents
    # that 'sag' converges quickly only on similarly scaled features, so a
    # scaler is worth trying before drawing conclusions from this model.
    # NOTE(review): n_jobs=8 looks ineffective here -- the fit log reports a
    # single task ("Done 1 out of 1"); confirm against the sklearn docs.
    model = LogisticRegression(random_state=42, verbose=1, solver='sag', n_jobs=8)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pretty_print_results(y_test, y_pred)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Fitting split...


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 12 epochs took 33 seconds


[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:   33.5s finished


         precision  recall    fscore  support
No fire   0.999732     1.0  0.999866  2166569
Fire      0.000000     0.0  0.000000      581
Fitting split...


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 8 epochs took 22 seconds


[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:   22.1s finished


         precision  recall    fscore  support
No fire   0.999732     1.0  0.999866  2166569
Fire      0.000000     0.0  0.000000      581
Fitting split...


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 15 epochs took 41 seconds


[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:   41.5s finished


         precision  recall    fscore  support
No fire   0.999732     1.0  0.999866  2166569
Fire      0.000000     0.0  0.000000      581

Wall time: 1min 46s


### Results

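Logistic regression isn't able to predict fire occurrences: as configured above (unscaled features, no class weighting), it predicts "no fire" for every test row in all three folds, so its scores are identical to the baseline.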

## Decision tree classifier and boosted decision tree classifier

In [13]:
%%time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

# Predictor columns for the decision tree, in the order they appear in X.
# Left out on purpose: the target 'fire_occurred', plus 'acres_burned' and
# 'fire_name' (presumably only populated for rows with a fire -- TODO confirm).
# NOTE(review): this list is repeated verbatim in every model cell; consider
# hoisting it into a single shared constant.
features = [
    # time
    "date",
    # weather measurements
    "precipitation_amount_mm",
    "relative_humidity_%",
    "specific_humidity_kg/kg",
    "surface_downwelling_shortwave_flux_in_air_W m-2",
    "wind_from_direction_Degrees Clockwise from north",
    "wind_speed_m/s",
    "max_air_temperature_K",
    "min_air_temperature_K",
    # burning index, fuel moisture and energy release
    "burning_index_g_Unitless",
    "dead_fuel_moisture_100hr_Percent",
    "dead_fuel_moisture_1000hr_Percent",
    "energy_release_component-g_Unitless",
    # evapotranspiration and vapor pressure deficit
    "potential_evapotranspiration_mm",
    "mean_vapor_pressure_deficit_kPa",
    # location
    "longitude",
    "latitude",
]

# Feature matrix and binary target (0 = no fire, 1 = fire).
X, y = df[features], df["fire_occurred"]

# Stratified folds keep the (very small) share of fire rows equal across splits.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

# skf.split() is a generator without a length, so the fold count is passed
# explicitly; otherwise tqdm cannot show progress against a known total.
for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits()):
    print("Fitting split...")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Depth is capped at 10 levels; the seed keeps the fitted tree repeatable.
    model = DecisionTreeClassifier(random_state=42, max_depth=10)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pretty_print_results(y_test, y_pred)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Fitting split...
         precision    recall    fscore  support
No fire   0.999832  0.999967  0.999899  2166569
Fire      0.750865  0.373494  0.498851      581
Fitting split...
         precision    recall    fscore  support
No fire   0.999838  0.999940  0.999889  2166569
Fire      0.638889  0.395869  0.488842      581
Fitting split...
         precision    recall    fscore  support
No fire   0.999828  0.999962  0.999895  2166569
Fire      0.715753  0.359725  0.478809      581

Wall time: 1min 57s


In [16]:
%%time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

# Predictor columns for the depth-1 tree, in the order they appear in X.
# Left out on purpose: the target 'fire_occurred', plus 'acres_burned' and
# 'fire_name' (presumably only populated for rows with a fire -- TODO confirm).
# NOTE(review): this list is repeated verbatim in every model cell; consider
# hoisting it into a single shared constant.
features = [
    # time
    "date",
    # weather measurements
    "precipitation_amount_mm",
    "relative_humidity_%",
    "specific_humidity_kg/kg",
    "surface_downwelling_shortwave_flux_in_air_W m-2",
    "wind_from_direction_Degrees Clockwise from north",
    "wind_speed_m/s",
    "max_air_temperature_K",
    "min_air_temperature_K",
    # burning index, fuel moisture and energy release
    "burning_index_g_Unitless",
    "dead_fuel_moisture_100hr_Percent",
    "dead_fuel_moisture_1000hr_Percent",
    "energy_release_component-g_Unitless",
    # evapotranspiration and vapor pressure deficit
    "potential_evapotranspiration_mm",
    "mean_vapor_pressure_deficit_kPa",
    # location
    "longitude",
    "latitude",
]

# Feature matrix and binary target (0 = no fire, 1 = fire).
X, y = df[features], df["fire_occurred"]

# Stratified folds keep the (very small) share of fire rows equal across splits.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

# skf.split() is a generator without a length, so the fold count is passed
# explicitly; otherwise tqdm cannot show progress against a known total.
for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits()):
    print("Fitting split...")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # A single-split tree, i.e. the weak learner on its own. Seeded like the
    # other models in this notebook so that reruns give the same tree.
    model = DecisionTreeClassifier(random_state=42, max_depth=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pretty_print_results(y_test, y_pred)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Fitting split...
         precision  recall    fscore  support
No fire   0.999732     1.0  0.999866  2166569
Fire      0.000000     0.0  0.000000      581
Fitting split...
         precision  recall    fscore  support
No fire   0.999732     1.0  0.999866  2166569
Fire      0.000000     0.0  0.000000      581
Fitting split...
         precision  recall    fscore  support
No fire   0.999732     1.0  0.999866  2166569
Fire      0.000000     0.0  0.000000      581

Wall time: 26.4 s


In [19]:
%%time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

# Predictor columns for the boosted trees, in the order they appear in X.
# Left out on purpose: the target 'fire_occurred', plus 'acres_burned' and
# 'fire_name' (presumably only populated for rows with a fire -- TODO confirm).
# NOTE(review): this list is repeated verbatim in every model cell; consider
# hoisting it into a single shared constant.
features = [
    # time
    "date",
    # weather measurements
    "precipitation_amount_mm",
    "relative_humidity_%",
    "specific_humidity_kg/kg",
    "surface_downwelling_shortwave_flux_in_air_W m-2",
    "wind_from_direction_Degrees Clockwise from north",
    "wind_speed_m/s",
    "max_air_temperature_K",
    "min_air_temperature_K",
    # burning index, fuel moisture and energy release
    "burning_index_g_Unitless",
    "dead_fuel_moisture_100hr_Percent",
    "dead_fuel_moisture_1000hr_Percent",
    "energy_release_component-g_Unitless",
    # evapotranspiration and vapor pressure deficit
    "potential_evapotranspiration_mm",
    "mean_vapor_pressure_deficit_kPa",
    # location
    "longitude",
    "latitude",
]

# Feature matrix and binary target (0 = no fire, 1 = fire).
X, y = df[features], df["fire_occurred"]

# Depth-1 tree used as the weak learner that AdaBoost combines.
weak_estimator = DecisionTreeClassifier(max_depth=1)

# Stratified folds keep the (very small) share of fire rows equal across splits.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

# skf.split() is a generator without a length, so the fold count is passed
# explicitly; otherwise tqdm cannot show progress against a known total.
for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits()):
    print("Fitting split...")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The weak learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases; the
    # positional form works with both. Seeded like the other models in this
    # notebook so that reruns are repeatable.
    model = AdaBoostClassifier(weak_estimator, n_estimators=50, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pretty_print_results(y_test, y_pred)




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Fitting split...
         precision    recall    fscore  support
No fire   0.999742  0.999980  0.999861  2166569
Fire      0.328125  0.036145  0.065116      581
Fitting split...
         precision    recall    fscore  support
No fire   0.999741  0.999986  0.999863  2166569
Fire      0.392157  0.034423  0.063291      581
Fitting split...
         precision    recall    fscore  support
No fire   0.999738  0.999981  0.999859  2166569
Fire      0.250000  0.024096  0.043956      581

Wall time: 21min 49s
