# Modeling

## Marta ---------------------------------------------------------

put all of your code between here and the next person's name only

In [38]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [32]:
noaa_on_fire = pd.read_csv('data/DONT_PUSH_mfi_df_yr_trail.csv')

In [34]:
noaa_on_fire = pd.get_dummies(noaa_on_fire, columns=['state', 'month'], drop_first= True)

In [35]:
noaa_on_fire.columns

Index(['Unnamed: 0', 'stat_cause_descr', 'fire_size', 'fire_size_class',
       'cont_date_fixed', 'disc_date_fixed', 'time_burnt', 'year', 'yearmonth',
       'pcp', 'tavg', 'pdsi', 'phdi', 'zndx', 'pmdi', 'cdd', 'hdd', 'sp02',
       'sp03', 'sp06', 'sp09', 'sp12', 'sp24', 'tmin', 'tmax', 'tavg_t12m',
       'tavg_t9m', 'tavg_t6m', 'tavg_t3m', 'pcp_t12m', 'pcp_t9m', 'pcp_t6m',
       'pcp_t3m', 'pmdi_t12m', 'pmdi_t9m', 'pmdi_t6m', 'pmdi_t3m', 'pdsi_t12m',
       'pdsi_t9m', 'pdsi_t6m', 'pdsi_t3m', 'state_CA', 'state_CO', 'state_ID',
       'state_MT', 'state_NM', 'state_NV', 'state_OR', 'state_UT', 'state_WA',
       'state_WY', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12'],
      dtype='object')

In [48]:
y = noaa_on_fire['fire_size']
X_vars = ['state_CA', 'state_CO', 'state_ID',
       'state_MT', 'state_NM', 'state_NV', 'state_OR', 'state_UT', 'state_WA',
       'state_WY', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12','tavg_t3m', 'pcp_t12m', 'pmdi_t12m']
X = noaa_on_fire[X_vars]

In [43]:
noaa_on_fire.dropna(subset = X_vars, inplace=True)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

stan = StandardScaler()
X_train = stan.fit_transform(X_train)
X_test = stan.transform(X_test)

In [66]:
lr = LinearRegression()
lasso = Lasso()

In [67]:
for q in [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]:
    print(f"Using data corresponding to fire size > {q} quantile:")
    noaa_on_fire = noaa_on_fire[noaa_on_fire['fire_size']> noaa_on_fire['fire_size'].quantile(q)]
    X_train, X_test, y_train, y_test = train_test_split(X,y)

    stan = StandardScaler()
    X_train = stan.fit_transform(X_train)
    X_test = stan.transform(X_test)
    
    lr = LinearRegression()
    lasso = Lasso()
    
    lr.fit(X_train, y_train)
    lasso.fit(X_train, y_train)
    print(f"Linear Regression achieves R2 of {round(lr.score(X_train, y_train),4)} on train data and {round(lr.score(X_test, y_test),4)} on test data.")
    print(f"Lasso Regression achieves R2 of {round(lasso.score(X_train, y_train),4)} on train data and {round(lasso.score(X_test, y_test),4)} on test data.\n")

Using data corresponding to fire size > 0.1 quantile:
Linear Regression achieves R2 of 0.0015 on train data and 0.0007 on test data.
Lasso Regression achieves R2 of 0.0015 on train data and 0.0007 on test data.

Using data corresponding to fire size > 0.15 quantile:
Linear Regression achieves R2 of 0.0014 on train data and 0.0007 on test data.
Lasso Regression achieves R2 of 0.0014 on train data and 0.0007 on test data.

Using data corresponding to fire size > 0.2 quantile:
Linear Regression achieves R2 of 0.0014 on train data and 0.0008 on test data.
Lasso Regression achieves R2 of 0.0014 on train data and 0.0008 on test data.

Using data corresponding to fire size > 0.25 quantile:
Linear Regression achieves R2 of 0.0011 on train data and 0.0016 on test data.
Lasso Regression achieves R2 of 0.0011 on train data and 0.0016 on test data.

Using data corresponding to fire size > 0.3 quantile:
Linear Regression achieves R2 of 0.0012 on train data and 0.0014 on test data.
Lasso Regression 

## Jesse ---------------------------------------------------------

put all of your code between here and the next person's name only

## CM ---------------------------------------------------------

put all of your code between here and the next person's name only

## Kira ---------------------------------------------------------

put all of your code between here and the next person's name only

In [1]:
# Import libraries.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

In [2]:
mfi_df = pd.read_csv('data/mfi_df_yr.csv')
mfi_df.head()

Unnamed: 0.1,Unnamed: 0,fire_year,stat_cause_descr,fire_size,fire_size_class,state,cont_date_fixed,disc_date_fixed,time_burnt_fixed,time_burnt,...,cdd,hdd,sp02,sp03,sp06,sp09,sp12,sp24,tmin,tmax
0,0,2005,Miscellaneous,0.1,A,CA,2005-02-02,2005-02-02,0 days 00:00:00.000000000,0.0,...,0,421,0.58,0.85,1.0,0.92,0.24,0.48,38.1,56.1
1,1,2004,Lightning,0.25,A,CA,2004-05-12,2004-05-12,0 days 00:00:00.000000000,0.0,...,56,126,-1.19,-1.86,-0.05,-0.32,-0.31,-0.04,49.2,77.2
2,2,2004,Debris Burning,0.1,A,CA,2004-05-31,2004-05-31,0 days 00:00:00.000000000,0.0,...,56,126,-1.19,-1.86,-0.05,-0.32,-0.31,-0.04,49.2,77.2
3,3,2004,Lightning,0.1,A,CA,2004-07-03,2004-06-28,5 days 00:00:00.000000000,5.0,...,115,44,-1.1,-1.39,-0.61,-0.32,-0.29,-0.04,55.6,85.2
4,4,2004,Lightning,0.1,A,CA,2004-07-03,2004-06-28,5 days 00:00:00.000000000,5.0,...,115,44,-1.1,-1.39,-0.61,-0.32,-0.29,-0.04,55.6,85.2


In [3]:
mfi_df.columns

Index(['Unnamed: 0', 'fire_year', 'stat_cause_descr', 'fire_size',
       'fire_size_class', 'state', 'cont_date_fixed', 'disc_date_fixed',
       'time_burnt_fixed', 'time_burnt', 'month', 'year', 'statecode',
       'division', 'yearmonth', 'pcp', 'tavg', 'pdsi', 'phdi', 'zndx', 'pmdi',
       'cdd', 'hdd', 'sp02', 'sp03', 'sp06', 'sp09', 'sp12', 'sp24', 'tmin',
       'tmax'],
      dtype='object')

In [4]:
list(mfi_df.columns.values)

['Unnamed: 0',
 'fire_year',
 'stat_cause_descr',
 'fire_size',
 'fire_size_class',
 'state',
 'cont_date_fixed',
 'disc_date_fixed',
 'time_burnt_fixed',
 'time_burnt',
 'month',
 'year',
 'statecode',
 'division',
 'yearmonth',
 'pcp',
 'tavg',
 'pdsi',
 'phdi',
 'zndx',
 'pmdi',
 'cdd',
 'hdd',
 'sp02',
 'sp03',
 'sp06',
 'sp09',
 'sp12',
 'sp24',
 'tmin',
 'tmax']

In [5]:
# Drop the `Unnamed: 0` column.
mfi_df.drop('Unnamed: 0', axis=1, inplace=True)

# Drop NAs.
# df.dropna(inplace=True)

In [6]:
# Create dummies for the `ChestPain`, `Thal`, and `AHD` columns.
# Be sure to set `drop_first=True`.
mfi_df = pd.get_dummies(mfi_df,
                    columns=['stat_cause_descr', 'state'],
                    drop_first=True)

In [7]:
mfi_df.drop(columns=['fire_year', 'cont_date_fixed', 
                        'disc_date_fixed', 'time_burnt', 
                        'statecode',
                        'division', 'yearmonth'],
                        axis = 1,
                        inplace = True)

In [8]:
mfi_df.head()

Unnamed: 0,fire_size,fire_size_class,time_burnt_fixed,month,year,pcp,tavg,pdsi,phdi,zndx,...,state_CA,state_CO,state_ID,state_MT,state_NM,state_NV,state_OR,state_UT,state_WA,state_WY
0,0.1,A,0 days 00:00:00.000000000,2,2005,4.03,47.1,2.28,2.28,0.56,...,1,0,0,0,0,0,0,0,0,0
1,0.25,A,0 days 00:00:00.000000000,5,2004,0.45,63.2,-2.15,-2.15,-1.89,...,1,0,0,0,0,0,0,0,0,0
2,0.1,A,0 days 00:00:00.000000000,5,2004,0.45,63.2,-2.15,-2.15,-1.89,...,1,0,0,0,0,0,0,0,0,0
3,0.1,A,5 days 00:00:00.000000000,6,2004,0.08,70.4,-2.8,-2.8,-2.62,...,1,0,0,0,0,0,0,0,0,0
4,0.1,A,5 days 00:00:00.000000000,6,2004,0.08,70.4,-2.8,-2.8,-2.62,...,1,0,0,0,0,0,0,0,0,0


In [14]:
# Define X and y.
X = mfi_df.drop(columns=['fire_size']).select_dtypes(include=['float64'])
y = mfi_df['fire_size'].to_numeric()

AttributeError: 'Series' object has no attribute 'to_numeric'

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Split data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                   )

In [None]:
# What is the accuracy of our baseline model?
y.value_counts(normalize=True)

In [None]:
# Instantiate `RandomForestClassifier` object.
rf = RandomForestClassifier(n_estimators=100)
et = ExtraTreesClassifier(n_estimators=100)

rf.fit(X_train, y_train)

In [None]:
# Fit and score on the training data.
cross_val_score(rf, X_train, y_train, cv=5).mean()

In [None]:
# Fit and score on the training data.
cross_val_score(et, X_train, y_train, cv=5).mean()

In [None]:
# Score on the testing data.
tree.score(X_test, y_test)

In [None]:
# Generate one bootstrapped sample
# of size n from X_train.

X_train.sample(n = X_train.shape[0],
               replace = True,
               random_state = 42)

def bootstrap(data, num_B):
    # Create empty list for output.
    output = []
    
    # Generate num_B bootstrapped samples.
    for B in range(num_B):
        
        # Each sample is sampled from data with
        # the same sample size as the original
        # data, and samples with replacement.
        bootstrapped_sample = data.sample(n = data.shape[0],
                                        replace = True)
        
        # Append sample to list.
        output.append(bootstrapped_sample)
        
    # Returns num_B bootstrapped samples in list.
    return output

# Adapted from Boom D. - NYC


In [None]:
# Set seed for reproducibility.
np.random.seed(42)

# Generate five bootstrapped samples from X_train.
boot_samp = bootstrap(X_train, 5)

# Plot cholesterol level for each bootstrapped sample.
for B in range(5):
    plt.figure(figsize = (9,6))
    plt.hist(boot_samp[B]['Chol'])
    plt.title(f'Bootstrapped Sample {B + 1} of Cholesterol Level');