### Notebook To Save Test and Train data
Because we are using synthentic data it's important to not mix the training 
data with the final test data. 

In [17]:
import os
import collections
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from imblearn import under_sampling, over_sampling
from imblearn.under_sampling import RandomUnderSampler

### Function to split DataFrame and save into train test csv
save 80% to train and 20% to test

In [44]:
def save_train_test_db(df_save):

    msk = np.random.rand(len(df_save)) < 0.8

    print(len(df_save))
    print(np.random.rand(len(df_save)))
    
    train = df_save[msk]

    test = df_save[~msk]
    
    train.to_csv('data/train.csv', index=False)
    test.to_csv('data/test.csv', index=False)

### Reduce the sample number to 120K moderate fires
This to ensure more balance bin classes since we had
1.3 M moderatesand 17K severe

In [45]:
def smote_data(X2, y2):
    
    under_range = {'Moderate': 120000} #recommend hyperparameter range - 150-200k
    under = RandomUnderSampler(sampling_strategy=under_range)
    X2, y2 = under.fit_resample(X2,y2)
    
    return X2, y2

### Setup the X and y to smote

In [46]:
def find_X_y(df_fires1):

    #
    features = ['latitude', 'longitude', 'doy','fuelcode', 'fuel_moisture_class', 'prefire_fuel', 'temperature', 'humidity', 'precip_intensity', 'wind_gust',
                'wind_speed', 'fire_intensity_twocat']

    # four categories using balanced binning
    #target = ['fire_intensity_fourcat']
    target = ['fire_intensity_twocat']
    
    y = df_fires1[target]
    y = np.ravel(y)
    X = df_fires1[features]
    
    # Not really needed already 1-4 numerical
    #y = LabelEncoder().fit_transform(y) # Label-encode targets before modeling
    
    return X, y

### Read the orginal data files and dropped columns not needed.

In [47]:
df_fires = pd.read_csv('data/FireIntensity_Model_June12_Clean.csv').sample(1328922)

print(df_fires.shape)

(1328922, 26)


In [48]:
df_fires = df_fires.drop(columns={'Unnamed: 0','cluster_reference', 'fire_region', 'season', 
                                  'month', 'year', 'brightness', 'bright_t31','frp', 'visibility', 'covertype',
                                  'fire_intensity', 'fire_intensity_threecat', 'fire_intensity_fourcat'})

### Setup the features to be saved

In [49]:
X1, y1 = find_X_y(df_fires)

### Reduce the dataset to be more balanced.

In [50]:
X1, y1 = smote_data(X1,y1)

In [52]:
X1.shape

(137493, 12)

### Save the train test splits to disk

In [41]:
save_train_test_db(X1)

137493
[0.30465374 0.72750312 0.57353408 ... 0.60303146 0.64692706 0.25365399]
