In [9]:
import gc
import pickle as pk
import warnings
import zipfile
from time import time

# Silence library warnings before the heavy third-party imports below.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage
import tensorflow as tf
import xgboost as xgb
from scipy.ndimage import distance_transform_edt
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

In [6]:
# data variables
# Column names used throughout the notebook: model inputs, the prediction
# target, and a per-column table of expected value ranges.

INPUT_FEATURES = ['elevation', 'th', 'vs',  'tmmn', 'tmmx', 'sph',
                  'pr', 'pdsi', 'NDVI', 'population', 'erc', 'PrevFireMask']

OUTPUT_FEATURES = ['FireMask']


# underlying feature value ranges:
# (min_clip, max_clip, mean, standard deviation)
# Keys are column names; check_file reads entries [0] and [1] of each tuple as
# the lower and upper bound when validating a CSV.

feature_description_dict = {
    # Elevation in m: between 0.1 percentile and 99.9 percentile
    'elevation': (0.0, 3141.0, 657.3003, 649.0147),

    # Palmer Drought Severity Index: between 0.1 percentile and 99.9 percentile
    'pdsi': (-6.12974870967865, 7.876040384292651, -0.0052714925, 2.6823447),

    # Vegetation index (nominally between -1 and 1) multiplied by 10,000
    'NDVI': (-9821.0, 9996.0, 5157.625, 2466.6677),

    # Precipitation in mm: between 0.0 and 99.9 percentile
    'pr': (0.0, 44.53038024902344, 1.7398051, 4.482833),

    # Specific humidity: between 0 and 1
    'sph': (0., 1., 0.0071658953, 0.0042835088),

    # Wind direction in degrees clockwise from north: between 0 and 360.
    'th': (0., 360.0, 190.32976, 72.59854),

    # Min temp: between 253.15 kelvin and 99.9 percentile
    'tmmn': (253.15, 298.94891357421875, 281.08768, 8.982386),

    # Max temp: between 253.15 kelvin and 99.9 percentile
    'tmmx': (253.15, 315.09228515625, 295.17383, 9.815496),

    # Wind speed in m/s: between 0. and 99.9 percentile
    'vs': (0.0, 10.024310074806237, 3.8500874, 1.4109988),

    # NFDRS fire danger index energy release component BTU's per square foot.
    # 0., 99.9 percentile
    'erc': (0.0, 106.24891662597656, 37.326267, 20.846027),

    # Population density: between 0 and 99.9 percentile
    'population': (0., 2534.06298828125, 25.531384, 154.72331),

    # We don't want to normalize the FireMasks.
    # 1 indicates fire, 0 no fire, -1 unlabeled data
    'PrevFireMask': (-1., 1., 0., 1.),
    'FireMask': (-1., 1., 0., 1.)
}

In [4]:
def check_file(filename, CHUNK_SIZE, bounds=None):
  """Validate a CSV file chunk by chunk and report the first problem found.

  Each chunk is checked column by column for two kinds of problems:
  string values in the column, and values outside the (min, max) bounds
  registered for that column. Checking stops at the first problem, after
  printing a description of it.

  Parameters
  ----------
  filename : str or path-like
      CSV file to read with ``pd.read_csv``.
  CHUNK_SIZE : int
      Number of rows per chunk.
  bounds : dict, optional
      Maps column name to a tuple whose first two entries are the lower and
      upper bound. Defaults to the module-level ``feature_description_dict``.

  Returns
  -------
  None
      Findings are reported through ``print`` only.
  """
  if bounds is None:
    # Looked up at call time so the table may be defined in any earlier cell.
    bounds = feature_description_dict

  # The context manager closes the reader; no manual `del` is needed, and an
  # empty file (zero chunks) no longer fails on undefined loop variables.
  with pd.read_csv(filename, chunksize=CHUNK_SIZE) as reader:
    for i, chunk in enumerate(reader):
      print("Loading chunk", i)
      for col in chunk.columns:
        values = chunk[col]

        #add other checks for verify data
        # Numeric dtypes cannot hold Python strings, so only scan the rest.
        if not pd.api.types.is_numeric_dtype(values):
          bad_values = [x for x in values if isinstance(x, str)]
          if bad_values:
            print("While loading chunk", i ,"column", col ,"has incorrect values")
            print("such as", bad_values)
            return

        if col in bounds:
          lower, upper = bounds[col][0], bounds[col][1]
          # Out of bounds means below the minimum OR above the maximum.
          if values.min() < lower or values.max() > upper:
            print("Data for column ",col,"is out of appropiate bounds - cleaning and concat went wrong")
            return

In [3]:
# Rows read per CSV chunk: about 1000 images per batch, at 64*64 rows each
CHUNK_SIZE = 1000 * (64 * 64)

In [32]:
# Validate eval.csv chunk by chunk; problems, if any, are printed by check_file.
check_file('eval.csv',CHUNK_SIZE=CHUNK_SIZE)

Loading chunk 0
Loading chunk 1
Loading chunk 2
Loading chunk 3


In [33]:
# Validate test.csv chunk by chunk; problems, if any, are printed by check_file.
check_file('test.csv',CHUNK_SIZE=CHUNK_SIZE)

Loading chunk 0
Loading chunk 1
Loading chunk 2
Loading chunk 3


In [9]:
%timeit
check_file('train.csv',CHUNK_SIZE=CHUNK_SIZE)

Loading chunk 0


KeyboardInterrupt: 

In [4]:
#use this to get visual of component estimation for PCA
#input is minmaxscaled training X data
def pca_comp_graph(scaled_df):
    """Plot cumulative explained variance against the number of PCA components.

    Fits a full ``PCA`` on ``scaled_df`` and draws the cumulative sum of
    ``explained_variance_ratio_`` together with a horizontal line at 0.95.

    Parameters
    ----------
    scaled_df : array-like of shape (n_samples, n_features)
        Feature matrix to analyse (the notebook passes min-max scaled
        training features).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    pca = PCA().fit(scaled_df)

    # Values are fractions in [0, 1], one per fitted component.
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    # Size the x axis from the fitted components: PCA keeps
    # min(n_samples, n_features) components, which can be fewer than the
    # number of columns.
    xi = np.arange(1, len(cumulative_variance) + 1, step=1)

    # Figure size is set locally instead of mutating the global rcParams.
    fig, ax = plt.subplots(figsize=(12, 6))

    ax.set_ylim(0.0, 1.1)
    ax.plot(xi, cumulative_variance, marker='o', linestyle='--', color='b')

    ax.set_xlabel('Number of Components')
    ax.set_xticks(np.arange(0, scaled_df.shape[1]+1, step=1)) #change from 0-based array index to 1-based human-readable label
    ax.set_ylabel('Cumulative variance (%)')
    ax.set_title('The number of components needed to explain variance')

    ax.axhline(y=0.95, color='r', linestyle='-')
    ax.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

    ax.grid(axis='x')
    return plt.show()

In [47]:
# Print the FireMask label counts for the first chunk (up to 10,000,000 rows) only.
# NOTE(review): this reads 'train_data.csv' while the other cells read
# 'train.csv' - confirm which file is intended.
with pd.read_csv('train_data.csv',chunksize=10000000,header=0) as reader:
    for i,chunk in enumerate(reader):
        print(chunk['FireMask'].value_counts())
        # stop after the first chunk
        break

 0.0    9639639
-1.0     243180
 1.0     117181
Name: FireMask, dtype: int64


The following cell takes roughly 5–10 minutes and should print chunk indices 0 through 14 (15 chunks in total).

In [14]:
%%time
pca_batch = IncrementalPCA(n_components=10)
scaler = MinMaxScaler()
break_check = 0
chunksize = 64*64*1000
sample_presize = 64*64*200
#grab 200 images from each chunk to create random sampled data
small_batch = None
with pd.read_csv('train.csv',chunksize=chunksize,header=0) as reader:
  reader
  for i,chunk in enumerate(reader):
    print("Loading chunk",i)
    DATA = chunk

    X = DATA.loc[:, DATA.columns != 'FireMask']
    X_rescaled = scaler.fit_transform(X)
    pca_batch.partial_fit(X_rescaled)

    if i == 0:
        small_batch = pd.DataFrame(DATA.sample(n = sample_presize))
    else:
      if sample_presize <= len(chunk):
          small_batch = pd.concat([small_batch,DATA.sample(n=sample_presize)],ignore_index=True)
      else:
          small_batch = pd.concat([small_batch,DATA],ignore_index=True)
    #really trying to keep mem low
  del DATA
  del chunk
  del X
  del X_rescaled

del reader
        

Loading chunk 0
Loading chunk 1
Loading chunk 2
Loading chunk 3
Loading chunk 4
Loading chunk 5
Loading chunk 6
Loading chunk 7
Loading chunk 8
Loading chunk 9
Loading chunk 10
Loading chunk 11
Loading chunk 12
Loading chunk 13
Loading chunk 14
Wall time: 5min 48s


In [15]:
#External save incremental pca model - made with 10 components
# Use a context manager so the file handle is flushed and closed.
with open("pca.pkl", "wb") as pca_file:
    pk.dump(pca_batch, pca_file)

In [50]:
# Count how often each PrevFireMask value occurs in the sampled training rows.
# NOTE(review): the recorded output lists values up to 253, while
# feature_description_dict gives the PrevFireMask range as -1..1 - confirm the
# data preparation.
small_batch[['PrevFireMask']].value_counts()

PrevFireMask
0.0             8746312
1.0             1531248
2.0              773858
3.0              442558
4.0              268171
                 ...   
250.0                 2
253.0                 2
243.0                 1
245.0                 1
242.0                 1
Length: 254, dtype: int64

Optimize the hyperparameters with an exhaustive grid search, using only about 20% of the PCA-transformed training data (the `small_batch` sample).

In [16]:
#Transform training small batch data
# The PCA was fitted on min-max scaled features, so the sample has to go
# through the same scaler before it is projected.
sm_features = small_batch.loc[:, small_batch.columns != 'FireMask']
sm_X_train = pca_batch.transform(scaler.transform(sm_features))

In [17]:
# Hyperparameter grid for the XGBoost classifier:
# 8 depths x 3 learning rates x 4 tree counts = 96 combinations.
param_tree = {
    'max_depth': list(range(2, 10, 1)),
    'eta': [0.1, 0.01, 0.05],
    'n_estimators': list(range(60, 220, 40)),
}

# One thread per model: the search below already runs fits in parallel on all
# cores, and multi-threaded estimators on top of that compete for the same CPUs.
estimator_tree = xgb.XGBClassifier(
    objective='binary:logistic',
    n_jobs=1,
)

# The previous RandomizedSearchCV asked for n_iter=1000 on a 96-point grid,
# which already evaluated every combination; GridSearchCV states that directly.
search_tree = GridSearchCV(
    estimator=estimator_tree,
    param_grid=param_tree,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=4,
)


In [21]:
%%timeit
search_tree.fit(sm_X_train, small_batch['FireMask'])
print('Best Score: ', search_tree.best_score_) 
print('Best Params: ', search_tree.best_params_) 

Fitting 5 folds for each of 96 candidates, totalling 480 fits


ValueError: 
All the 480 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 2851986 2851987 2851988], got [-1.00000000e+00  0.00000000e+00  1.81395424e-12 ...  1.00000000e+00
  1.00000000e+00  1.00000000e+00]

--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 2153427 2153428 2153429], got [-1.00000000e+00  0.00000000e+00  1.81395424e-12 ...  1.00000000e+00
  1.00000000e+00  1.00000000e+00]

--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 3560336 3560337 3560338], got [-1.00000000e+00  0.00000000e+00  1.81395424e-12 ...  1.00000000e+00
  1.00000000e+00  1.00000000e+00]

--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 2834771 2834772 2834773], got [-1.00000000e+00  0.00000000e+00  1.81395424e-12 ...  1.00000000e+00
  1.00000000e+00  1.00000000e+00]

--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\annab\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 2846565 2846566 2846567], got [-1.00000000e+00  0.00000000e+00  5.63819899e-11 ...  1.00000000e+00
  1.00000000e+00  1.00000000e+00]


In [20]:
# documentation states that its unreasonable to use sklearns SVM classifier
# it recs SGDClassifier instead due to large sample size

# Candidate values for each SGDClassifier hyperparameter.
# 'log_loss' is the current name of the logistic loss (formerly 'log').
loss = ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron']
penalty = ['l1', 'l2', 'elasticnet']
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
# Keys are the class labels (0 = no fire, 1 = fire).
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
eta0 = [1, 10, 100]

param_distributions = dict(
    loss=loss,
    penalty=penalty,
    alpha=alpha,
    learning_rate=learning_rate,
    class_weight=class_weight,
    eta0=eta0,
)

# Fixed seeds make both the estimator and the sampled candidates reproducible.
estimator_SGD = SGDClassifier(random_state=42)

# Randomized search: n_iter candidates are sampled from the lists above
# (10 is the scikit-learn default, written out here for clarity).
grid_search_SGD = RandomizedSearchCV(
    estimator=estimator_SGD,
    param_distributions=param_distributions,
    n_iter=10,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=4,
    random_state=42,
)

In [None]:
%%timeit
grid_search_SGD.fit(sm_X_train, small_batch['FireMask'])
print('Best Score: ', grid_search_SGD.best_score_) 
print('Best Params: ', grid_search_SGD.best_params_) 