# Using Data From Hard-Disk Memory
Here are some functions that use data stored in hard-disk memory to create the ML model.

#### TODO:
- try parallel computing

In [None]:
#  Data Manipulation
import os
import numpy as np
import pandas as pd
pd.options.display.precision = 15
np.seterr(divide='ignore', invalid='ignore')

#  Machine learning
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Other functions
from tqdm import tqdm

#  Our functions
import sys
sys.path.append("../..")
from earthquakes.helpers import Caller, create_feature_dataset_source
from earthquakes.engineering import FeatureComputer
from earthquakes.modeling import predict_on_test
from common.utils import save_object, load_object

#  Data directories
data_dir = "../data"
save_dir = "../data/chunk_signal"
save_class = "../data/classes"

## 0. Set up the data
First, the data must be stored in memory in chunks. In order to do so, __we have to run this block of code only once__, unless you want to change parameters such as size. In that case, you have to run it again. Be aware that you have to delete the files in the destination folder first.

In [None]:
# Alternative loader kept for reference: parse the raw CSV with explicit dtypes.
# train = pd.read_csv(os.path.join(data_dir, "train.csv"),
#                     dtype={"acoustic_data": np.int16, "time_to_failure": np.float64})
# Load the pickled training data from data_dir.
train = pd.read_pickle(os.path.join(data_dir, "train.pickle"))
# Build the Caller that writes into save_dir.
# NOTE(review): `size` is presumably the number of rows per stored chunk;
# confirm against earthquakes.helpers.Caller.
caller_cls = Caller(save_dir=save_dir, size=150000)
caller_cls.save_data(train)
# Persist the Caller instance itself so later cells can reload it with
# load_object instead of repeating this set-up.
save_object(save_path=os.path.join(save_class, "caller_cls.pkl"), object_ = caller_cls)
# Drop the reference to the full DataFrame so its memory can be reclaimed.
train = None

## 1. Let's get the events
Here is a very simple implementation that illustrates how this method can be used to get the index values at which an earthquake occurs. Previously, doing this caused memory errors.


In [None]:
caller_cls = load_object(os.path.join(save_class, "caller_cls.pkl"))

To illustrate the most basic functionality of this class, here you can set any initial id and window size. You will get back only the information that you require.

In [None]:
caller_cls.get_intervals(i_init=60, window_size=150000)

We can iterate over the whole signal to get the events (earthquakes). This takes only two minutes, which is remarkable considering the size of the dataset. The event positions are:
- [5656573,
 50085877,
 104677355,
 138772452,
 187641819,
 218652629,
 245829584,
 307838916,
 338276286,
 375377847,
 419368879,
 461811622,
 495800224,
 528777114,
 585568143,
 621985672]

In [None]:
def get_events():
    """Locate the events (earthquakes) in the chunked signal.

    The time to failure never reaches exactly 0, so an event is detected
    wherever the difference between two consecutive ``time_to_failure``
    observations is positive (the countdown jumps back up).

    The function reads the module-level ``caller_cls`` and scans a window of
    150000 rows starting at every id in ``caller_cls.index_list`` except the
    last one.

    Returns:
        list: for every window that contains at least one positive jump, the
        index label immediately before the first jump in that window.
    """
    events = []
    for idx in tqdm(caller_cls.index_list[:-1]):
        try:
            window = caller_cls.get_intervals(i_init=idx, window_size=150000)
            delta = window['time_to_failure'].diff()
        except Exception:
            # Report the offending start id and skip this window. Falling
            # through would reuse a stale or unbound ``delta`` and crash.
            print(idx)
            continue
        jumps = delta[delta > 0]
        if len(jumps) > 0:
            # Only the first jump of each window is kept; the event sits one
            # position before the row where the positive difference shows up.
            events.append(jumps.index[0] - 1)
    return events

events_id = get_events()

## 2. Getting the data per cycle (information between events):
It makes sense to train and test our models on different cycles. In order to do so, here we create the datasets for every cycle independently. I skip the cycle that starts from 0 because there is no guarantee that it is a complete cycle. Same issue with the last interval. 

In [None]:
# Reload the Caller that was persisted during set-up.
caller_cls = load_object(os.path.join(save_class, "caller_cls.pkl"))

# One quantile grid, used for both the `quantiles` and `abs_quantiles` arguments.
q = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
computer = FeatureComputer(quantiles=q, abs_quantiles=q)

# Event positions found by get_events().
events = [
    5656573, 50085877, 104677355, 138772452,
    187641819, 218652629, 245829584, 307838916,
    338276286, 375377847, 419368879, 461811622,
    495800224, 528777114, 585568143, 621985672,
]

# One (start, end) pair per cycle: from an event up to 150000 rows before
# the next one.
tuples = [(start, following - 150000)
          for start, following in zip(events[:-1], events[1:])]

In [None]:
# Build one feature dataset per cycle and pickle it under data_dir.
for i_0, i_n in tuples:
    # File-name tag: "<start index>_<end index>" of the cycle.
    name = '_'.join([str(i_0), str(i_n)])
    new_data = create_feature_dataset_source(
        caller_cls,
        feature_computer=computer,
        step=5000,
        stft=True,
        events_id=(i_0, i_n),
    )
    new_data.to_pickle(os.path.join(data_dir, "cycle_s5000_{}.pkl".format(name)))

## 3. Some experiments for fun

In [None]:
# Two different cycles: one to fit on, one to validate on.
data_train = pd.read_pickle(os.path.join(data_dir, "cycle_s5000_5656573_49935877.pkl"))
data_val = pd.read_pickle(os.path.join(data_dir, "cycle_s5000_138772452_187491819.pkl"))

# Pull the target column out of each frame; what remains are the features.
y_train = data_train.pop("time_to_failure")
y_valid = data_val.pop("time_to_failure")

# X_* are the same objects as data_* (no copy), now without the target column.
X_train, X_valid = data_train, data_val

In [None]:
pd.Series(X_train["maximum"].values[-2000:]).plot()
pd.Series(X_valid["maximum"].values[-2000:]).plot() 

### XGBoost

In [None]:
# NOTE: the previous dictionary used LightGBM parameter names (num_leaves,
# min_data_in_leaf, boosting, bagging_freq, bagging_fraction, bagging_seed,
# metric, objective='huber', max_depth=-1) and was handed over as `params=`.
# XGBRegressor has no `params` argument, so none of those settings reached
# the booster. Only the settings with a direct XGBoost counterpart are kept
# here, under their XGBoost names; everything else stays at the XGBoost
# default.
params = {'learning_rate': 0.01,
          'subsample': 0.8126672064208567,    # was bagging_fraction
          'random_state': 11,                 # was bagging_seed
          'reg_alpha': 0.1302650970728192,
          'reg_lambda': 0.3603427518866501
         }

# `num_boost_round` and `verbose_eval` belong to xgboost.train(), not to the
# scikit-learn wrapper: the number of boosting rounds is `n_estimators`.
# Early stopping is not configured here because it needs a validation set
# supplied to fit(); without one every round is trained, so lower
# n_estimators for quick experiments.
model = XGBRegressor(n_estimators=20000,
                     n_jobs=-1,
                     **params)

# model = LinearRegression() 

### Train the model 

In [None]:
# Fit on the training cycle only.
model.fit(X_train, y_train)

# Score the fitted model on the held-out cycle. cross_val_score would clone
# the estimator and refit it on folds of X_valid, so it would never evaluate
# the model trained above.
valid_pred = model.predict(X_valid)
scores = np.abs(valid_pred - np.asarray(y_valid))  # per-row absolute error
np.mean(scores)  # mean absolute error on the validation cycle

In [None]:
# Rebuild the feature computer so this cell can run on its own; the lower
# and upper tails together form the quantile grid.
q = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3] + [0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
computer = FeatureComputer(abs_quantiles=q, quantiles=q)

# The same computer is passed for both the plain and the STFT feature slots.
submission = predict_on_test(
    model=model,
    data_dir=data_dir,
    ycol="time_to_failure",
    stft=True,
    feature_computer=computer,
    stft_feature_computer=computer,
)

In [None]:
submission.to_csv(os.path.join(data_dir, "submissions", "submission_full_cycle.csv"), index=False)