# Example usage of current code

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor

sys.path.append("../..")
from earthquakes.engineering import sequence_generator, FeatureComputer, create_feature_dataset
from earthquakes.modeling import train_and_predict, cv_with_feature_computer, predict_on_test, create_test_dataset

Load data

In [None]:
# Display floats with 15 digits of precision.
pd.set_option("display.precision", 15)
data_dir = "../data"

# Read the training data with explicit dtypes for both columns.
train_dtypes = {"acoustic_data": np.int16, "time_to_failure": np.float64}
train_path = os.path.join(data_dir, "train.csv")
train = pd.read_csv(train_path, dtype=train_dtypes)

# Optional: save the parsed frame as a pickle for faster loading ...
# train.to_pickle(os.path.join(data_dir, "train.pickle"))
# ... and load it back instead of parsing the csv again:
# train = pickle.load(open(os.path.join(data_dir, "train.pickle"), "rb"))

train.head()

Replicate the work in the starter notebook with the functions from the `engineering` and `modeling` modules. Let's use a slightly different model though and some more quantiles.

__Added functionality:__
- The cross validation method now has an option to predict on the test set at every fold by setting `predict_test=True`. In that case, the method returns a dataframe with predictions on the test set in addition to the cross validation scores. We can use this for blending.
- There is the option to compute the Short Time Fourier Transform (STFT) of the signal by setting `stft=True`. The same statistics are then calculated on this transformation as on the usual time interval, giving additional information from the signal. Note that there are some parameters of this transformation that we can play with. For further information, please see [the docs](https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.stft.html).
- The set of available features in `FeatureComputer` has been extended, and features can now be calculated over windows within a sequence (subsequences) by setting, e.g., `window=5000` in the `FeatureComputer`.



### Create submission using the new features (including stft), calculated over 10 subsequences (windows) per sequence.

In [None]:
# Feature settings passed to the FeatureComputer instances below.
q = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
big = [100, 200, 500, 1000]
stalta = [(50, 1000), (100, 1500), (500, 5000), (1000, 10000), (5000, 15000), (10000, 25000)]
stalta_window = [(50, 1000), (100, 1500), (500, 5000), (1000, 5000)]
exp_mov_ave = [300, 3000, 10000]
exp_mov_ave_window = [300, 1000, 2000]

# Settings shared by the time-domain and the stft feature computer.
shared_features = dict(quantiles=q, abs_quantiles=q, count_abs_big=big)

# Time-domain computer: shared settings plus STALTA, exponential moving
# average and windowed (window=15000) features.
computer = FeatureComputer(
    **shared_features,
    stalta=stalta,
    stalta_window=stalta_window,
    exp_mov_ave=exp_mov_ave,
    exp_mov_ave_window=exp_mov_ave_window,
    window=15000,
)
# No windows, STALTA, and exp_mov_ave for stft.
stft_computer = FeatureComputer(**shared_features)

# NOTE(review): `params` is defined here but is not passed to the
# cv_with_feature_computer call below - confirm whether the model is meant
# to be built with these settings.
# NOTE(review): recent scikit-learn releases name this loss 'absolute_error'
# instead of 'lad' - confirm against the installed version before wiring it in.
params = {
    "n_estimators": 1000,
    "loss": 'lad',
    "verbose": 1,
}

# Cross validate and, because predict_test=True, also collect test-set predictions.
cv_options = dict(
    train_samples=500,
    val_samples=100,
    predict_test=True,
    data_dir=data_dir,
    stft=True,
    stft_feature_computer=stft_computer,
)
scores, test_predictions = cv_with_feature_computer(
    train, GradientBoostingRegressor, computer, **cv_options
)

mean_score = np.mean(scores)
print("Cross validation score: {}".format(mean_score))

Let's try blending by averaging over the predictions.

In [None]:
# Blend by taking the row-wise mean over every column except seg_id.
# NOTE(review): this mean also includes the existing "time_to_failure"
# column of test_predictions - presumably it holds predictions as well; confirm.
prediction_columns = test_predictions.drop("seg_id", axis=1)
blended = prediction_columns.mean(axis=1)

# Start from a copy holding only the two submission columns, then overwrite
# the target with the blended value.
submission = test_predictions[["seg_id", "time_to_failure"]].copy()
submission["time_to_failure"] = blended
submission.head()

In [None]:
# Write the blended predictions to <data_dir>/submissions/submission.csv.
# to_csv does not create missing parent directories, so make sure the
# output directory exists first (no-op when it is already there).
submission_dir = os.path.join(data_dir, "submissions")
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, "submission.csv"), index=False)

__Leaderboard scores:__
- First version with RF without blending: __1.758__
- Still with basic features (minimum, maximum, mean, median, std, quantiles), GradientBoostingRegressor, and blending: __1.592__
- Using `stft=True` and XGBoost: __1.546__.
- Using new features (PR 128), stft, and GradientBoostingRegressor: __1.544__.
- Using new features (PR 128 and 130), stft, and GradientBoostingRegressor: __1.533__.


### Create a test dataset
We can now create the test dataset once up front, so that the features of the test set are not recomputed in every fold.

In [None]:
# Compute the test features once, with the same feature computers
# (time-domain and stft) that are used in cross validation.
x_test = create_test_dataset(
    computer,
    data_dir=data_dir,
    stft=True,
    stft_feature_computer=stft_computer,
)
x_test.head()

And use it in cross validation like so:

```
>>> scores, test_predictions = cv_with_feature_computer(..., test_data=x_test, ...)
```

In [None]:
# Cross validate again, this time handing over the precomputed test
# dataset through test_data.
cv_options_with_test_data = dict(
    test_data=x_test,
    train_samples=500,
    val_samples=100,
    predict_test=True,
    data_dir=data_dir,
    stft=True,
    stft_feature_computer=stft_computer,
)
scores, test_predictions = cv_with_feature_computer(
    train, GradientBoostingRegressor, computer, **cv_options_with_test_data
)