# LANL Earthquake Prediction Challenge

<img src="img/LANL-Title-Image.jpg" align=left width="600"/>

# 1. Problem Statement

### This project aims at making time untill earthquake predictions based off of simulated seismic signal data.

# 2. Data Analysis

## 2.1 Import Dependencies

In [1]:
# import data analysis tools
import pandas as pd
import numpy as np

# plotting
%matplotlib inline
from matplotlib import pyplot as plt

# project tools
import LANL_Tools

## 2.2 Load Training and Testing Data

In [2]:
# reset directory and load data
import os
os.chdir("/")
%time df_train = pd.read_csv('Users/gregeales/Desktop/LANL-Data/train.csv', dtype = {'acoustic_data': np.int16, 'time_to_failure': np.float32}) # float32 is enough :)

CPU times: user 2min 37s, sys: 24 s, total: 3min 1s
Wall time: 3min 6s


## 2.3 Statistical Analysis

### 2.3.1 Acoustic Data

In [6]:
pd.set_option("display.precision", 7)
df_train.acoustic_data.describe()

count    6.2914548e+08
mean     4.5194676e+00
std      1.0735707e+01
min     -5.5150000e+03
25%      2.0000000e+00
50%      5.0000000e+00
75%      7.0000000e+00
max      5.4440000e+03
Name: acoustic_data, dtype: float64

In [None]:
train_sample = df_train.sample(frac=0.001)
plt.figure(figsize=(10,5))
plt.title("Acoustic data distribution")
ax = plt.plot(train_sample.acoustic_data, label='Train (1% sample)')

### 2.3.2 Time to Failure

In [None]:
pd.set_option("display.precision", 7)
df_train.time_to_failure.describe()

## 2.4 Feature Analysis

### 2.4.1 Import Dependencies

In [7]:
# import LANL Tools
from LANL_Tools.feature_generator import FeatureGenerator
from LANL_Tools.feature_functions import *
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold

### 2.4.2 Generate Features

In [None]:
training_fg = FeatureGenerator(dtype='train', n_jobs=20, chunk_size=150000)
training_data = training_fg.generate()

test_fg = FeatureGenerator(dtype='test', n_jobs=20, chunk_size=150000)
test_data = test_fg.generate()

X = training_data.drop(['target', 'seg_id'], axis=1)
X_test = test_data.drop(['target', 'seg_id'], axis=1)
test_segs = test_data.seg_id
y = training_data.target

means_dict = {}
for col in X.columns:
    if X[col].isnull().any():
        print(col)
        mean_value = X.loc[X[col] != -np.inf, col].mean()
        X.loc[X[col] == -np.inf, col] = mean_value
        X[col] = X[col].fillna(mean_value)
        means_dict[col] = mean_value
        
for col in X_test.columns:
    if X_test[col].isnull().any():
        X_test.loc[X_test[col] == -np.inf, col] = means_dict[col]
        X_test[col] = X_test[col].fillna(means_dict[col])

### 2.4.3 Define Basic Model

In [None]:
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)

### 2.4.4 Train Basic Model and Display Feature Importance

In [None]:
# define parameters
params = {'num_leaves': 128, 'min_data_in_leaf': 79, 'objective': 'gamma', 'max_depth': -1, 'learning_rate': 0.01, 
          "boosting": "gbdt", "bagging_freq": 5, "bagging_fraction": 0.8126672064208567, "bagging_seed": 11,
          "metric": 'mae', "verbosity": -1, 'reg_alpha': 0.1302650970728192, 'reg_lambda': 0.3603427518866501,
          'feature_fraction': 0.2}

In [None]:
# train basic model
oof_lgb, prediction_lgb, feature_importance = train_model(X, X_test, y, params=params, model_type='lgb',
                                                          plot_feature_importance=True)

### 2.4.5 Graph Prediction Output vs Actual

In [None]:
plt.figure(figsize=(18, 8))
plt.plot(y, color='g', label='y_train')
plt.plot(oof_lgb, color='b', label='lgb')
plt.legend(loc=(1, 0.5));
plt.title('lgb');

### 2.4.6 Save Feature Data Set

In [None]:
# reset directory and load data
import os
os.chdir("/")
X.to_csv('Users/gregeales/Desktop/LANL-Data/train_features.csv', index=False)
X_test.to_csv('Users/gregeales/Desktop/LANL-Data/test_features.csv', index=False)
pd.DataFrame(y).to_csv('Users/gregeales/Desktop/LANL-Data/y.csv', index=False)

# 3. Data Preprocessing

## 3.1 Import Dependencies

In [10]:
# import LANL Tools
from LANL_Tools.feature_generator import FeatureGenerator
from LANL_Tools.feature_functions import *

# import libs
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors


## 3.2 Load Featurized Data

In [7]:
# reset directory and load feature data
import os
os.chdir("/")
path = "Users/gregeales/Desktop/LANL-Data/"
train_features = pd.read_csv(path + 'train_features.csv')
test_features = pd.read_csv(path + 'test_features.csv')
train_features_denoised = pd.read_csv(path + 'train_features_denoised.csv')
test_features_denoised = pd.read_csv(path + 'test_features_denoised.csv')
train_features_denoised.columns = [f'{i}_denoised' for i in train_features_denoised.columns]
test_features_denoised.columns = [f'{i}_denoised' for i in test_features_denoised.columns]
y = pd.read_csv(path + 'y.csv')

In [8]:
# add denoised and un-denoised data together in case of information loss
X = pd.concat([train_features, train_features_denoised], axis=1).drop(['seg_id_denoised', 'target_denoised'], axis=1)
X_test = pd.concat([test_features, test_features_denoised], axis=1).drop(['seg_id_denoised', 'target_denoised'], axis=1)
# remove last line due to it being less than regular interval
X = X[:-1]
y = y[:-1]

## 3.3 Scale Data 

In [11]:
# generate scaled data
scaler = StandardScaler()
scaler.fit(X)
X_train_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_train_scaled_holder = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_test_scaled_holder = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [12]:
%%time
n = 10
neigh = NearestNeighbors(n, n_jobs=-1)
neigh.fit(X_train_scaled)

dists, _ = neigh.kneighbors(X_train_scaled, n_neighbors=n)
mean_dist = dists.mean(axis=1)
max_dist = dists.max(axis=1)
min_dist = dists.min(axis=1)

X_train_scaled['mean_dist'] = mean_dist
X_train_scaled['max_dist'] = max_dist
X_train_scaled['min_dist'] = min_dist

test_dists, _ = neigh.kneighbors(X_test_scaled, n_neighbors=n)

test_mean_dist = test_dists.mean(axis=1)
test_max_dist = test_dists.max(axis=1)
test_min_dist = test_dists.min(axis=1)

X_test_scaled['mean_dist'] = test_mean_dist
X_test_scaled['max_dist'] = test_max_dist
X_test_scaled['min_dist'] = test_min_dist

CPU times: user 2min 15s, sys: 507 ms, total: 2min 15s
Wall time: 38.6 s


## 3.4 Save Normalized Feature Data

In [15]:
X_test_scaled.to_csv('Users/gregeales/Desktop/LANL-Data/X_test_scaled.csv', index=False)
X_train_scaled.to_csv('Users/gregeales/Desktop/LANL-Data/X_train_scaled.csv', index=False)

# 4. Model Creation

## 4.1 Loading Data

### 4.1.1 Import Dependencies

In [3]:
import pandas as pd

### 4.1.2 Load Time Series Data

In [23]:
# load time series data for reccurent networks
import os
os.chdir("/")
#float_data = np.load("kaggle/input/lanl-lstm-data/LSTM_Training_Data.npy") 
batch_size = 32
n_features=12
second_earthquake = 50085877

### 4.1.3 Load Normalized Feature Data

In [16]:
# reset directory and load nomralized feature data
import os
os.chdir("/")
path = "Users/gregeales/Desktop/LANL-Data/"
X_train_scaled = pd.read_csv(path + 'X_train_scaled.csv')
X_test_scaled = pd.read_csv(path + 'X_test_scaled.csv')
y = pd.read_csv(path + 'y.csv')

## 4.2 Reccurent Neural Network

### 4.2.1 Import Dependencies

In [20]:
from keras.models import Sequential
from keras.layers import Dense, RNN
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

### 4.2.2 Define and Compile Model

In [24]:
cb = [ModelCheckpoint("model.hdf5", save_best_only=True, period=1, verbose=1, monitor='val_loss')]
model = Sequential()
model.add(RNN(65, return_sequences=True,input_shape=(None, n_features)))
model.add(RNN(50))
model.add(Dense(75, activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(1, activation='relu'))
model.summary()
model.compile(optimizer=adam(lr=0.0005), loss="mae")

ValueError: ('`cell` should have a `call` method. The RNN was passed:', 65)

In [None]:
history = model.fit_generator(train_gen, steps_per_epoch=1000, epochs=40, verbose=0, validation_data=valid_gen,
                                              validation_steps=200, callbacks=cb)

## 4.3 LGBM

### 4.3.1 Import Dependencies

## 4.4 Catboost

### 4.4.1 Import Dependencies

## 4.5 Support Vector Machine

### 4.5.1 Import Dependencies

In [6]:
from sklearn import svm

## 4.6 Model

### 4.6.1 Import Dependencies

## 4.7 LSTM

# 5. Refrences