# LANL Earthquake Prediction Challenge

<img src="img/LANL-Title-Image.jpg" align=left width="600"/>

# 1. Problem Statement

### This project aims to predict the time until an earthquake occurs, based on simulated seismic signal data.

# 2. Data Analysis

## 2.1 Import Dependencies

In [1]:
# import data analysis tools
import pandas as pd
import numpy as np

# plotting
%matplotlib inline
from matplotlib import pyplot as plt

# project tools
import LANL_Tools

## 2.2 Load Training and Testing Data

In [2]:
# reset directory and load data
import os
os.chdir("/")
%time df_train = pd.read_csv('Users/gregeales/Desktop/LANL-Data/train.csv', dtype = {'acoustic_data': np.int16, 'time_to_failure': np.float32}) # float32 is enough :)

CPU times: user 2min 37s, sys: 24 s, total: 3min 1s
Wall time: 3min 6s


## 2.3 Statistical Analysis

### 2.3.1 Acoustic Data

In [6]:
# Print floats with 7 decimal places, then summarise the raw acoustic signal
# column (count, mean, std, min, quartiles, max).
pd.options.display.precision = 7
df_train['acoustic_data'].describe()

count    6.2914548e+08
mean     4.5194676e+00
std      1.0735707e+01
min     -5.5150000e+03
25%      2.0000000e+00
50%      5.0000000e+00
75%      7.0000000e+00
max      5.4440000e+03
Name: acoustic_data, dtype: float64

In [None]:
# Plot a random subsample of the acoustic signal; the full series is far too
# large to draw point by point.
# frac=0.001 keeps 0.1% of the rows, so the label states 0.1% (it previously
# claimed 1%).
train_sample = df_train.sample(frac=0.001)
plt.figure(figsize=(10,5))
plt.title("Acoustic data distribution")
# sample() returns rows in random order; sort by the original row index so the
# line is drawn left-to-right instead of zigzagging across the x-axis.
# NOTE: despite its name, `ax` holds the list of Line2D objects returned by
# plt.plot, not an Axes; the name is kept in case later cells reference it.
ax = plt.plot(train_sample.acoustic_data.sort_index(), label='Train (0.1% sample)')
# Without a legend the label above is never shown.
plt.legend();

### 2.3.2 Time to Failure

In [None]:
# Print floats with 7 decimal places, then summarise the target column
# (count, mean, std, min, quartiles, max).
pd.options.display.precision = 7
df_train['time_to_failure'].describe()

## 2.4 Feature Analysis

### 2.4.1 Import Dependencies

In [7]:
# import LANL Tools
from LANL_Tools.feature_generator import FeatureGenerator
from LANL_Tools.feature_functions import *
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold

### 2.4.2 Generate Features

In [None]:
# Settings shared by the train and test feature generators.
generator_options = dict(n_jobs=20, chunk_size=150000)

# Build the feature tables for the training and test sets.
# NOTE(review): generate() is assumed to return a pandas DataFrame that
# includes 'target' and 'seg_id' columns -- confirm in LANL_Tools.
training_fg = FeatureGenerator(dtype='train', **generator_options)
training_data = training_fg.generate()

test_fg = FeatureGenerator(dtype='test', **generator_options)
test_data = test_fg.generate()

# Separate the model inputs from the bookkeeping columns.
non_feature_cols = ['target', 'seg_id']
X = training_data.drop(columns=non_feature_cols)
X_test = test_data.drop(columns=non_feature_cols)
test_segs = test_data['seg_id']
y = training_data['target']

# Replace NaN and -inf feature values with the column mean computed on the
# TRAINING set (ignoring -inf), and reuse those training means for the test
# set so both sets are imputed consistently.
means_dict = {}
for col in X.columns:
    is_neg_inf = X[col] == -np.inf
    # Impute when the column has NaN *or* -inf; checking NaN alone would leave
    # -inf values untouched in columns that happen to contain no NaN.
    if X[col].isnull().any() or is_neg_inf.any():
        print(col)
        mean_value = X.loc[~is_neg_inf, col].mean()
        X.loc[is_neg_inf, col] = mean_value
        X[col] = X[col].fillna(mean_value)
        means_dict[col] = mean_value

for col in X_test.columns:
    is_neg_inf = X_test[col] == -np.inf
    if X_test[col].isnull().any() or is_neg_inf.any():
        # A test column can need imputation even though the matching train
        # column was clean; compute the train mean on demand instead of
        # failing with a KeyError.
        if col not in means_dict:
            means_dict[col] = X.loc[X[col] != -np.inf, col].mean()
        X_test.loc[is_neg_inf, col] = means_dict[col]
        X_test[col] = X_test[col].fillna(means_dict[col])

### 2.4.3 Define Basic Model

In [None]:
# Cross-validation splitter: 5 shuffled folds, seeded so the split is
# reproducible between runs.
n_fold = 5
folds = KFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=11,
)

### 2.4.4 Train Basic Model and Display Feature Importance

In [None]:
# Hyper-parameters for the baseline model; handed to train_model together
# with model_type='lgb'.
params = dict(
    num_leaves=128,
    min_data_in_leaf=79,
    objective='gamma',
    max_depth=-1,
    learning_rate=0.01,
    boosting="gbdt",
    bagging_freq=5,
    bagging_fraction=0.8126672064208567,
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1302650970728192,
    reg_lambda=0.3603427518866501,
    feature_fraction=0.2,
)

In [None]:
# Fit the baseline 'lgb' model and unpack the three values train_model
# returns.
# NOTE(review): train_model presumably comes from the star import of
# LANL_Tools.feature_functions; the names suggest out-of-fold predictions,
# test predictions and a feature-importance table -- confirm there.
oof_lgb, prediction_lgb, feature_importance = train_model(
    X,
    X_test,
    y,
    params=params,
    model_type='lgb',
    plot_feature_importance=True,
)

### 2.4.5 Graph Prediction Output vs Actual

In [None]:
# Overlay the baseline model's predictions (blue) on the true targets (green).
fig, axis = plt.subplots(figsize=(18, 8))
axis.plot(y, color='g', label='y_train')
axis.plot(oof_lgb, color='b', label='lgb')
# Anchor the legend at the right-hand edge of the axes, halfway up.
axis.legend(loc=(1, 0.5))
axis.set_title('lgb');

### 2.4.6 Save Feature Data Set

In [None]:
# Move to the filesystem root so the output paths below (written without a
# leading slash) resolve against "/".
import os
os.chdir("/")

# Each table paired with the CSV it is written to; the row index is not saved.
feature_outputs = (
    (X, 'Users/gregeales/Desktop/LANL-Data/train_features.csv'),
    (X_test, 'Users/gregeales/Desktop/LANL-Data/test_features.csv'),
    (pd.DataFrame(y), 'Users/gregeales/Desktop/LANL-Data/y.csv'),
)
for frame, csv_path in feature_outputs:
    frame.to_csv(csv_path, index=False)

# 3. Data Preprocessing

## 3.1 Import Dependencies

In [6]:
# import LANL Tools
from LANL_Tools.feature_generator import FeatureGenerator
from LANL_Tools.feature_functions import *

# import math
import numpy as np

# 4. Model Testing

## 4.1 Loading Data

### 4.1.1 Import Dependencies

In [3]:
import pandas as pd

### 4.1.2 Load Raw Data

### 4.1.3 Load Feature Data

In [5]:
# Move to the filesystem root so `path` (written without a leading slash)
# resolves against "/".
import os
os.chdir("/")
path = "Users/gregeales/Desktop/LANL-Data/"

# Feature tables computed from the raw signal.
train_features = pd.read_csv(path + 'train_features.csv')
test_features = pd.read_csv(path + 'test_features.csv')

# Feature tables computed from the denoised signal; every column gets a
# '_denoised' suffix so names stay unique once the tables are joined.
train_features_denoised = pd.read_csv(path + 'train_features_denoised.csv').add_suffix('_denoised')
test_features_denoised = pd.read_csv(path + 'test_features_denoised.csv').add_suffix('_denoised')

# Training targets.
y = pd.read_csv(path + 'y.csv')

In [None]:
# Join the raw and denoised feature tables side by side, discarding the
# suffixed id/target columns that arrive with the denoised tables.
duplicate_cols = ['seg_id_denoised', 'target_denoised']
X = pd.concat([train_features, train_features_denoised], axis=1).drop(columns=duplicate_cols)
X_test = pd.concat([test_features, test_features_denoised], axis=1).drop(columns=duplicate_cols)

# Drop the final training row from the features and the target alike so the
# two stay the same length.
X = X.iloc[:-1]
y = y.iloc[:-1]

## 4.2 Recurrent Neural Network

### 4.2.1 Import Dependencies

In [None]:
# NOTE: GPlearn+LGB+XGB
# (Kept as a comment: as a bare expression this line referenced undefined
# names and raised NameError whenever the cell was run.)

## 4.3 LGBM

### 4.3.1 Import Dependencies

## 4.4 Catboost

### 4.4.1 Import Dependencies

## 4.5 Support Vector Machine

### 4.5.1 Import Dependencies

In [6]:
from sklearn import svm

## 4.6 Model

### 4.6.1 Import Dependencies

## 4.7 LSTM

# 5. References