In [1]:
from tpot import TPOTClassifier

import os
from tqdm import tqdm_notebook as tqdm

# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import warnings
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
import matplotlib.patches as patches
import seaborn as sns
from pylab import rcParams

%matplotlib inline 
plt.style.use('seaborn')
sns.set(style='whitegrid',color_codes=True)

# classifiaction 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import catboost as ctb

# for classification
from sklearn.metrics import accuracy_score

# model selection
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Hp optimization imports
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import mlflow

import re
import eli5
import gc
import random    
import math
import psutil
import pickle
import datetime
from time import time

# save/load models
from joblib import dump
from joblib import load

import timeit 
from sklearn.preprocessing import StandardScaler

  init_args = inspect.getargspec(class_.__init__)
  return attr.s(class_, these=these, init=False, slots=True, **attrs_kwargs)  # type: ignore
Using TensorFlow backend.
  class HeadersDict(collections.MutableMapping):


In [2]:
root = "../../data/raw/Gamma_Log_Facies_Type_Prediction/"
models_root = "../../models/Gamma_Log_Facies_Type_Prediction/"
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
pd.set_option('max_columns', 150)
# rcParams['figure.figsize'] = 16,8

In [3]:
%%time
full_train_df = pd.read_csv(root + "Train_File.csv")
full_test_df = pd.read_csv(root + "Test_File.csv")
submit_df = pd.read_csv(root + "Submission_File.csv")


CPU times: user 2.82 s, sys: 400 ms, total: 3.22 s
Wall time: 3.25 s


In [None]:
def create_lags(df):
    for i in range(0, 25):
        df["lag_forward_{}".format(i)] = df.GR.shift(i)
        df["lag_backward_{}".format(i)] = df.GR.shift(-i)
    return df

In [19]:
train_df_ts = full_train_df[full_train_df["well_id"] < 100]
valid_df_ts = full_train_df[full_train_df["well_id"].isin(list(range(100,120)))]

In [6]:
train_df_ts.head()

Unnamed: 0,row_id,well_id,GR,label
0,0,0,143.51,0
1,1,0,112.790928,0
2,2,0,123.531856,0
3,3,0,111.692784,0
4,4,0,123.613712,0


In [13]:
width = 3
shifted = train_df_ts.GR.shift(width - 1)
window = shifted.rolling(window=width)
dataframe = pd.concat([window.min(), window.mean(), window.max(), shifted], axis=1)
dataframe.columns = ['min', 'mean', 'max', 't+1']
dataframe = pd.concat([dataframe, train_df_ts])
print(dataframe.head(10))

   GR  label         max        mean         min  row_id         t+1  well_id
0 NaN    NaN         NaN         NaN         NaN     NaN         NaN      NaN
1 NaN    NaN         NaN         NaN         NaN     NaN         NaN      NaN
2 NaN    NaN         NaN         NaN         NaN     NaN  143.510000      NaN
3 NaN    NaN         NaN         NaN         NaN     NaN  112.790928      NaN
4 NaN    NaN  143.510000  126.610928  112.790928     NaN  123.531856      NaN
5 NaN    NaN  123.531856  116.005190  111.692784     NaN  111.692784      NaN
6 NaN    NaN  123.613712  119.612784  111.692784     NaN  123.613712      NaN
7 NaN    NaN  123.613712  118.573712  111.692784     NaN  120.414641      NaN
8 NaN    NaN  123.613712  122.391307  120.414641     NaN  123.145569      NaN
9 NaN    NaN  123.145569  119.258902  114.216497     NaN  114.216497      NaN


In [None]:
train_df_ts.head()

In [14]:
window

Rolling [window=3,center=False,axis=0]

In [22]:
window = train_df_ts.expanding()
dataframe = pd.concat([window.min(), window.mean(), window.max(), train_df_ts.shift(-1)], axis=1)
# dataframe.columns = ['min', 'mean', 'max', 't+1']
print(dataframe.head(5))

   row_id  well_id          GR  label  row_id  well_id          GR  label  \
0     0.0      0.0  143.510000    0.0     0.0      0.0  143.510000    0.0   
1     0.0      0.0  112.790928    0.0     0.5      0.0  128.150464    0.0   
2     0.0      0.0  112.790928    0.0     1.0      0.0  126.610928    0.0   
3     0.0      0.0  111.692784    0.0     1.5      0.0  122.881392    0.0   
4     0.0      0.0  111.692784    0.0     2.0      0.0  123.027856    0.0   

   row_id  well_id      GR  label  row_id  well_id          GR  label  
0     0.0      0.0  143.51    0.0     1.0      0.0  112.790928    0.0  
1     1.0      0.0  143.51    0.0     2.0      0.0  123.531856    0.0  
2     2.0      0.0  143.51    0.0     3.0      0.0  111.692784    0.0  
3     3.0      0.0  143.51    0.0     4.0      0.0  123.613712    0.0  
4     4.0      0.0  143.51    0.0     5.0      0.0  120.414641    0.0  


In [None]:
train_df_ts = train_df_ts.groupby("well_id").apply(create_lags)
train_df_ts = train_df_ts.fillna(0)

valid_df_ts = valid_df_ts.groupby("well_id").apply(create_lags)
valid_df_ts = valid_df_ts.fillna(0)

In [None]:
X_train, y_train, X_test, y_test = train_df_ts.drop(["label"], axis=1), train_df_ts["label"], \
            valid_df_ts.drop(["label"], axis=1), valid_df_ts["label"]

In [None]:
dataframe = concat([temps.shift(3), temps.shift(2), temps.shift(1), temps], axis=1)
dataframe.columns = ['t-3', 't-2', 't-1', 't+1']

In [None]:
mlflow.set_experiment("xgboost_cls_feature_selecting")

class HyperoptHPOptimizer:
    
    def __init__(self, hyperparameters_space, max_evals):
        self.trials = Trials()
        self.max_evals = max_evals
        self.hyperparameters_space = hyperparameters_space
        self.skf = StratifiedKFold(n_splits=3, shuffle=False, random_state=RANDOM_STATE)
    
    def get_loss(self, hyperparameters):
        # MLflow will track and save hyperparameters, loss, and scores. 
        with mlflow.start_run(run_name='hyperopt_param'):
            params = {
                'min_child_weight': 8,
                'gamma': 3,
                'subsample': 1,
                'colsample_bytree': 0.6,
                'eta': 0.3,
                'max_depth':  4,
                'random_state': RANDOM_STATE, 
                'verbosity': 1, 
                'n_jobs': -1, 
                'n_estimators': 10, 
                'learning_rate': 0.1, 
            }
            cols = [col for col, is_use in hyperparameters.items() if is_use == 1]
            for k, v in hyperparameters.items():
                mlflow.log_param(k, v)
            model = xgb.XGBClassifier(**params)
            model.fit(X_train[cols], y_train)
            y_pred = model.predict(X_test[cols])
            loss = accuracy_score(y_test, y_pred)
            # Log the various losses and metrics (on train and validation)
            mlflow.log_metric("accuracy", loss)
            # Use the last validation loss from the history object to optimize
            return {
                'loss': -loss, 
                'status': STATUS_OK,
                'eval_time': time()
            }

    def optimize(self):
        """
        This is the optimization function that given a space of 
        hyperparameters and a scoring function, finds the best hyperparameters.
        """
        # Use the fmin function from Hyperopt to find the best hyperparameters
        # Here we use the tree-parzen estimator method. 
        best = fmin(self.get_loss, self.hyperparameters_space, algo=tpe.suggest, 
                    trials=self.trials,  max_evals=self.max_evals)
        return best



MAX_EVALS = 200

HYPERPARAMETERS_SPACE = {col: hp.choice(col, [0, 1]) for col in X_train.columns.values}

hp_optimizer = HyperoptHPOptimizer(hyperparameters_space=HYPERPARAMETERS_SPACE, max_evals=MAX_EVALS)
optimal_hyperparameters = hp_optimizer.optimize()
print(optimal_hyperparameters)