In [1]:
from tpot import TPOTClassifier

import os
from tqdm import tqdm_notebook as tqdm

# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import warnings
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
import matplotlib.patches as patches
import seaborn as sns
from pylab import rcParams

%matplotlib inline 
plt.style.use('seaborn')
sns.set(style='whitegrid',color_codes=True)

# classifiaction 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import catboost as ctb

# for classification
from sklearn.metrics import accuracy_score

# model selection
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Hp optimization imports
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import mlflow

import re
import eli5
import gc
import random    
import math
import psutil
import pickle
import datetime
from time import time

# save/load models
from joblib import dump
from joblib import load

import timeit 


  init_args = inspect.getargspec(class_.__init__)
  return attr.s(class_, these=these, init=False, slots=True, **attrs_kwargs)  # type: ignore
Using TensorFlow backend.
  class HeadersDict(collections.MutableMapping):


In [2]:
root = "../../data/raw/Gamma_Log_Facies_Type_Prediction/"
models_root = "../../models/Gamma_Log_Facies_Type_Prediction/"
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
pd.set_option('max_columns', 150)
# rcParams['figure.figsize'] = 16,8

In [3]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
%%time
train_df = pd.read_csv(root + "Train_File.csv")
test_df = pd.read_csv(root + "Test_File.csv")
submit_df = pd.read_csv(root + "Submission_File.csv")

CPU times: user 2.81 s, sys: 458 ms, total: 3.27 s
Wall time: 3.32 s


In [5]:
def create_lags(df):
    for i in range(0, 25):
        df["lag_forward_{}".format(i)] = df.GR.shift(i)
        df["lag_backward_{}".format(i)] = df.GR.shift(-i)
    return df

In [6]:
train_df_ts = train_df[train_df["well_id"] < 100]
train_df_ts = train_df_ts.groupby("well_id").apply(create_lags)
train_df_ts = train_df_ts.fillna(0)

valid_df_ts = train_df[train_df["well_id"].isin(list(range(100,120)))]
valid_df_ts = valid_df_ts.groupby("well_id").apply(create_lags)
valid_df_ts = valid_df_ts.fillna(0)


In [7]:
X_train, y_train, X_test, y_test = train_df_ts.drop(["label"], axis=1), train_df_ts["label"], \
            valid_df_ts.drop(["label"], axis=1), valid_df_ts["label"]

In [8]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((22000, 53), (22000,), (5500, 53), (5500,))

In [9]:
%%time

tpot = TPOTClassifier(
    generations=10, 
    population_size=50, 
    verbosity=3, 
    scoring="balanced_accuracy",
    periodic_checkpoint_folder="tpot_report",
    cv=3,
    n_jobs=-1,
    random_state=RANDOM_STATE, 
    max_eval_time_mins=10,
)

tpot.fit(X_train, y_train)

scores.append(tpot.score(X_test, y_test))
tpot.export(f'tpot_exported_pipeline_{i}.py')
print('Scores:', tpot.score(X_test, y_test))   
print('Winning pipelines:', tpot.fitted_pipeline_)RANDOM_STATE

31 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=550, style=ProgressStyle(descript…

Skipped pipeline #20 due to time out. Continuing to the next pipeline.
Skipped pipeline #31 due to time out. Continuing to the next pipeline.
Created new folder to save periodic pipeline: tpot_report
Saving periodic pipeline from pareto front to tpot_report/pipeline_gen_1_idx_0_2019.12.05_10-58-24.py
Saving periodic pipeline from pareto front to tpot_report/pipeline_gen_1_idx_1_2019.12.05_10-58-24.py
_pre_test decorator: _random_mutation_operator: num_test=0 feature_names mismatch: ['row_id', 'well_id', 'GR', 'lag_forward_0', 'lag_backward_0', 'lag_forward_1', 'lag_backward_1', 'lag_forward_2', 'lag_backward_2', 'lag_forward_3', 'lag_backward_3', 'lag_forward_4', 'lag_backward_4', 'lag_forward_5', 'lag_backward_5', 'lag_forward_6', 'lag_backward_6', 'lag_forward_7', 'lag_backward_7', 'lag_forward_8', 'lag_backward_8', 'lag_forward_9', 'lag_backward_9', 'lag_forward_10', 'lag_backward_10', 'lag_forward_11', 'lag_backward_11', 'lag_forward_12', 'lag_backward_12', 'lag_forward_13', 'lag_b

NameError: name 'scores' is not defined

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tpot.export_utils import set_param_recursive

In [3]:
%%time

root = "../../data/raw/Gamma_Log_Facies_Type_Prediction/"

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
train_df = pd.read_csv(root + "Train_File.csv")
test_df = pd.read_csv(root + "Test_File.csv")
submit_df = pd.read_csv(root + "Submission_File.csv")


def create_lags(df):
    for i in range(0, 25):
        df["lag_forward_{}".format(i)] = df.GR.shift(i)
        df["lag_backward_{}".format(i)] = df.GR.shift(-i)
    return df


train = train_df
train = train.groupby("well_id").apply(create_lags)
train = train.fillna(0)

test = test_df[["row_id", "well_id", "GR"]]
test = test.groupby("well_id").apply(create_lags)
test = test.fillna(0)

X_train, y_train = train.drop("label", axis=1), train["label"]
X_test = test

CPU times: user 3min 16s, sys: 9.56 s, total: 3min 26s
Wall time: 3min 27s


In [4]:
X_train.shape, y_train.shape, X_test.shape

((4400000, 53), (4400000,), (2200000, 53))

In [None]:
%%time

# Average CV score on the training set was: 0.8716139282013874
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesClassifier(
        bootstrap=False, 
        criterion="gini", 
        max_features=0.4, 
        min_samples_leaf=3, 
        min_samples_split=10, 
        n_estimators=500,
        n_jobs=-1
    )),
    RandomForestClassifier(
        bootstrap=False, 
        criterion="gini", 
        max_features=0.25, 
        min_samples_leaf=3, 
        min_samples_split=11, 
        n_estimators=500,
        n_jobs=-1
    )
)

# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(X_train, y_train)

In [None]:
model_file = models_root + "tpot_pipeline.pkl"
dump(exported_pipeline, model_file)
# loaded_model = load(model_file)

In [None]:
%%time

results = exported_pipeline.predict(X_test)

In [None]:
submit_df["label"] = results
submit_df.to_csv(root+"submission.csv", index=False)
submit_df.head(20)

In [None]:
# Количество классов для предсказания
g = submit_df["label"].value_counts()
g = g.sort_index().T
plt.bar(g.index, g.values)

In [5]:
%%time

# Average CV score on the training set was: 0.87030022261145
exported_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=14, p=1, weights="distance", n_jobs=-1),
    verbose=True,
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)


exported_pipeline.fit(X_train, y_train)

[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   9.7s
[Pipeline]  (step 2 of 2) Processing kneighborsclassifier, total= 1.2min
CPU times: user 1min 16s, sys: 4.9 s, total: 1min 21s
Wall time: 1min 21s


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=-1, n_neighbors=14, p=1,
                                      weights='distance'))],
         verbose=True)

In [6]:
%%time

results = exported_pipeline.predict(X_test)

KeyboardInterrupt: 

In [7]:
submit_df["label"] = results
submit_df.to_csv(root+"submission.csv", index=False)
submit_df.head(20)

NameError: name 'results' is not defined

In [None]:
# Количество классов для предсказания
g = submit_df["label"].value_counts()
g = g.sort_index().T
plt.bar(g.index, g.values)

In [None]:
model_file = models_root + "tpot_knn_pipeline.pkl"
dump(exported_pipeline, model_file)
# loaded_model = load(model_file)