In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

import os, sys, shelve

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler, Normalizer, FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [None]:
def val_error(model, tested, prediction):
    """
    Compute the ROC AUC of a fitted estimator on a held-out set.

    Parameters
    ----------
    model: object
        Trained estimator exposing `predict` and, optionally,
        `predict_proba` or `decision_function`.

    tested: array-like, shape (n_samples, n_features)
        Feature matrix to score.

    prediction: array-like, shape (n_samples)
        True labels corresponding to the rows of `tested`.

    Returns
    -------
    float
        Value returned by `roc_auc_score`. Despite the function name this
        is a score, not an error: higher is better.
    """
    # ROC AUC is a ranking metric, so feed it continuous scores whenever the
    # model offers them; thresholded labels collapse the curve to one point.
    y_score = None
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(tested)
        # Binary case only: by scikit-learn convention column 1 holds the
        # probability of the greater label (classes_[1]).
        if getattr(proba, "ndim", None) == 2 and proba.shape[1] == 2:
            y_score = proba[:, 1]
    if y_score is None and hasattr(model, "decision_function"):
        margin = model.decision_function(tested)
        # A 1-D margin is the binary case; other shapes fall through.
        if getattr(margin, "ndim", None) == 1:
            y_score = margin
    if y_score is None:
        # No usable continuous score: keep the previous behaviour
        # (hard class predictions).
        y_score = model.predict(tested)
    return roc_auc_score(prediction, y_score)

In [None]:
# data-loader
from zipfile import ZipFile
def loader(path, index_col=False):
    """
    Read every CSV found directly in a folder, or inside its zip archives,
    into a dict of DataFrames.

    Parameters
    ----------
    path: str
        Folder to scan. Only its direct entries are inspected; entries ending
        in ".zip" are opened and their ".csv" members are read, entries ending
        in ".csv" are read as-is, everything else is ignored.

    index_col: int, str, sequence of int / str, or False, default False
        Forwarded unchanged to `pd.read_csv`. Column(s) to use as the row
        labels; a sequence produces a MultiIndex. The default `False` forces
        pandas not to use the first column as the index (useful for malformed
        files with a delimiter at the end of each line).

    Returns
    -------
    dict
        Maps each CSV name without its extension to the loaded DataFrame.
        For zip members the key is the member name as stored in the archive
        (minus the extension). Keys are inserted in `os.listdir` order, which
        is not guaranteed to be stable across platforms; a later file with the
        same key overwrites an earlier one.
    """
    data_dict = {}
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        extension = os.path.splitext(entry)[1]
        if extension == ".zip":
            with ZipFile(full_path, 'r') as archive:
                for member in archive.namelist():
                    if not member.endswith('.csv'):
                        continue
                    with archive.open(member) as handle:
                        key = os.path.splitext(member)[0]
                        data_dict[key] = pd.read_csv(handle, index_col=index_col)
        elif extension == ".csv":
            # Hand pandas the path rather than a text-mode handle so that plain
            # files are decoded by pandas exactly like the zipped ones, instead
            # of with the platform's default text encoding.
            key = os.path.splitext(entry)[0]
            data_dict[key] = pd.read_csv(full_path, index_col=index_col)
    return data_dict

In [None]:
# Resolve ../input against the current working directory and load every
# CSV (plain or zipped) found there.
input_dir = os.path.realpath('../input')
data = loader(input_dir)
# Show which tables were found.
data.keys()

In [None]:
# NOTE(review): this positional unpacking relies on the dict's insertion order,
# which follows the os.listdir() order used inside loader() — confirm that it
# matches the names on the left, or select the frames by key instead.
df_train_x, df_train_y, df_test_x, df_test_y = data.values()
# Drop the dict reference; the four frames stay bound to the names above.
del data

In [None]:
# Persist the loaded (not yet preprocessed) frames to a shelve database
with shelve.open(os.path.realpath('../kernels/loaded_data')) as s:
    s.update({
        "df_train_x": df_train_x,
        "df_test_x": df_test_x,
        "df_train_y": df_train_y,
        "df_test_y": df_test_y,
    })

In [None]:
# Preprocessing pipeline: impute -> standardise -> normalise
pipePre = Pipeline(steps=[
    # replace NaN entries with the constant 0
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)),
    # StandardScaler with its default settings
    ('standardscaler', StandardScaler()),
    # Normalizer with its default settings
    ('normalizer', Normalizer()),
])

In [None]:
# Fit the preprocessing steps on the training features only, then reuse the
# fitted pipeline for the test features so both share the same statistics
# (re-fitting on the test set would scale it differently from the training set).
df_train_x = pipePre.fit_transform(df_train_x)
df_test_x = pipePre.transform(df_test_x)
# The transformed arrays are kept on purpose: the train/validation split that
# follows reads df_train_x, so deleting it here would raise a NameError.

In [None]:
# Hold out 25% of the training rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train_x, df_train_y, test_size=0.25, random_state=42
)
# Read the row counts by index instead of unpacking into `_`: binding `_` at
# notebook top level shadows IPython's last-output variable.
N_train = X_train.shape[0]
N_test = X_test.shape[0]
print(N_train, N_test)