# Santander Value Prediction Challenge
   Predict the value of transactions for potential customers.

# Loading libraries and Dataset

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,KFold,train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import gc
%matplotlib inline
from copy import deepcopy
try:
    # cPickle only exists on Python 2.
    import cPickle as pickle
except ImportError:
    # Catch only the missing-module case so unrelated errors are not swallowed.
    import pickle



In [2]:
def convert_col_to_proper_int(df_col):
    """Downcast an integer Series to the narrowest integer dtype that holds its values.

    Columns whose dtype name starts with ``int`` or ``uint`` are cast to the
    smallest signed type (when the minimum is negative) or unsigned type
    (otherwise) whose range contains both the column minimum and maximum.
    Any other column is returned unchanged, without inspecting its values.

    Parameters
    ----------
    df_col : pandas.Series
        Column to downcast.

    Returns
    -------
    pandas.Series
        A downcast copy for integer columns that fit one of the candidate
        types; otherwise the input object itself.
    """
    dtype_name = str(df_col.dtype)
    # Check the dtype before touching the data: min()/max() on a mixed-type
    # object column raises TypeError, and is wasted work for non-integers.
    if not dtype_name.startswith(('int', 'uint')):
        return df_col

    c_min = df_col.min()
    c_max = df_col.max()

    if c_min < 0:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    else:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    # Candidates are ordered narrowest first, so the first fit is the smallest.
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if c_min >= limits.min and c_max <= limits.max:
            return df_col.astype(candidate)

    # Nothing fits (e.g. an empty column whose min/max are NaN): leave as is.
    return df_col

def convert_col_to_proper_float(df_col):
    """Downcast a float Series to float32 or float16 when no distinct values merge.

    A narrower dtype is accepted only if the number of unique values after the
    cast equals the number of unique values in the input. float16 is attempted
    only after float32 was accepted and only when the column's min and max lie
    strictly inside the float16 range. Non-float columns are returned unchanged.
    """
    if not str(df_col.dtype).startswith('float'):
        return df_col

    n_distinct = len(np.unique(df_col))

    as_single = df_col.astype(np.float32)
    if len(np.unique(as_single)) != n_distinct:
        # float32 would collapse some values together: keep the original dtype.
        return df_col

    half_limits = np.finfo(np.float16)
    fits_half_range = (as_single.min() > half_limits.min
                       and as_single.max() < half_limits.max)
    if not fits_half_range:
        return as_single

    as_half = as_single.astype(np.float16)
    return as_half if len(np.unique(as_half)) == n_distinct else as_single



def float_to_int(df):
    """Convert float columns that hold only whole numbers to a compact integer dtype.

    Each float column whose values are all whole numbers and all inside the
    int64 range is cast to int64 and then narrowed with
    ``convert_col_to_proper_int``. Columns with fractional values, NaN or
    infinities, or values outside the int64 range keep their float dtype.

    The frame is modified in place and also returned.
    """
    # -2**63 is exactly representable as a float; int64's maximum (2**63 - 1)
    # is not and rounds up to 2**63, so the upper bound has to be exclusive.
    int64_low = float(np.iinfo(np.int64).min)
    int64_high = -int64_low

    for col in df.columns:
        column = df[col]
        if not str(column.dtype).startswith('float'):
            continue
        # NaN and +/-inf produce NaN under `% 1`, so such columns fail this test.
        if not (column % 1 == 0).all():
            continue
        # Casting out-of-range floats to int64 silently produces garbage
        # values, so those columns are left as floats.
        if ((column < int64_low) | (column >= int64_high)).any():
            continue
        df[col] = convert_col_to_proper_int(column.astype(np.int64))

    return df

def float_reduced(df):
    """Shrink every float column of ``df`` with ``convert_col_to_proper_float``.

    Columns are replaced in place; the same frame is returned for chaining.
    """
    float_columns = [name for name in df.columns
                     if str(df[name].dtype).startswith('float')]
    for name in float_columns:
        df[name] = convert_col_to_proper_float(df[name])

    return df

def int_reduced(df):
    """Pass every column of ``df`` through ``convert_col_to_proper_int``.

    The helper decides per column whether anything changes. Columns are
    replaced in place and the same frame is returned for chaining.
    """
    for name in list(df.columns):
        df[name] = convert_col_to_proper_int(df[name])

    return df

## Thank you, Guillaume Martin, for the awesome memory optimizer!
## https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * object, ``datetime64[ns]`` and category columns are cast to ``category``;
    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose (inclusive) range holds the column min/max;
    * bool columns are left untouched;
    * every other column goes through the float ladder: float16 if min/max lie
      strictly inside its range, else float32, else float64.

    Memory usage before and after is printed. The float16/float32 casts can
    lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, for chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Ordered narrowest first so the first fitting type is the smallest one.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        is_numeric_like = ((col_type != object) and (col_type != '<M8[ns]')
                           and (col_type.name != 'category'))
        if not is_numeric_like:
            df[col] = df[col].astype('category')
            continue

        dtype_name = str(col_type)
        if dtype_name == 'bool':
            # Already one byte per value; a float cast would only enlarge it.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if dtype_name.startswith('int'):
            int_ladder = signed_types
        elif dtype_name.startswith('uint'):
            # Unsigned columns get their own ladder; sending them through the
            # float branch would round large values.
            int_ladder = unsigned_types
        else:
            int_ladder = None

        if int_ladder is not None:
            for candidate in int_ladder:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the type limits themselves are valid values.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
# Single seed constant; also seeds NumPy's global random generator.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

In [4]:
%%time
# Loading from pickle example
def _load_pickled_features(store_path, label):
    """Load one pickled feature array and report its type and shape.

    Raises FileNotFoundError when ``store_path`` does not exist, instead of
    skipping the load and failing later with a NameError on the missing
    variable.
    """
    if not os.path.isfile(store_path):
        raise FileNotFoundError('feature store not found: ' + os.path.abspath(store_path))
    print("loading data from pickle file", store_path)
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were produced by this project.
    with open(os.path.abspath(store_path), 'rb') as f:
        features = pickle.load(f, encoding='bytes')
    print(label + ':', type(features), features.shape)
    return features


gp_clustering_ii_train_store = '../features/santander-gp-clustering-ii.trainfeatures.pkl'
gp_clustering_ii_train = _load_pickled_features(gp_clustering_ii_train_store,
                                                'gp_clustering_ii_train')

gp_clustering_ii_test_store = '../features/santander-gp-clustering-ii.testfeatures.pkl'
gp_clustering_ii_test = _load_pickled_features(gp_clustering_ii_test_store,
                                               'gp_clustering_ii_test')

# Stack train rows above test rows, name the columns, then shrink the dtypes.
gp_clustering_ii = int_reduced(float_reduced(float_to_int(
    pd.concat([pd.DataFrame(gp_clustering_ii_train).add_prefix('gp_clustering_ii_'),
               pd.DataFrame(gp_clustering_ii_test).add_prefix('gp_clustering_ii_')
              ],axis=0))))
# The raw arrays are no longer needed once the combined frame exists.
del gp_clustering_ii_train, gp_clustering_ii_test
gc.collect()

gp_clustering_ii.info()


loading data from pickle file ../features/santander-gp-clustering-ii.trainfeatures.pkl
gp_clustering_ii_train: <class 'numpy.ndarray'> (4459, 276)
loading data from pickle file ../features/santander-gp-clustering-ii.testfeatures.pkl
gp_clustering_ii_test: <class 'numpy.ndarray'> (49342, 276)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 276 entries, gp_clustering_ii_0 to gp_clustering_ii_275
dtypes: float16(4), float32(270), float64(2)
memory usage: 57.1 MB
Wall time: 14.9 s


In [5]:
%%time
# Row statistics: 'missing' markers are read as NaN and replaced by 0, then
# train rows are stacked above test rows and the dtypes are shrunk.
df_with_row_statistic = (
    pd.concat(
        [
            pd.read_csv(csv_path, na_values= 'missing').fillna(0)
            for csv_path in ('../features/df_with_row_statistic_train.csv',
                             '../features/df_with_row_statistic_test.csv')
        ],
        axis=0,
    )
    .pipe(float_to_int)
    .pipe(float_reduced)
)
df_with_row_statistic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Data columns (total 24 columns):
non_zero_count           53801 non-null uint16
non_zero_fraction        53801 non-null float16
non_zero_kurtosis        53801 non-null float64
non_zero_log_kurtosis    53801 non-null float64
non_zero_log_max         53801 non-null float64
non_zero_log_mean        53801 non-null float64
non_zero_log_median      53801 non-null float64
non_zero_log_min         53801 non-null float64
non_zero_log_q1          53801 non-null float64
non_zero_log_q3          53801 non-null float64
non_zero_log_skewness    53801 non-null float64
non_zero_log_std         53801 non-null float64
non_zero_log_sum         53801 non-null float64
non_zero_max             53801 non-null float64
non_zero_mean            53801 non-null float64
non_zero_median          53801 non-null float64
non_zero_min             53801 non-null float64
non_zero_q1              53801 non-null float64
non_zero_q3              538

In [6]:
%%time
# Cluster-label features: train rows stacked above test rows, integers downcast.
clustering_features = (
    pd.concat(
        [
            pd.read_csv(csv_path)
            for csv_path in ('../features/train_clustering_features.csv',
                             '../features/test_clustering_features.csv')
        ],
        axis=0,
    )
    .pipe(int_reduced)
)
clustering_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Data columns (total 1 columns):
kmeans_clusters    53801 non-null uint8
dtypes: uint8(1)
memory usage: 472.9 KB
Wall time: 25 ms


In [7]:
%%time
# Dimensionality-reduction features: train rows stacked above test rows,
# whole-number floats turned into ints, remaining floats downcast.
dim_reduction = (
    pd.concat(
        [
            pd.read_csv(csv_path)
            for csv_path in ('../features/train_dim_reduction.csv',
                             '../features/test_dim_reduction.csv')
        ],
        axis=0,
    )
    .pipe(float_to_int)
    .pipe(float_reduced)
)
dim_reduction.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 430 entries, gauss0 to mean_tsvd
dtypes: float64(430)
memory usage: 176.9 MB
Wall time: 12.1 s


In [16]:
%%time
# Manifold features: columns get a 'manif_' prefix before train and test are
# stacked, then the dtypes are shrunk.
manif = (
    pd.concat(
        [
            pd.read_csv(csv_path).add_prefix('manif_')
            for csv_path in ('../features/train_manif.csv',
                             '../features/test_manif.csv')
        ],
        axis=0,
    )
    .pipe(float_to_int)
    .pipe(float_reduced)
)
manif.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Data columns (total 66 columns):
manif_0     53801 non-null float32
manif_1     53801 non-null float32
manif_2     53801 non-null float32
manif_3     53801 non-null float32
manif_4     53801 non-null float32
manif_5     53801 non-null float32
manif_6     53801 non-null float32
manif_7     53801 non-null float32
manif_8     53801 non-null float32
manif_9     53801 non-null float32
manif_10    53801 non-null float32
manif_11    53801 non-null float32
manif_12    53801 non-null float32
manif_13    53801 non-null float32
manif_14    53801 non-null float32
manif_15    53801 non-null float32
manif_16    53801 non-null float32
manif_17    53801 non-null float32
manif_18    53801 non-null float32
manif_19    53801 non-null float32
manif_20    53801 non-null float32
manif_21    53801 non-null float32
manif_22    53801 non-null float32
manif_23    53801 non-null float32
manif_24    53801 non-null float32
manif_25    5380

In [9]:
%%time
# Meta features: train rows stacked above test rows, then dtypes shrunk.
meta_features = (
    pd.concat(
        [
            pd.read_csv(csv_path)
            for csv_path in ('../features/train_meta.csv',
                             '../features/test_meta.csv')
        ],
        axis=0,
    )
    .pipe(float_to_int)
    .pipe(float_reduced)
)
meta_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Data columns (total 7 columns):
max_man    53801 non-null float64
med_man    53801 non-null float16
min_man    53801 non-null uint8
nzm        53801 non-null float64
nzs        53801 non-null float64
soz        53801 non-null uint16
var_man    53801 non-null float64
dtypes: float16(1), float64(4), uint16(1), uint8(1)
memory usage: 2.3 MB
Wall time: 182 ms


In [10]:
%%time
# Space-reduction features: train rows stacked above test rows, then dtypes shrunk.
space_reduction_tresh098 = (
    pd.concat(
        [
            pd.read_csv(csv_path)
            for csv_path in ('../features/train_space_reduction_tresh098.csv',
                             '../features/test_space_reduction_tresh098.csv')
        ],
        axis=0,
    )
    .pipe(float_to_int)
    .pipe(float_reduced)
)
space_reduction_tresh098.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 700 entries, pca0 to grp49
dtypes: float64(700)
memory usage: 287.7 MB
Wall time: 20.1 s


In [11]:
%%time
# t-SNE features: the CSV files have no header row, so column names are
# supplied explicitly before train and test are stacked.
tsne = (
    pd.concat(
        [
            pd.read_csv(csv_path, header=None, names = ['tsne0', 'tsne1'])
            for csv_path in ('../features/train_tsne.csv',
                             '../features/test_tsne.csv')
        ],
        axis=0,
    )
    .pipe(float_to_int)
    .pipe(float_reduced)
)
tsne.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Data columns (total 2 columns):
tsne0    53801 non-null float64
tsne1    53801 non-null float64
dtypes: float64(2)
memory usage: 1.2 MB
Wall time: 82 ms


In [12]:
%%time
# Row statistics plus binary indicator features: train rows stacked above test
# rows, whole-number floats turned into ints, then floats and ints downcast.
row_statistic_and_bin_tresh098 = (
    pd.concat(
        [
            pd.read_csv(csv_path)
            for csv_path in ('../features/train_with_row_statistic_and_bin_tresh098.csv',
                             '../features/test_with_row_statistic_and_bin_tresh098.csv')
        ],
        axis=0,
    )
    .pipe(float_to_int)
    .pipe(float_reduced)
    .pipe(int_reduced)
)
row_statistic_and_bin_tresh098.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 4264 entries, non_zero_min to 9fc776466_is_missing
dtypes: float16(1), float32(2117), float64(24), uint16(1), uint8(2121)
memory usage: 553.8 MB
Wall time: 10min 8s


In [17]:
# Columns of df_with_row_statistic that row_statistic_and_bin_tresh098 lacks.
set(df_with_row_statistic.columns).difference(row_statistic_and_bin_tresh098.columns)

{'unique_values', 'zero_count'}

In [22]:
%%time
# Columns that exist only in dim_reduction, kept in their original order.
# A list built from a set difference would order these string labels
# differently between kernel runs (hash randomization), so the column layout
# of new_dataset would not be reproducible.
space_reduction_cols = set(space_reduction_tresh098.columns)
dim_reduction_only_cols = [col for col in dim_reduction.columns
                           if col not in space_reduction_cols]

# Join all feature blocks side by side; verify_integrity raises if the
# combined column labels contain duplicates. Remaining NaNs become 0.
new_dataset = pd.concat([row_statistic_and_bin_tresh098,
                         df_with_row_statistic[['unique_values', 'zero_count']],
                         tsne,
                         space_reduction_tresh098,
                         meta_features,
                         manif,
                         dim_reduction[dim_reduction_only_cols],
                         clustering_features,
                         gp_clustering_ii], axis=1, verify_integrity=True).fillna(0)
new_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 5369 entries, non_zero_min to gp_clustering_ii_275
dtypes: float16(6), float32(2453), float64(783), uint16(4), uint8(2123)
memory usage: 935.2 MB
Wall time: 2.67 s


In [32]:
# Read only the target column of the training file.
# NOTE(review): the int32 cast truncates any fractional part - confirm the
# targets are whole numbers.
orig_y = (
    pd.read_csv('../input/train.csv', usecols=['target'])
    .astype('int32')
)

In [33]:
# Positional row ranges: the first len(orig_y) rows of new_dataset are the
# training rows, everything after them is test.
train_idx = range(len(orig_y))
test_idx = range(train_idx.stop, new_dataset.shape[0])

In [35]:
%%time
# Persist the combined features, the target and both index ranges as a single
# pickled tuple.
new_dataset_data_store = '../features/crowded_features_data_store.pkl'
store_path = os.path.abspath(new_dataset_data_store)
print( "Saving crowded_features data...")
with open(store_path, 'wb') as f:
    pickle.dump((new_dataset, orig_y, train_idx, test_idx),
                f, protocol = pickle.HIGHEST_PROTOCOL)
    print('Saved to', store_path)

Saving crowded_features data...
Saved to C:\santander-value-prediction-challenge\features\crowded_features_data_store.pkl
Wall time: 5.16 s
