In [1]:
import numpy as np
import pandas as pd

from scipy.stats import skew, kurtosis

import os
import gc

#pd.set_option('max_columns', None)

In [2]:
# Root directory of the competition data; the cells below read from and write
# to it (raw CSVs live in its `input/` subfolder).
# Set the SANTANDER_DATA_DIR environment variable to run the notebook on a
# different machine. The original local path stays the default, so existing
# setups behave exactly as before.
PATH_TO_DATA = os.environ.get(
    'SANTANDER_DATA_DIR',
    'D:/Py/DataFrames/Santander_Value_Prediction_Challenge(KAGGLE)/',
)

In [3]:
def del_id_and_target(X):
    """Return a copy of ``X`` holding only the feature columns.

    The ``ID`` column is always dropped (a ``KeyError`` is raised if it is
    absent); the ``target`` column is dropped as well when the frame has one.
    The input frame itself is not modified.
    """
    unwanted = ['ID', 'target'] if 'target' in X.columns else ['ID']
    return X.drop(unwanted, axis=1)

In [4]:
# Load the raw competition files from the `input/` folder under PATH_TO_DATA.
raw_train_path = os.path.join(PATH_TO_DATA, 'input/train.csv')
raw_test_path = os.path.join(PATH_TO_DATA, 'input/test.csv')
train_df = pd.read_csv(raw_train_path)
test_df = pd.read_csv(raw_test_path)

## Info

In [5]:
# Preview the first three training rows: ID, target, then the feature columns.
train_df.head(3)

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [6]:
# Preview the first three test rows: ID followed by the feature columns (no target).
test_df.head(3)

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Compare (rows, columns) of both frames; train carries one extra column (target).
print('Train shape:', train_df.shape)
print('Test shape:', test_df.shape)

Train shape: (4459, 4993)
Test shape: (49342, 4992)


In [8]:
# Summarise dtypes and memory footprint of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [9]:
# Summarise dtypes and memory footprint of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


## Checking constant and mostly-zero columns

In [10]:
# A column counts as constant when every row equals the value in the first row.
# NaN never compares equal to itself, so a column containing NaN is not flagged.
is_constant = train_df.eq(train_df.iloc[0]).all()
constans_columns = train_df.columns[is_constant.values].tolist()
print(len(constans_columns))

256


In [11]:
def find_too_freq_values(threshold=0.98, constant_value=0, df=None):
    """List the columns in which ``constant_value`` dominates.

    Parameters
    ----------
    threshold : float
        Minimum fraction of rows (0..1) that must equal ``constant_value``
        for a column to be reported.
    constant_value : scalar
        The value whose frequency is measured (0 by default).
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``train_df`` is
        used, which is what the original zero-argument call relied on.

    Returns
    -------
    list
        Column labels, in frame order, whose fraction of rows equal to
        ``constant_value`` is at least ``threshold``. The denominator is the
        total row count, so NaN rows count as "not equal".

    Notes
    -----
    The fraction is computed with a direct equality test instead of
    ``value_counts()[constant_value]``. Looking up an integer key on a
    non-integer index (e.g. a text column) is treated as a *position* by
    older pandas, which returned the count of the most frequent value and
    could wrongly flag a constant text column; newer pandas deprecates that
    fallback.
    """
    if df is None:
        df = train_df  # fall back to the notebook's training frame

    return [
        column
        for column in df.columns
        if (df[column] == constant_value).mean() >= threshold
    ]

In [12]:
%%time
# Use the function defaults: flag columns where the value 0 fills at least 98% of rows.
cols_with_too_freq_values = find_too_freq_values()
print(len(cols_with_too_freq_values))

2870
Wall time: 6.72 s


In [13]:
# Merge both removal lists; going through a set drops columns flagged by both checks.
cols_to_remove = list(set(constans_columns).union(cols_with_too_freq_values))
print(len(cols_to_remove))

2870


In [14]:
# Keep every test-set feature that was not flagged for removal, in the
# original column order.
# - "ID" is excluded by name rather than by assuming it is the first column.
# - Membership is checked against a set so the filter stays linear in the
#   number of columns instead of scanning the removal list for every column.
_cols_to_remove_lookup = set(cols_to_remove)
cols_to_use = [
    col for col in test_df.columns
    if col != 'ID' and col not in _cols_to_remove_lookup
]

In [15]:
# Restrict both frames to the retained features. "ID" (and "target" for train)
# are appended after the features, so they end up as the last columns.
# Note: this rebinds train_df / test_df; the unfiltered frames are no longer
# reachable under these names after this cell runs.
train_df = train_df[cols_to_use + ['ID', 'target']]
test_df = test_df[cols_to_use + ['ID']]

In [16]:
# Report the frame shapes after dropping the constant / mostly-zero columns.
print('Threshold 0.98 freq. values')
print('Train:', train_df.shape)
print('Test: ', test_df.shape)

Threshold 0.98 freq. values
Train: (4459, 2123)
Test:  (49342, 2122)


In [17]:
# Persist the column-filtered frames next to the raw inputs (row index not written).
filtered_train_out = os.path.join(PATH_TO_DATA, 'input/train_threshold_098.csv')
filtered_test_out = os.path.join(PATH_TO_DATA, 'input/test_threshold_098.csv')
train_df.to_csv(filtered_train_out, index=False)
test_df.to_csv(filtered_test_out, index=False)

## Aggregations

In [5]:
# Reload the column-filtered frames written by the filtering step above.
filtered_train_in = os.path.join(PATH_TO_DATA, 'input/train_threshold_098.csv')
filtered_test_in = os.path.join(PATH_TO_DATA, 'input/test_threshold_098.csv')
train_df = pd.read_csv(filtered_train_in)
test_df = pd.read_csv(filtered_test_in)

In [14]:
# Names of the per-row distribution statistics, in output order.
_ROW_STAT_NAMES = ('min', 'max', 'mean', 'median', 'sum', 'std',
                   'skewness', 'kurtosis', 'q1', 'q3')


def _row_stats(values, prefix):
    """Return the ten distribution statistics of ``values``, keys prefixed by ``prefix``."""
    return {
        prefix + 'min': values.min(),
        prefix + 'max': values.max(),
        prefix + 'mean': values.mean(),
        prefix + 'median': values.median(),
        prefix + 'sum': values.sum(),
        prefix + 'std': values.std(),
        prefix + 'skewness': skew(values),
        prefix + 'kurtosis': kurtosis(values),
        prefix + 'q1': np.percentile(values, q=25),
        prefix + 'q3': np.percentile(values, q=75),
    }


def aggregate_row(row):
    """Summarise the nonzero entries of one row.

    Parameters
    ----------
    row : pandas.Series
        Numeric values of a single sample (intended for
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        22 entries, in this order: the ten ``non_zero_*`` statistics (min,
        max, mean, median, sum, std, skewness, kurtosis, q1, q3) of the raw
        nonzero values, the same ten ``non_zero_log_*`` statistics of their
        ``log1p`` transform, then ``non_zero_count`` and
        ``non_zero_fraction`` (nonzero count divided by the row's non-NaN
        count).

        When the row has fewer than two distinct nonzero values, all twenty
        statistics are NaN and both count and fraction are 0.
    """
    # Boolean mask instead of Series.nonzero(), which was deprecated in
    # pandas 0.24 and removed in 1.0. NaN != 0 is True, so NaN entries are
    # still treated as "nonzero", exactly as numpy's nonzero did.
    non_zero_values = row[row != 0]

    if non_zero_values.nunique() > 1:
        # Compute the log transform once and reuse it for every log statistic.
        log_values = np.log1p(non_zero_values.astype('float64'))
        aggs = _row_stats(non_zero_values, 'non_zero_')
        aggs.update(_row_stats(log_values, 'non_zero_log_'))
        n_non_zero = non_zero_values.count()
        aggs['non_zero_count'] = n_non_zero
        aggs['non_zero_fraction'] = n_non_zero / row.count()
    else:
        aggs = {
            prefix + name: np.nan
            for prefix in ('non_zero_', 'non_zero_log_')
            for name in _ROW_STAT_NAMES
        }
        # NOTE(review): count/fraction are reported as 0 even when the row
        # holds nonzero values that are all identical. Kept as in the original
        # so previously generated features stay comparable -- confirm intent.
        aggs['non_zero_count'] = 0
        aggs['non_zero_fraction'] = 0

    return pd.Series(aggs)

In [16]:
%%time
# Compute the per-row statistics on the feature columns only (ID/target removed).
# apply(..., axis=1) calls aggregate_row once per row; the recorded run of this
# cell took several minutes.
train_df_with_row_statistic = del_id_and_target(train_df).apply(aggregate_row, axis=1)
test_df_with_row_statistic = del_id_and_target(test_df).apply(aggregate_row, axis=1)

Wall time: 6min 53s


In [17]:
def dummies_missing(X, missing_value=0):
    """Append a 0/1 "is missing" indicator column for every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    missing_value : scalar
        Value treated as "missing" (0 by default).

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one ``<col>_is_missing`` integer
        column per input column (1 where the cell equals ``missing_value``,
        else 0). ``X`` itself is not modified.
    """
    missing_mask = X.eq(missing_value).values
    missing_columns = [f'{col}_is_missing' for col in X.columns]
    # Build the indicators on X's own index: join aligns on index labels, so a
    # default RangeIndex here would misalign (and produce NaN for) any frame
    # whose index is not 0..n-1, e.g. after filtering or sampling rows.
    X_is_missing = pd.DataFrame(
        missing_mask.astype(int), columns=missing_columns, index=X.index
    )
    return X.join(X_is_missing)

In [18]:
%%time
# Add a "<col>_is_missing" indicator per feature column, treating 0 (the
# function's default missing_value) as missing; ID/target are removed first.
train_df_binarized = dummies_missing(del_id_and_target(train_df))
test_df_binarized = dummies_missing(del_id_and_target(test_df))

Wall time: 3.03 s


In [19]:
# Join the row statistics with the features + missing indicators (aligned on the row index).
# Note: the *_with_row_statistic names are rebound to the joined result, so
# re-running this cell alone would try to join the same columns a second time
# -- presumably failing on overlapping column names; re-run the aggregation
# cell first.
train_df_with_row_statistic = train_df_with_row_statistic.join(train_df_binarized)
test_df_with_row_statistic = test_df_with_row_statistic.join(test_df_binarized)

In [23]:
# Persist the joined feature sets directly under PATH_TO_DATA (row index not written).
row_stat_train_out = os.path.join(PATH_TO_DATA, 'train_with_row_statistic_and_bin_thresh098.csv')
row_stat_test_out = os.path.join(PATH_TO_DATA, 'test_with_row_statistic_and_bin_thresh098.csv')
train_df_with_row_statistic.to_csv(row_stat_train_out, index=False)
test_df_with_row_statistic.to_csv(row_stat_test_out, index=False)

## Create dimensionality reduction features

In [18]:
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection

In [19]:
# Dimensionality-reduction estimators keyed by the prefix used for their output
# columns. Insertion order determines the column order of the generated frame.
# NOTE(review): PCA keeps 500 components while every other extractor keeps 50,
# and the result is saved to files named "*_50comp.csv" -- confirm that 500 is
# intended.
feat_extractors_dict = dict(
    pca=PCA(n_components=500, random_state=44),
    tsvd=TruncatedSVD(n_components=50, n_iter=10, random_state=44),
    fa=FactorAnalysis(n_components=50, random_state=44),
    srp=SparseRandomProjection(n_components=50, random_state=44),
    grp=GaussianRandomProjection(n_components=50, eps=0.1, random_state=44),
)

In [20]:
def create_dim_reduction_feats(df, train=False, extractors=None):
    """Project ``df`` through every extractor and return the stacked components.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame passed to each extractor.
    train : bool
        When True each extractor is fitted on ``df`` (``fit_transform``);
        otherwise the already-fitted extractor is applied (``transform``).
    extractors : dict, optional
        Mapping ``prefix -> estimator``. Defaults to the notebook-level
        ``feat_extractors_dict``. Fitting mutates the estimators in place, so
        the train call must precede any non-train call on the same mapping.

    Returns
    -------
    pandas.DataFrame
        One column per component, named ``<prefix><i>``, in mapping order,
        indexed like ``df``.
    """
    if extractors is None:
        extractors = feat_extractors_dict

    blocks = []
    all_cols = []
    for name, extractor in extractors.items():
        if train:
            print(f'Process train {name}')
            projected = extractor.fit_transform(df)
        else:
            print(f'Process {name}')
            projected = extractor.transform(df)

        all_cols.extend(f'{name}{i}' for i in range(projected.shape[1]))
        blocks.append(projected)

    # Stack once at the end. The previous `len(acc) == 0` "first block" check
    # also fired for a zero-row block, discarding earlier blocks and leaving
    # fewer data columns than column names.
    if blocks:
        stacked = np.hstack(blocks)
    else:
        stacked = np.empty((len(df), 0))

    return pd.DataFrame(stacked, columns=all_cols, index=df.index)

In [21]:
%%time
# Fit every extractor on the training features, then apply the fitted
# extractors to the test features. Order matters: the test call uses
# transform() and therefore needs the extractors fitted by the train call.
train_space_reduction = create_dim_reduction_feats(del_id_and_target(train_df), train=True)
test_space_reduction = create_dim_reduction_feats(del_id_and_target(test_df))

Process train pca
Process train tsvd
Process train fa
Process train srp
Process train grp
Process pca
Process tsvd
Process fa
Process srp
Process grp
Wall time: 19.7 s


In [22]:
# Preview the first three rows of the projected training features.
train_space_reduction.head(3)

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,grp40,grp41,grp42,grp43,grp44,grp45,grp46,grp47,grp48,grp49
0,-15086030.0,-573018.614867,57685.746183,-650521.789298,-421801.577139,-357791.234921,491722.914574,700494.72898,504996.197311,-4016240.0,...,-24603600.0,-2010908.0,27966130.0,1743904.0,-10910640.0,2242675.0,-24756860.0,46852320.0,-11395060.0,6888125.0
1,-21319310.0,-730451.349052,42760.689456,-710564.446199,-469393.988729,-347590.673528,412821.639203,762472.641524,499498.747433,-3775476.0,...,1486469.0,13245930.0,12634040.0,7633258.0,589583.7,-2929417.0,15655050.0,3879774.0,-719187.1,-2446653.0
2,-28913960.0,-786310.308309,10106.571984,-809295.515844,-468074.317391,-355196.826444,491272.337027,748926.912399,503339.329014,-3878444.0,...,-82194.37,-2596314.0,-503800.3,-5997508.0,-1918620.0,-3372705.0,-155501.3,157357.9,-2900414.0,-855391.2


In [23]:
# Preview the first three rows of the projected test features.
test_space_reduction.head(3)

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,grp40,grp41,grp42,grp43,grp44,grp45,grp46,grp47,grp48,grp49
0,-20373030.0,-879010.0,-357441.425457,-531578.7,791866.457126,-351550.7,95661.115141,523455.91035,446759.612164,-3382357.0,...,222189800.0,-112467900.0,-71813610.0,-158569400.0,105246800.0,-44597390.0,278258300.0,-124445300.0,30989190.0,88404860.0
1,-30846240.0,-773947.7,3205.690104,-795767.5,-471353.891777,-334012.3,447083.962712,761726.456722,503105.54769,-3822216.0,...,2706779.0,410881.1,-4746959.0,3119373.0,-7279593.0,1191656.0,-3602817.0,6099906.0,2594159.0,-3508032.0
2,-23301570.0,-1096363.0,-21789.269278,-1411854.0,-939358.460252,2307687.0,214006.579221,244931.112407,258829.644561,-3112630.0,...,-15924570.0,-16877040.0,24413890.0,-3193522.0,1910976.0,9466727.0,10827440.0,2659513.0,-1493686.0,15917410.0


In [24]:
# Persist the projected features directly under PATH_TO_DATA (row index not written).
space_reduction_train_out = os.path.join(PATH_TO_DATA, 'train_space_reduction_50comp.csv')
space_reduction_test_out = os.path.join(PATH_TO_DATA, 'test_space_reduction_50comp.csv')
train_space_reduction.to_csv(space_reduction_train_out, index=False)
test_space_reduction.to_csv(space_reduction_test_out, index=False)