# Santander Value Prediction Challenge
   Predict the value of transactions for potential customers.

# Loading libraries and Dataset

In [109]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,KFold,train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import gc
%matplotlib inline
from copy import deepcopy
try:
    import cPickle as pickle
except:
    import pickle



In [2]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
df_train=pd.read_csv('../input/train.csv')
df_test=pd.read_csv('../input/test.csv')

In [36]:
# NOTE(review): astype(np.int32) truncates any fractional part of the target.
# The probes further down in this notebook report 1413 unique target values
# before the cast and 1374 after it, so distinct targets are being merged here.
# Confirm this loss is acceptable before training on the converted column.
df_train['target'] = df_train['target'].astype(np.int32)

In [37]:
print('Shape of training dataset: ',df_train.shape)
df_train.head()

Shape of training dataset:  (4459, 4737)


Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [5]:
print('Shape of test dataset: ',df_test.shape)
df_test.head()

Shape of test dataset:  (49342, 4992)


Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


## Checking Missing Values

Defining function to check missing values and percentage of missing values in each column.

In [8]:
def check_missing_data(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of null cells) and 'Percent' (share of null cells, 0-100).
    Each series is sorted in descending order before being combined.
    """
    null_mask = df.isnull()
    missing_total = null_mask.sum().sort_values(ascending=False)
    missing_share = null_mask.sum() / null_mask.count() * 100
    missing_percent = missing_share.sort_values(ascending=False)
    return pd.concat(
        [missing_total, missing_percent],
        axis=1,
        keys=['Total', 'Percent'],
    )

In [9]:
check_missing_data(df_train).head()

Unnamed: 0,Total,Percent
9fc776466,0,0.0
083640132,0,0.0
26e2c24e3,0,0.0
c3726f249,0,0.0
a682ef110,0,0.0


In [10]:
check_missing_data(df_test).head()

Unnamed: 0,Total,Percent
9fc776466,0,0.0
f0aa40974,0,0.0
c3726f249,0,0.0
a682ef110,0,0.0
b452ba57e,0,0.0


In [11]:
#### Check if there are any NULL values in Train Data
# Count nulls once and keep only the columns that actually contain some.
train_nan_counts = df_train.isnull().sum()
train_nan_counts = train_nan_counts[train_nan_counts != 0]
print("Total Train Features with NaN Values = " + str(train_nan_counts.size))
if train_nan_counts.size:
    print("Features with NaN => {}".format(list(train_nan_counts.index)))
    # A bare expression inside an `if` is never displayed by Jupyter,
    # so the per-column counts have to be printed explicitly.
    print(train_nan_counts.sort_values(ascending = False))

Total Train Features with NaN Values = 0


In [12]:
#### Check if there are any NULL values in Test Data
# Count nulls once and keep only the columns that actually contain some.
test_nan_counts = df_test.isnull().sum()
test_nan_counts = test_nan_counts[test_nan_counts != 0]
print("Total Test Features with NaN Values = " + str(test_nan_counts.size))
if test_nan_counts.size:
    print("Features with NaN => {}".format(list(test_nan_counts.index)))
    # A bare expression inside an `if` is never displayed by Jupyter,
    # so the per-column counts have to be printed explicitly.
    print(test_nan_counts.sort_values(ascending = False))

Total Test Features with NaN Values = 0


# Checking unique values in each column
A column with only one unique value carries no information, so we can drop such columns.

In [13]:
df_tmp=pd.DataFrame(df_train.nunique().sort_values(),columns=['num_unique_values']).reset_index().rename(columns={'index':'Column_name'})
df_tmp.head()

Unnamed: 0,Column_name,num_unique_values
0,d9a8615f3,1
1,4a64e56e7,1
2,34cc56e83,1
3,19122191d,1
4,080540c81,1


In [79]:
df_tmp.tail(10)

Unnamed: 0,Column_name,num_unique_values
4983,20aa07010,714
4984,6eef030c1,721
4985,15ace8c9f,725
4986,fb0f5dbfe,728
4987,9fd594eec,739
4988,eeb9cd3aa,755
4989,f190486d6,765
4990,58e2e02e6,776
4991,target,1413
4992,ID,4459


In [14]:
def col_name_with_n_unique_value(df,n):
    """Return the names of the columns of `df` that have exactly `n` unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    n : int
        Required number of distinct (non-null) values per column.

    Returns
    -------
    list
        Matching column names, ordered by the ascending sort of the
        per-column unique counts. The count is also printed.
    """
    unique_counts = df.nunique().sort_values()
    # Filter on `n` (the previous version always compared against 1) and read
    # the names from the index so a named columns Index cannot break the lookup.
    col_name = list(unique_counts[unique_counts == n].index)
    print('number of columns with only',n,'unique values are: ',len(col_name))
    return col_name

In [15]:
col_to_drop=col_name_with_n_unique_value(df_train,1)

number of columns with only 1 unique values are:  256


In [16]:
# Columns to drop because there is no variation in training set
# Compute the standard deviation on the same ID-free frame whose columns are
# being indexed, so the boolean mask and the column Index always line up and
# the non-numeric 'ID' column never reaches std().
train_without_id = df_train.drop("ID", axis=1)
zero_std_cols = train_without_id.columns[train_without_id.std() == 0]
print('number of columns with no variation in training set are: ', len(zero_std_cols))

number of columns with no variation in training set are:  256


### Dropping unnecessary columns from train and test dataset

In [17]:
# Remove the single-valued columns from both frames in place.
for frame in (df_train, df_test):
    frame.drop(columns=col_to_drop, inplace=True)
print('Shape of train dataset after droping columns: ', df_train.shape)
print('Shape of test dataset after droping columns: ', df_test.shape)

Shape of train dataset after droping columns:  (4459, 4737)
Shape of test dataset after droping columns:  (49342, 4736)


# Additional cleaning

In [18]:
# Get the combined data
# Train rows (without 'target') are stacked on top of the test rows and the
# 'ID' column is removed. The original row labels are kept, so labels restart
# at 0 for the test part (the .info() output below shows 53801 entries labelled
# "0 to 49341"); rows must therefore be addressed by position, not by label.
total_df = pd.concat([df_train.drop('target', axis=1), df_test], axis=0).drop('ID', axis=1)

In [19]:
%%time
# Removing duplicate columns
# Taken from: https://www.kaggle.com/scirpus/santander-poor-mans-tsne
colsToRemove = []
colsScaned = []
dupList = {}
columns = total_df.columns
for i in range(len(columns)-1):
    v = df_train[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v, df_train[columns[j]].values):
            colsToRemove.append(columns[j])
            if columns[j] not in colsScaned:
                dupCols.append(columns[j]) 
                colsScaned.append(columns[j])
                dupList[columns[i]] = dupCols
colsToRemove = list(set(colsToRemove))
total_df.drop(colsToRemove, axis=1, inplace=True)
print(f">> Dropped {len(colsToRemove)} duplicate columns")

>> Dropped 5 duplicate columns
Wall time: 5min 44s


In [20]:
gc.collect()
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float64(4730)
memory usage: 1.9 GB


In [49]:
def convert_col_to_proper_int(df_col):
    """Downcast an integer Series to the narrowest integer dtype that holds it.

    Columns whose dtype name does not start with 'int' or 'uint' are returned
    unchanged. Columns with a negative minimum are matched against the signed
    types (int8 .. int64) using both min and max; all other columns are
    matched against the unsigned types (uint8 .. uint64) using the max only.
    """
    dtype_name = str(df_col.dtype)
    c_min = df_col.min()
    c_max = df_col.max()

    if not (dtype_name[:3] == 'int' or dtype_name[:4] == 'uint'):
        return df_col

    if c_min < 0:
        # Signed candidates, narrowest first.
        for candidate in (np.int8, np.int16, np.int32, np.int64):
            limits = np.iinfo(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                return df_col.astype(candidate)
    else:
        # Non-negative data: unsigned candidates, narrowest first.
        for candidate in (np.uint8, np.uint16, np.uint32, np.uint64):
            if c_max <= np.iinfo(candidate).max:
                return df_col.astype(candidate)

    # No candidate matched: hand the column back untouched.
    return df_col

def convert_col_to_proper_float(df_col):
    """Downcast a float Series to float32 / float16 when the unique count survives.

    A narrower dtype is accepted only if the number of distinct values after
    the cast equals the number before it. Individual values may still be
    rounded; only the count of distinct values is compared. Non-float columns
    are returned unchanged.
    """
    if str(df_col.dtype)[:5] != 'float':
        return df_col

    distinct_before = len(np.unique(df_col))

    as_float32 = df_col.astype(np.float32)
    if len(np.unique(as_float32)) != distinct_before:
        # float32 merges values: keep the original precision.
        return df_col

    half_limits = np.finfo(np.float16)
    fits_half_range = as_float32.min() > half_limits.min and as_float32.max() < half_limits.max
    if fits_half_range:
        as_float16 = as_float32.astype(np.float16)
        if len(np.unique(as_float16)) == distinct_before:
            return as_float16

    return as_float32



def float_to_int(df):
    """Convert whole-number float columns of `df` to compact integer dtypes.

    Every float column whose values all satisfy `value % 1 == 0` is cast to
    int64 and then narrowed by `convert_col_to_proper_int`. The frame is
    modified in place and also returned.
    """
    for name in df.columns:
        if str(df[name].dtype)[:5] != 'float':
            continue
        holds_only_whole_numbers = (df[name] % 1 == 0).all()
        if not holds_only_whole_numbers:
            continue
        as_int64 = df[name].astype(np.int64)
        df[name] = convert_col_to_proper_int(as_int64)

    return df

def float_reduced(df):
    """Narrow every float column of `df` via `convert_col_to_proper_float`.

    The frame is modified in place and also returned.
    """
    for name in df.columns:
        is_float_column = str(df[name].dtype)[:5] == 'float'
        if not is_float_column:
            continue
        df[name] = convert_col_to_proper_float(df[name])

    return df

## Thank you Guillaume Martin for the Awesome Memory Optimizer!
## https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the columns of `df` in place to reduce memory usage.

    * object, datetime64[ns] and category columns are cast to 'category';
    * signed integer columns go to the narrowest of int8..int64 whose range
      strictly contains the column's min and max;
    * unsigned integer columns go to the narrowest of uint8..uint64 whose
      maximum is strictly above the column's max;
    * boolean columns are left unchanged;
    * every other column goes to float16 / float32 / float64 by value range.
      This float downcast looks at the range only, so precision can be lost.

    Memory usage before and after is printed. Returns the same (mutated) frame.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if not ((col_type != object) & (col_type != '<M8[ns]') & (col_type.name != 'category')):
            df[col] = df[col].astype('category')
            continue

        is_numpy_dtype = isinstance(col_type, np.dtype)
        if is_numpy_dtype and col_type.kind == 'b':
            # Booleans already take one byte. Previously they fell through to
            # the float branch and were turned into float16.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in signed_types:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif is_numpy_dtype and col_type.kind == 'u':
            # 'uint8'[:3] is 'uin', so unsigned columns used to miss the
            # integer branch and were converted to floats. Keep them unsigned.
            for candidate in unsigned_types:
                if c_max < np.iinfo(candidate).max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [50]:
%%time
total_df = float_to_int(total_df)
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float32(1325), float64(3405)
memory usage: 1.6 GB
Wall time: 6.78 s


In [51]:
%%time
total_df = float_reduced(total_df)
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float32(4720), float64(10)
memory usage: 973.2 MB
Wall time: 24min 16s


In [61]:
list(total_df.dtypes[(total_df.dtypes == 'float64')].index.values)
# ['0943b38f3',
#  '6cd62da62',
#  'ef139d7ac',
#  'c2dae3a5a',
#  '51c250e53',
#  '1f0a4e1f9',
#  '81e3fed66',
#  '1af4d24fa',
#  'f6c436744',
#  'fe919be32']

['0943b38f3',
 '6cd62da62',
 'ef139d7ac',
 'c2dae3a5a',
 '51c250e53',
 '1f0a4e1f9',
 '81e3fed66',
 '1af4d24fa',
 'f6c436744',
 'fe919be32']

In [80]:
df_tmp2=pd.DataFrame(total_df.nunique().sort_values(),columns=['num_unique_values']).reset_index().rename(columns={'index':'Column_name'})
df_tmp2.head(10)

Unnamed: 0,Column_name,num_unique_values
0,73a32cd05,171
1,5a1183b8c,171
2,0345f70f2,172
3,770d3a11c,174
4,1b6c8debf,174
5,d0e4129cb,175
6,ae82c2c99,175
7,969d32625,175
8,05d95861b,176
9,47d17d395,177


In [81]:
df_tmp2.tail(10)

Unnamed: 0,Column_name,num_unique_values
4720,1702b5bf0,2554
4721,b43a7cfd5,2572
4722,fb0f5dbfe,2582
4723,6eef030c1,2596
4724,20aa07010,2596
4725,15ace8c9f,2607
4726,58e2e02e6,2623
4727,9fd594eec,2641
4728,eeb9cd3aa,2647
4729,f190486d6,2719


In [82]:
df_tmp2[df_tmp2['num_unique_values']<=255]

Unnamed: 0,Column_name,num_unique_values
0,73a32cd05,171
1,5a1183b8c,171
2,0345f70f2,172
3,770d3a11c,174
4,1b6c8debf,174
5,d0e4129cb,175
6,ae82c2c99,175
7,969d32625,175
8,05d95861b,176
9,47d17d395,177


In [86]:
list(np.unique(total_df['0345f70f2']))

[0.0,
 803.7826,
 1206.089,
 1359.8595,
 5171.236,
 9450.01,
 12352.65,
 17806.025,
 22310.176,
 42176.58,
 51357.773,
 64974.957,
 66327.234,
 66892.61,
 70023.54,
 76069.516,
 88588.19,
 91563.49,
 105960.625,
 118300.68,
 164015.86,
 190076.92,
 197379.97,
 200000.0,
 254866.55,
 274120.66,
 300000.0,
 308737.56,
 312922.72,
 331274.06,
 430510.5,
 434160.84,
 438125.5,
 473062.9,
 567642.9,
 587291.06,
 606900.06,
 610744.75,
 628142.06,
 653816.25,
 678753.0,
 699327.7,
 711692.9,
 730955.3,
 826669.1,
 897242.0,
 935929.25,
 1155814.1,
 1290127.0,
 1368174.5,
 1393299.1,
 1397361.8,
 1425579.9,
 1432492.1,
 1454100.4,
 1519529.4,
 1524202.9,
 1600000.0,
 1603226.5,
 1625040.5,
 1696029.6,
 2037632.8,
 2062911.8,
 2122893.2,
 2159746.0,
 2206298.8,
 2374194.2,
 2392479.2,
 2492903.0,
 2555766.0,
 2659240.8,
 2698456.8,
 2749115.2,
 2971911.0,
 3000000.0,
 3215571.8,
 3400000.0,
 3402614.5,
 3436993.5,
 3444014.5,
 3548127.0,
 3631125.5,
 3663383.0,
 3721479.0,
 3974248.0,
 4022700

In [78]:
len(np.unique(total_df['0943b38f3'].astype(np.float32).astype(np.int32)))

193

In [75]:
np.array(197929.4274814301).astype(np.float32)

array(197929.42, dtype=float32)

In [65]:
total_df['0943b38f3'].describe()

count    5.380100e+04
mean     5.924051e+04
std      2.395554e+06
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.463092e+08
Name: 0943b38f3, dtype: float64

In [66]:
total_df['0943b38f3'].astype(np.float32).describe()

count    5.380100e+04
mean     5.924049e+04
std      2.395264e+06
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.463092e+08
Name: 0943b38f3, dtype: float64

In [24]:
len(np.unique(df_train.target))

1413

In [32]:
len(np.unique(df_train.target.astype(np.int32)))

1374

In [35]:
df_train['target'].astype(np.int32).describe()

count    4.459000e+03
mean     5.944923e+06
std      8.234312e+06
min      3.000000e+04
25%      6.000000e+05
50%      2.260000e+06
75%      8.000000e+06
max      4.000000e+07
Name: target, dtype: float64

In [38]:

len(np.unique(total_df['48df886f9']))

287

In [42]:
len(np.unique(total_df['48df886f9'].astype(np.float32)))

287

In [43]:
np.unique(total_df['48df886f9'].astype(np.float32))[:11]

array([   0.     ,  401.95795, 1026.8024 , 1051.4142 , 1658.271  ,
       1754.9834 , 1932.4814 , 3261.9946 , 3283.7773 , 3363.9941 ,
       4572.558  ], dtype=float32)

In [44]:
np.unique(total_df['48df886f9'].astype(np.int32))[:11]

array([   0,  401, 1026, 1051, 1658, 1754, 1932, 3261, 3283, 3363, 4572])

In [89]:
%%time
# Go through the columns one at a time (can't do it all at once for this dataset)
total_df_orig = deepcopy(total_df) 
total_df_all = deepcopy(total_df)              
for col in total_df.columns:
    
    # Detect outliers in this column
    data = total_df[col].values
    data_mean, data_std = np.mean(data), np.std(data)
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    outliers = [x for x in data if x < lower or x > upper]
    
    # If there are crazy high values, do a log-transform
    if len(outliers) > 0:
        non_zero_idx = data != 0
        total_df.loc[non_zero_idx, col] = np.log(data[non_zero_idx])
    
    # Scale non-zero column values
    nonzero_rows = total_df[col] != 0
    total_df.loc[nonzero_rows, col] = minmax_scale(total_df.loc[nonzero_rows, col])
    
    # Scale all column values
    total_df_all[col] = minmax_scale(total_df_all[col])
    gc.collect()

In [90]:
total_df_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float32(4720), float64(10)
memory usage: 973.2 MB


In [91]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float32(4720), float64(10)
memory usage: 973.2 MB


In [92]:
total_df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 49341
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float32(4720), float64(10)
memory usage: 973.2 MB


In [125]:
df_train['target'].shape

(4459,)

In [102]:
list(np.unique((total_df['0345f70f2']*100000).astype(np.uint16)))

[0,
 623,
 698,
 1257,
 1706,
 1713,
 1793,
 1810,
 2050,
 2236,
 2307,
 2434,
 2963,
 3061,
 3087,
 3268,
 3647,
 3745,
 3848,
 4234,
 4274,
 4756,
 4925,
 5982,
 6210,
 6250,
 6444,
 7082,
 7211,
 7394,
 7742,
 7857,
 7976,
 8632,
 8741,
 8917,
 9066,
 9208,
 9320,
 9395,
 10394,
 10532,
 10646,
 10744,
 10771,
 11551,
 11649,
 11861,
 11863,
 12098,
 12456,
 12654,
 12967,
 13142,
 13152,
 13211,
 13683,
 14396,
 14598,
 14635,
 14758,
 14815,
 14991,
 15014,
 16865,
 16962,
 17057,
 17229,
 17510,
 17657,
 18007,
 18087,
 18128,
 18206,
 18747,
 18804,
 18846,
 19129,
 19235,
 19241,
 19846,
 19854,
 19885,
 20435,
 21604,
 21827,
 21948,
 21954,
 22003,
 22120,
 22208,
 22276,
 22596,
 23413,
 24878,
 24948,
 24964,
 26107,
 26433,
 26764,
 28062,
 28265,
 28474,
 28714,
 28736,
 31440,
 31892,
 33478,
 34464,
 35372,
 35538,
 35606,
 35975,
 36642,
 37868,
 38135,
 39311,
 40198,
 42829,
 44016,
 44320,
 44426,
 46379,
 46965,
 47691,
 47923,
 48031,
 48490,
 50600,
 50668,
 5074

In [104]:
# Train and test
# Positional row ranges into total_df: the combined frame was built by stacking
# the training rows first and the test rows after them, so the first
# len(df_train) positions are train and the remaining ones are test.
train_idx = range(0, len(df_train))
test_idx = range(len(df_train), len(total_df))

In [116]:
# Persist the untouched feature frame together with the target and the
# positional train/test ranges.
unscaled_data_store = './unscaled_data_store.pkl'
print( "Saving unscaled data...")
unscaled_abs_path = os.path.abspath(unscaled_data_store)
with open(unscaled_abs_path, 'wb') as f:
    unscaled_payload = (total_df_orig, df_train['target'], train_idx, test_idx)
    pickle.dump(unscaled_payload, f, protocol=pickle.HIGHEST_PROTOCOL)
    print('Saved to', unscaled_abs_path)


Saving unscaled data...
Saved to C:\santander-value-prediction-challenge\leonid\unscaled_data_store.pkl


In [115]:
# Persist the fully min-max scaled feature frame together with the target and
# the positional train/test ranges.
minmaxscaled_data_store = './minmaxscaled_data_store.pkl'
print( "Saving minmax_scaled data...")
minmaxscaled_abs_path = os.path.abspath(minmaxscaled_data_store)
with open(minmaxscaled_abs_path, 'wb') as f:
    minmaxscaled_payload = (total_df_all, df_train['target'], train_idx, test_idx)
    pickle.dump(minmaxscaled_payload, f, protocol=pickle.HIGHEST_PROTOCOL)
    print('Saved to', minmaxscaled_abs_path)

Saving minmax_scaled data...
Saved to C:\santander-value-prediction-challenge\leonid\minmaxscaled_data_store.pkl


In [127]:
# Persist the log-transformed, non-zero min-max scaled feature frame together
# with the target and the positional train/test ranges.
minmaxscaled_logtrans_data_store = './minmaxscaled_logtrans_data_store.pkl'
print( "Saving minmax_scaled data...")
logtrans_abs_path = os.path.abspath(minmaxscaled_logtrans_data_store)
with open(logtrans_abs_path, 'wb') as f:
    logtrans_payload = (total_df, df_train['target'], train_idx, test_idx)
    pickle.dump(logtrans_payload, f, protocol=pickle.HIGHEST_PROTOCOL)
    print('Saved to', logtrans_abs_path)

Saving minmax_scaled data...
Saved to C:\santander-value-prediction-challenge\leonid\minmaxscaled_logtrans_data_store.pkl


In [131]:
# Loading from pickle example
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code embedded in the file.
minmaxscaled_logtrans_data_store = './minmaxscaled_logtrans_data_store.pkl'
if os.path.isfile(minmaxscaled_logtrans_data_store):
    print("loading data from pickle file", minmaxscaled_logtrans_data_store)
    with open(os.path.abspath(minmaxscaled_logtrans_data_store), 'rb') as f:
        total_df, Y, train_idx_rng, test_idx_rng = pickle.load(f, encoding='bytes')
        print('total_df:', type(total_df), total_df.shape)
        print('Y:', type(Y), Y.shape)
        # Report both positional ranges in the same format.
        for label, rng in (('train_idx_rng:', train_idx_rng),
                           ('test_idx_rng:', test_idx_rng)):
            print(label, type(rng), 'start:', rng.start,
                  'stop:', rng.stop, 'step:', rng.step)

loading data from pickle file ./minmaxscaled_logtrans_data_store.pkl
total_df: <class 'pandas.core.frame.DataFrame'> (53801, 4730)
Y: <class 'pandas.core.series.Series'> (4459,)
train_idx_rng: <class 'range'> start: 0 stop: 4459 step: 1
test_idx_rng: <class 'range'> start: 4459 stop: 53801 step: 1
