In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc

%matplotlib inline

### Information about the data set

In [2]:
# Read only the first 20 rows of each training file: that is enough to learn
# the column layout without loading the full data set.
categorical = pd.read_csv('./data/train_categorical.csv', nrows=20)
n_categorical_cols = categorical.shape[1]
print("Categorical Data Size: ", n_categorical_cols, 'columns')
print(categorical.columns, '\n')

numerical = pd.read_csv('./data/train_numeric.csv', nrows=20)
n_numerical_cols = numerical.shape[1]
print("Numerical Data Size: ", n_numerical_cols, 'columns')
print(numerical.columns, '\n')

date = pd.read_csv('./data/train_date.csv', nrows=20)
n_date_cols = date.shape[1]
print("Date Data Size: ", n_date_cols, 'columns')
print(date.columns, '\n')

# Each file carries its own 'Id' column; count it once, hence the "- 2".
print('Total Columns: ', n_categorical_cols + n_numerical_cols + n_date_cols - 2)

Categorical Data Size:  2141 columns
Index(['Id', 'L0_S1_F25', 'L0_S1_F27', 'L0_S1_F29', 'L0_S1_F31', 'L0_S2_F33',
       'L0_S2_F35', 'L0_S2_F37', 'L0_S2_F39', 'L0_S2_F41',
       ...
       'L3_S49_F4225', 'L3_S49_F4227', 'L3_S49_F4229', 'L3_S49_F4230',
       'L3_S49_F4232', 'L3_S49_F4234', 'L3_S49_F4235', 'L3_S49_F4237',
       'L3_S49_F4239', 'L3_S49_F4240'],
      dtype='object', length=2141) 

Numerical Data Size:  970 columns
Index(['Id', 'L0_S0_F0', 'L0_S0_F2', 'L0_S0_F4', 'L0_S0_F6', 'L0_S0_F8',
       'L0_S0_F10', 'L0_S0_F12', 'L0_S0_F14', 'L0_S0_F16',
       ...
       'L3_S50_F4245', 'L3_S50_F4247', 'L3_S50_F4249', 'L3_S50_F4251',
       'L3_S50_F4253', 'L3_S51_F4256', 'L3_S51_F4258', 'L3_S51_F4260',
       'L3_S51_F4262', 'Response'],
      dtype='object', length=970) 

Date Data Size:  1157 columns
Index(['Id', 'L0_S0_D1', 'L0_S0_D3', 'L0_S0_D5', 'L0_S0_D7', 'L0_S0_D9',
       'L0_S0_D11', 'L0_S0_D13', 'L0_S0_D15', 'L0_S0_D17',
       ...
       'L3_S50_D4246', 'L3_S50_D

### Load the data in chunks

In [2]:
def get_data(chunksize, num_path, cat_path, date_path):
    """Load the three training CSV files chunk by chunk, split by label.

    The numeric, categorical and date files are read in lock-step,
    ``chunksize`` rows at a time. For every chunk the ``Id`` columns of the
    categorical and date parts are dropped (the numeric part keeps its own),
    the three parts are joined column-wise, and the rows are split on the
    ``Response`` column.

    Parameters
    ----------
    chunksize : int
        Number of rows read from each file per iteration.
    num_path, cat_path, date_path : str
        Paths of the numeric, categorical and date CSV files. The combined
        frame must contain a ``Response`` column, and the categorical and
        date files must each contain an ``Id`` column.

    Returns
    -------
    (positive, negative) : tuple of pandas.DataFrame
        Rows with ``Response == 1`` and rows with ``Response == 0``. Rows
        whose ``Response`` has any other value appear in neither frame.
        Both are empty DataFrames if the readers yield no chunks.
    """
    readers = []
    pos_parts, neg_parts = [], []
    pos_rows = neg_rows = 0
    try:
        # Open inside the try block so that readers which were already opened
        # get closed even if a later file fails to open.
        for path in (num_path, cat_path, date_path):
            readers.append(pd.read_csv(path, chunksize=chunksize, low_memory=False))

        # NOTE: zip stops at the shortest reader, so the three files are
        # assumed to hold the same rows in the same order.
        for numeric, categorical, date in zip(*readers):
            # combine the three feature groups side by side
            data = pd.concat(
                [numeric, categorical.drop(columns='Id'), date.drop(columns='Id')],
                axis=1,
            )
            # positive data and negative data
            pos_data = data[data['Response'] == 1]
            neg_data = data[data['Response'] == 0]

            # Collect the pieces and concatenate once at the end; growing a
            # DataFrame with concat on every iteration recopies all rows
            # accumulated so far.
            pos_parts.append(pos_data)
            neg_parts.append(neg_data)
            pos_rows += len(pos_data)
            neg_rows += len(neg_data)

            # show current progress: chunk shape and running total per class
            n_cols = data.shape[1]
            print(pos_data.shape, (pos_rows, n_cols), neg_data.shape, (neg_rows, n_cols))
    finally:
        # release the underlying file handles
        for reader in readers:
            reader.close()

    positive = pd.concat(pos_parts) if pos_parts else pd.DataFrame()
    negative = pd.concat(neg_parts) if neg_parts else pd.DataFrame()
    return positive, negative

In [None]:
# Stream the three training files 10,000 rows at a time and keep the
# positive (Response == 1) and negative (Response == 0) rows separately.
train_pos, train_neg = get_data(
    chunksize=10000,
    num_path='./data/train_numeric.csv',
    cat_path='./data/train_categorical.csv',
    date_path='./data/train_date.csv',
)

(53, 4266) (53, 4266) (9947, 4266) (9947, 4266)
(59, 4266) (112, 4266) (9941, 4266) (19888, 4266)
(57, 4266) (169, 4266) (9943, 4266) (29831, 4266)
(50, 4266) (219, 4266) (9950, 4266) (39781, 4266)
(52, 4266) (271, 4266) (9948, 4266) (49729, 4266)
(52, 4266) (323, 4266) (9948, 4266) (59677, 4266)
(61, 4266) (384, 4266) (9939, 4266) (69616, 4266)
(59, 4266) (443, 4266) (9941, 4266) (79557, 4266)
(71, 4266) (514, 4266) (9929, 4266) (89486, 4266)
(54, 4266) (568, 4266) (9946, 4266) (99432, 4266)
(59, 4266) (627, 4266) (9941, 4266) (109373, 4266)
(53, 4266) (680, 4266) (9947, 4266) (119320, 4266)
(38, 4266) (718, 4266) (9962, 4266) (129282, 4266)
(62, 4266) (780, 4266) (9938, 4266) (139220, 4266)
(55, 4266) (835, 4266) (9945, 4266) (149165, 4266)
(48, 4266) (883, 4266) (9952, 4266) (159117, 4266)
(75, 4266) (958, 4266) (9925, 4266) (169042, 4266)
(68, 4266) (1026, 4266) (9932, 4266) (178974, 4266)
(51, 4266) (1077, 4266) (9949, 4266) (188923, 4266)
(52, 4266) (1129, 4266) (9948, 4266) (198