In [1]:
# basic set of imports
import re, operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

from IPython.display import display, HTML
import datetime, pickle
import time
import tables
%matplotlib inline

In [2]:
# Base names (without extension) of the CSV files processed by the cells below.
datasets = [
    "test",
    "train",
]

In [None]:

# Pass 1 over the raw CSVs: work out, per column, the narrowest dtype that can
# hold every value seen in every chunk of every dataset, and collect the raw
# strings that cannot be parsed as datetimes.
#
# Results consumed by later cells:
#   num_types                - candidate dtype names, cheapest first
#   dtypes_dict              - column -> index into num_types
#   datetime_col_errors_dict - column -> list of distinct unparseable values

num_types = ['uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32', 'uint64', 'int64','float32', 'float64','object']
# 'bool' - rhdf5 has problems translating bools from hdf5 files

first_float_idx = num_types.index('float32')
object_idx = num_types.index('object')

# Integers with magnitude up to 2**24 are all exactly representable in float32.
float32_exact_int_limit = 2 ** 24

dtypes_dict = dict()
# Starting index for a column, keyed by the dtype pandas inferred for its chunk:
# int64 -> 'uint8' (0), float64 -> 'float32' (8), object -> 'object' (10).
init_type = {np.dtype('int64'):num_types.index('uint8'),
             np.dtype('float64'):num_types.index('float32'),
             np.dtype('object'):num_types.index('object')}

datetime_col_errors_dict = dict()

# column -> (min, max) over every int64 chunk seen so far. The integer
# candidates are not nested (uint8 does not fit inside int8), so a type must be
# validated against the whole history of the column, not only the current chunk.
int_ranges = dict()

for dataset in datasets:

    csv_chunks = pd.read_csv(r"..\expedia_data\{0}.csv".format(dataset), chunksize=50000 )

    for count, chunk in enumerate(csv_chunks, start=1):
        print("chunk: ",count)
        for col in chunk.columns:
            col_dtype = chunk[col].dtype

            # Reject unsupported dtypes up front so the intended error is raised
            # (previously the init_type lookup failed first with a KeyError).
            if col_dtype not in init_type:
                raise ValueError('chunk[col] not equal to float64, int64, object')

            if col not in dtypes_dict:
                dtypes_dict[col] = init_type[col_dtype]

            # object - all assumed to be datetime
            if col_dtype == np.dtype('object'):
                # A column that holds strings in any chunk has to be read as object,
                # even if earlier chunks happened to look numeric.
                dtypes_dict[col] = object_idx

                # Try the fast vectorised parse first; only when it fails look for
                # the offending values one by one.
                try:
                    pd.to_datetime(chunk[col])
                except ValueError:
                    # Missing values never fail to parse and duplicates add nothing,
                    # so only the distinct non-null values need to be tested.
                    for el in chunk[col].dropna().unique():
                        try:
                            pd.to_datetime(el)
                        except ValueError:
                            if el not in datetime_col_errors_dict.setdefault(col, []):
                                datetime_col_errors_dict[col].append(el)
                # The column is deliberately not converted here; conversion happens
                # when the file is re-read with the final dtypes.
                # TODO: warn if fewer than 20% of the dates parse?
                continue

            # float64 and int64
            values = chunk[col].dropna()

            if col_dtype == np.dtype('int64'):
                if len(values) > 0:
                    lo, hi = int(values.min()), int(values.max())
                    if col in int_ranges:
                        lo = min(lo, int_ranges[col][0])
                        hi = max(hi, int_ranges[col][1])
                    int_ranges[col] = (lo, hi)
            elif dtypes_dict[col] < first_float_idx:
                # float64 chunk for a column that was integer so far (NaNs or
                # decimals appeared): an integer dtype can no longer be used.
                dtypes_dict[col] = first_float_idx

            if dtypes_dict[col] < first_float_idx:
                # Still an integer column: widen until the running range fits.
                if col in int_ranges:
                    lo, hi = int_ranges[col]
                    while dtypes_dict[col] < first_float_idx:
                        limits = np.iinfo(np.dtype(num_types[dtypes_dict[col]]))
                        if limits.min <= lo and hi <= limits.max:
                            break
                        dtypes_dict[col] += 1
            else:
                # Integers from earlier chunks must survive the float32 round trip too.
                if dtypes_dict[col] == first_float_idx and col in int_ranges:
                    lo, hi = int_ranges[col]
                    if max(abs(lo), abs(hi)) > float32_exact_int_limit:
                        dtypes_dict[col] += 1

                # Widen until casting this chunk to the candidate type is lossless.
                while dtypes_dict[col] < object_idx:
                    candidate = num_types[dtypes_dict[col]]
                    if (values.astype(candidate) == values).all():
                        break
                    dtypes_dict[col] += 1
                



chunk:  1
chunk:  2
chunk:  3
chunk:  4
chunk:  5
chunk:  6
chunk:  7
chunk:  8
chunk:  9
chunk:  10
chunk:  11
chunk:  12
chunk:  13
chunk:  14
chunk:  15
chunk:  16
chunk:  17
chunk:  18
chunk:  19
chunk:  20
chunk:  21
chunk:  22
chunk:  23
chunk:  24
chunk:  25
chunk:  26
chunk:  27
chunk:  28
chunk:  29
chunk:  30
chunk:  31
chunk:  32
chunk:  33
chunk:  34
chunk:  35
chunk:  36
chunk:  37
chunk:  38
chunk:  39
chunk:  40
chunk:  41
chunk:  42
chunk:  43
chunk:  44
chunk:  45
chunk:  46
chunk:  47
chunk:  48
chunk:  49
chunk:  50
chunk:  51
chunk:  1
chunk:  2
chunk:  3
chunk:  4
chunk:  5
chunk:  6
chunk:  7
chunk:  8
chunk:  9
chunk:  10
chunk:  11
chunk:  12
chunk:  13
chunk:  14
chunk:  15
chunk:  16
chunk:  17
chunk:  18
chunk:  19
chunk:  20
chunk:  21
chunk:  22
chunk:  23
chunk:  24
chunk:  25
chunk:  26
chunk:  27
chunk:  28
chunk:  29
chunk:  30
chunk:  31
chunk:  32
chunk:  33
chunk:  34
chunk:  35
chunk:  36
chunk:  37
chunk:  38
chunk:  39
chunk:  40
chunk:  41
chunk:

In [None]:
# Pass 2 over the raw CSVs: re-read each file with the dtypes found in pass 1,
# parse the date columns, and cache the result as a fixed-format HDF5 file.

# Both lookups depend only on pass 1, so build them once for all datasets.
dtypes_dict_str = {key: num_types[value] for (key, value) in dtypes_dict.items()}
# Columns left as 'object' are the ones assumed to hold datetimes.
date_columns = [key for key, value in dtypes_dict_str.items() if value == 'object']

for dataset in datasets:
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    start = time.perf_counter()

    csv_chunks = pd.read_csv(r"..\expedia_data\{0}.csv".format(dataset),
                             chunksize=500000,
                             dtype = dtypes_dict_str,
                             parse_dates= date_columns,
                             infer_datetime_format=True,
                             na_values= datetime_col_errors_dict)

    # Collect the chunks and concatenate once: growing a frame with
    # DataFrame.append copies it on every iteration (and append no longer
    # exists in pandas 2.x).
    chunks = []
    for count, chunk in enumerate(csv_chunks):
        print("chunk: ",count)
        chunks.append(chunk)

    df = pd.concat(chunks, ignore_index=True)
    del chunks
    df.info(memory_usage='deep')

    end = time.perf_counter()
    print(end - start)

    print(df.dtypes)

    # NOTE(review): the HDF key is 'train' for the test file as well. It is kept
    # unchanged because readers outside this notebook may rely on it - confirm
    # before renaming it to the dataset name.
    df.to_hdf(r"..\expedia_data\{0}_as_hdf.h5".format(dataset), key='train', format='fixed', mode='w')

    # Release the frame before the next (large) dataset is loaded.
    del df

In [None]:
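# Optional, disabled: write a bzip2-compressed (complevel=9) copy of a frame.
# NOTE: `df` is deleted at the end of the conversion loop above, so a frame must be loaded first.
# filename = 'train_as_hdf_smaller'
# df.to_hdf(r"..\expedia_data\{0}.h5".format(filename),'train',format='fixed',mode='w', complevel=9,complib='bzip2')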

In [None]:

# Reload the cached training frame from HDF5 and report its deep memory usage.
filename = 'train_as_hdf'
train_hdf_path = r"..\expedia_data\{0}.h5".format(filename)
train = pd.read_hdf(train_hdf_path)
train.info(memory_usage='deep')

In [None]:

# Reload the cached test frame from HDF5 and report its deep memory usage.
filename = 'test_as_hdf'
test_hdf_path = r"..\expedia_data\{0}.h5".format(filename)
test = pd.read_hdf(test_hdf_path)
test.info(memory_usage='deep')

In [None]:
# Keep only the rows whose is_booking flag is set.
is_booking_row = train['is_booking'] == True
train_bookings_only = train[is_booking_row]

In [None]:
# Number of booking rows that survived the filter.
train_bookings_only.shape[0]

In [None]:
# Persist the bookings-only subset as CSV, without the index column.
filename = 'train_bookings_only'
bookings_csv_path = r"..\expedia_data\{0}.csv".format(filename)
train_bookings_only.to_csv(bookings_csv_path, index=False)

In [None]:
# Read the bookings-only CSV back with default dtypes and report its deep memory usage.
filename = 'train_bookings_only'
bookings_csv_path = r"..\expedia_data\{0}.csv".format(filename)
train_bookings_only_test = pd.read_csv(bookings_csv_path)
train_bookings_only_test.info(memory_usage='deep')

In [None]:
import gc

# Force a full garbage collection. The call parentheses are required: a bare
# `gc.collect` only references the function and frees nothing.
gc.collect()