# Convert categorical to numeric

In [4]:
import os
import time
import pickle
import datetime


import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, vstack

import c

%matplotlib inline

## Covert categorical data to numbers
(Saves approx 25% memory usage)

In [3]:
# Read all data
a=time.time()
b=pd.read_csv(os.path.join(c.BASE_PATH,'test_categorical.csv'), dtype='object')
print time.time()-a

375.940962076


In [6]:
# Batch wise stripping of elements
i_s = [0, 500, 1000, 1500, 2000]
i_e = [500, 1000, 1500, 2000, 2141]

for s,e in zip(i_s, i_e):
    print s, e
    #a=time.time()
    b.iloc[:,s:e]=b.iloc[:,s:e].apply(lambda x: x.str.lstrip('T'), axis=0)
    #print (time.time()-a)
    #print (time.time()-a)/500

0 500
100.53592205
0.201073518276
500 1000
105.001366854
0.210008617878
1000 1500
100.97073698
0.201954011917
1500 2000
100.874717951
0.20175117588
2000 2141
27.5970749855
0.0551967420578


In [7]:
b.to_csv(os.path.join(c.BASE_PATH,'test_categorical_to_num.csv'), index=False)

In [9]:
del b

## Convert to sparse and check memory

In [None]:
def memory_usage(var):
    ''' Returns estimate of memory usage using pickle trick '''
    tmp_file_path = os.path.join(c.BASE_PATH, 'tmp_dump_file.pkl')
    
    with open(tmp_file_path,'wb') as f:
        pickle.dump(var, f, pickle.HIGHEST_PROTOCOL)
        
    file_size = os.path.getsize(tmp_file_path)/1024/1024
    os.remove(tmp_file_path)
    
    return file_size

def read_csv_as_sparse(csv_file, chuncksize=10000, verbose=False):
    ''' Read csv and return sparse matrix'''
    
    print('Reading {} with size {} MB'.format(csv_file, os.path.getsize(csv_file)/1024/1024))
    
    # Check for columns to read
    quick_scan = pd.read_csv(csv_file,
                         nrows=1, 
                         dtype=np.float32,
                         index_col = 0)
    
    if 'Response' in quick_scan.columns:
        if verbose:
            print('Excluding Response column')
        columns_names_used = quick_scan.columns[0:quick_scan.shape[1]-1]
    else:
        columns_names_used = quick_scan.columns[0:quick_scan.shape[1]]
    
    
    reader = pd.read_csv(csv_file,
                         chunksize=chuncksize, 
                         dtype=np.float32,
                         index_col=0,
                         usecols=columns_names_used)
    
    
    ids = pd.read_csv(csv_file, usecols=[0])
    
    if verbose:
        for i,ch in enumerate(reader):
            print ch.columns
            
            if i==0:
                csr = csr_matrix(ch.fillna(0))
            else:
                csr = vstack([csr, csr_matrix(ch.fillna(0))], format='csr')

            if not i % 10:
                print('Doing chunck: {} | status: {} elements'.format(i, csr.getnnz()))
    else:
        csr = vstack([csr_matrix(ch.fillna(0)) for ch in reader], format='csr')
    
    print('Sparse matrix has {} elements and {} MB memory usage'.format(csr.getnnz(),
                                                                       memory_usage(csr)))
    
    return csr, ids, columns_names_used

def read_last_column(csv_file):
    ''' Reads last column in csv file'''
    sample = pd.read_csv(os.path.join(c.BASE_PATH,csv_file), nrows=1)
    
    return pd.read_csv(os.path.join(c.BASE_PATH,csv_file), usecols=[0,sample.shape[1]-1], index_col=0)

def convert_data_file_to_pickle(filename, verbose=False):
    ''' Reads full csv file, converts to sparse and stores using pickle with metadata'''
    a = time.time()
    csr, ids, f_names = read_csv_as_sparse(os.path.join(c.BASE_PATH, filename + '.csv'), 
                                  chuncksize=100000,
                                  verbose=verbose)
    print('Reading data as sparse took {}s'.format(time.time()-a))
    
    y = read_last_column(os.path.join(c.BASE_PATH, 'train_numeric.csv'))
    
    output = {'data': {'ids': ids, 'y': y, 'features': csr, 'feature_names': f_names},
              'creation_date':datetime.datetime.now(),
              'created_by': 'joostgp',
              'script': 'convert_data_to_sparse'}
    
    save_path = os.path.join(c.BASE_PATH,filename + '.pkl')
    
    with open(save_path,'wb') as f:
        pickle.dump(output, f, pickle.HIGHEST_PROTOCOL)
        
    print('Results stored in {}'.format(save_path))
    

In [30]:
data_files = ['train_categorical_to_num',
              'train_numeric',
              'train_date',
              'test_categorical_to_num',
              'test_numeric',
              'test_date']

In [None]:
for f in data_files:
     convert_data_file_to_pickle(f, verbose=False)

Reading /Volumes/My Book/kaggle_bosch/train_categorical_to_num.csv with size 2489 MB


In [None]:
'''
Reading /Volumes/My Book/kaggle_bosch/train_categorical_to_num.csv with size 2489 MB
Sparse matrix has 67650912 elements and 520 MB memory usage
Reading data as sparse took 300.818804979s
Results stored in /Volumes/My Book/kaggle_bosch/train_categorical_to_num.pkl
Reading /Volumes/My Book/kaggle_bosch/train_numeric.csv with size 2040 MB
Sparse matrix has 173670587 elements and 1329 MB memory usage
Reading data as sparse took 139.659316063s
Results stored in /Volumes/My Book/kaggle_bosch/train_numeric.pkl
Reading /Volumes/My Book/kaggle_bosch/train_date.csv with size 2759 MB
Sparse matrix has 242306507 elements and 1853 MB memory usage
Reading data as sparse took 205.178919077s
Results stored in /Volumes/My Book/kaggle_bosch/train_date.pkl
Reading /Volumes/My Book/kaggle_bosch/test_categorical_to_num.csv with size 2489 MB
Sparse matrix has 67615904 elements and 520 MB memory usage
Reading data as sparse took 385.804188967s
Results stored in /Volumes/My Book/kaggle_bosch/test_categorical_to_num.pkl
Reading /Volumes/My Book/kaggle_bosch/test_numeric.csv with size 2038 MB
Sparse matrix has 173621481 elements and 1329 MB memory usage
Reading data as sparse took 138.873883963s
Results stored in /Volumes/My Book/kaggle_bosch/test_numeric.pkl
Reading /Volumes/My Book/kaggle_bosch/test_date.csv with size 2759 MB
Sparse matrix has 242237132 elements and 1852 MB memory usage
Reading data as sparse took 230.876074076s
Results stored in /Volumes/My Book/kaggle_bosch/test_date.pkl
'''