# Practice Optimizing Dataframes and Processing in Chunks

This project will use the loans data from [Lending Club](https://www.lendingclub.com/), approved from 2007 to 2011.

In [1]:
import pandas as pd
import numpy as np
# Allow up to 99 columns to be displayed so the wide loans table is not truncated.
pd.set_option('display.max_columns', 99)

# Read just the first three rows: enough to inspect the column layout cheaply.
preview = pd.read_csv('loans_2007.csv', nrows=3)
preview

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,,10+ years,RENT,24000.0,Verified,Dec-2011,Fully Paid,n,credit_card,Computer,860xx,AZ,27.65,0.0,Jan-1985,1.0,3.0,0.0,13648.0,83.7%,9.0,f,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,Jan-2015,171.62,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,Dec-2011,Charged Off,n,car,bike,309xx,GA,1.0,0.0,Apr-1999,5.0,3.0,0.0,1687.0,9.4%,4.0,f,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,Apr-2013,119.66,Sep-2013,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,,10+ years,RENT,12252.0,Not Verified,Dec-2011,Fully Paid,n,small_business,real estate business,606xx,IL,8.72,0.0,Nov-2001,2.0,2.0,0.0,2956.0,98.5%,10.0,f,0.0,0.0,3005.666844,3005.67,2400.0,605.67,0.0,0.0,0.0,Jun-2014,649.91,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


## Decide the size of chunk

Since the entire data set takes about 67 MB of memory and we only have 10 MB of memory available, we have to read the data in chunks.

First of all, we will investigate the memory usage for 1000 rows.

In [2]:
# Deep memory footprint, in MB, of the first 1000 rows
# (deep=True also counts the contents of object/string columns).
chunk_1000 = pd.read_csv('loans_2007.csv', nrows=1000)
bytes_used = chunk_1000.memory_usage(deep=True).sum()
bytes_used / (1024 ** 2)

1.5502548217773438

1000 rows use around 1.55 MB (rounded up to 1.6 MB in the next cell). Our goal is to choose a number of rows per chunk that keeps the memory usage of each chunk under 5 MB.

In [3]:
# Rows that fit into the 5 MB target, assuming roughly 1.6 MB per 1000 rows
target_mb, mb_per_1000_rows = 5, 1.6
target_mb / mb_per_1000_rows * 1000

3125.0

We will use 3000 rows for each chunk and check if all chunks are under 5MB.

In [4]:
# Stream the file 3000 rows at a time and report the deep memory
# footprint (in MB) of every chunk, followed by the sum over all chunks.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
bytes_per_mb = 1024 ** 2

print('Chunk size:\n')
total_size = 0
for chunk in chunk_iter:
    size = chunk.memory_usage(deep=True).sum() / bytes_per_mb
    print(size)
    total_size += size

print('\nTotal size: ', total_size, 'MB.')

Chunk size:

4.649059295654297
4.644805908203125
4.646563529968262
4.647915840148926
4.644108772277832
4.645991325378418
4.644582748413086
4.646951675415039
4.645077705383301
4.64512825012207
4.657840728759766
4.656707763671875
4.663515090942383
4.896956443786621
0.880854606628418

Total size:  66.21605968475342 MB.


## Chunk info

In [5]:
# Count the rows by streaming the file chunk by chunk, so the whole
# data set never has to be held in memory at once.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
nrow = sum(len(chunk) for chunk in chunk_iter)

print('Number of rows: ', nrow)

Number of rows:  42538


In [6]:
# For every chunk, record how many columns were parsed as numeric and how
# many as object, and collect the names seen under each of the two dtypes.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)

num_col, obj_col = [], []
num_header, obj_header = set(), set()

for chunk in chunk_iter:
    numeric_part = chunk.select_dtypes(include=np.number)
    object_part = chunk.select_dtypes(include='object')

    num_col.append(numeric_part.shape[1])
    obj_col.append(object_part.shape[1])

    num_header |= set(numeric_part.columns)
    obj_header |= set(object_part.columns)

print('Numerical columns in each chunk:\n', num_col, '\n')
print('String columns in each chunk:\n', obj_col, '\n')

# A name present in both sets was parsed as numeric in some chunks
# and as object in others.
mixed_cols = num_header & obj_header
print('{} is string instead of numerical in the last 2 chunks.'.format(mixed_cols))

Numerical columns in each chunk:
 [31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30] 

String columns in each chunk:
 [21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22] 

{'id'} is string instead of numerical in the last 2 chunks.


In [7]:
# Number of unique value in each column
unique_count = {}
unique_less_than_50_perc = []

for col in obj_header:
    col_unique = set()
    chunk_iter = pd.read_csv('loans_2007.csv', chunksize = 3000)
    for chunk in chunk_iter:
        col_unique = col_unique.union(
            set(chunk[col].unique())
        )
    
    if len(col_unique) < nrow/2:
        unique_less_than_50_perc.append(col)
        
    unique_count[col] = len(col_unique)
    
unique_count

{'emp_length': 12,
 'sub_grade': 36,
 'last_credit_pull_d': 109,
 'purpose': 15,
 'issue_d': 56,
 'zip_code': 838,
 'term': 3,
 'emp_title': 30659,
 'initial_list_status': 2,
 'home_ownership': 6,
 'int_rate': 395,
 'application_type': 2,
 'grade': 8,
 'verification_status': 4,
 'loan_status': 10,
 'revol_util': 1120,
 'title': 21265,
 'pymnt_plan': 3,
 'addr_state': 51,
 'last_pymnt_d': 104,
 'earliest_cr_line': 531,
 'id': 42538}

In [8]:
# Object columns whose number of distinct values is below half the row count
unique_less_than_50_perc

['emp_length',
 'sub_grade',
 'last_credit_pull_d',
 'purpose',
 'issue_d',
 'zip_code',
 'term',
 'initial_list_status',
 'home_ownership',
 'int_rate',
 'application_type',
 'grade',
 'verification_status',
 'loan_status',
 'revol_util',
 'title',
 'pymnt_plan',
 'addr_state',
 'last_pymnt_d',
 'earliest_cr_line']

In [9]:
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)

# One Series of per-column NaN counts for each chunk (float columns only)
float_na = [
    chunk.select_dtypes(include='float').isna().sum()
    for chunk in chunk_iter
]

# Stack the per-chunk counts, then add them up per column name
float_na_all = pd.concat(float_na)
float_na_all.groupby(float_na_all.index).sum().sort_values()

member_id                        3
total_rec_int                    3
total_pymnt_inv                  3
total_pymnt                      3
revol_bal                        3
recoveries                       3
policy_code                      3
out_prncp_inv                    3
out_prncp                        3
total_rec_late_fee               3
loan_amnt                        3
last_pymnt_amnt                  3
total_rec_prncp                  3
funded_amnt_inv                  3
funded_amnt                      3
dti                              3
collection_recovery_fee          3
installment                      3
annual_inc                       7
inq_last_6mths                  32
total_acc                       32
delinq_2yrs                     32
pub_rec                         32
delinq_amnt                     32
open_acc                        32
acc_now_delinq                  32
tax_liens                      108
collections_12_mths_ex_med     148
chargeoff_within_12_

All `float` columns have at least 1 missing value, therefore no columns can be transformed into `integer`.

We will convert some of the columns to the category and date type. Meanwhile the columns that contain numeric values will be converted to the float type in order to reduce the required memory.

In [10]:
# Show only the object (string) columns of the preview.
# obj_header is a set; pandas >= 2.0 raises a TypeError when a set is used
# as an indexer, so the column names are passed as a list instead.
preview[list(obj_header)]

Unnamed: 0,emp_length,sub_grade,last_credit_pull_d,purpose,issue_d,zip_code,term,emp_title,initial_list_status,home_ownership,int_rate,application_type,grade,verification_status,loan_status,revol_util,title,pymnt_plan,addr_state,last_pymnt_d,earliest_cr_line,id
0,10+ years,B2,Jun-2016,credit_card,Dec-2011,860xx,36 months,,f,RENT,10.65%,INDIVIDUAL,B,Verified,Fully Paid,83.7%,Computer,n,AZ,Jan-2015,Jan-1985,1077501
1,< 1 year,C4,Sep-2013,car,Dec-2011,309xx,60 months,Ryder,f,RENT,15.27%,INDIVIDUAL,C,Source Verified,Charged Off,9.4%,bike,n,GA,Apr-2013,Apr-1999,1077430
2,10+ years,C5,Jun-2016,small_business,Dec-2011,606xx,36 months,,f,RENT,15.96%,INDIVIDUAL,C,Not Verified,Fully Paid,98.5%,real estate business,n,IL,Jun-2014,Nov-2001,1077175


*revol_util* and *int_rate* can be converted into numeric by removing % and *term* by removing 'months'

*last_credit_pull_d*, *issue_d*, *last_pymnt_d* and *earliest_cr_line* can be converted into `Date` type.

In [20]:
# Columns holding month-year strings, to be parsed as dates on read
col_to_date = [
    'last_credit_pull_d',
    'issue_d',
    'last_pymnt_d',
    'earliest_cr_line',
]

# Columns to read with the 'category' dtype (column name -> dtype)
col_to_category = dict.fromkeys(
    [
        'emp_length',
        'sub_grade',
        'purpose',
        'initial_list_status',
        'home_ownership',
        'application_type',
        'grade',
        'verification_status',
        'loan_status',
        'pymnt_plan',
    ],
    'category',
)

In [19]:
# Re-read the file with date parsing and category dtypes applied on load
chunk_iter = pd.read_csv(
    'loans_2007.csv',
    chunksize=3000,
    parse_dates=col_to_date,
    dtype=col_to_category,
)

for chunk in chunk_iter:
    # term: strip surrounding spaces and the 'months' unit, then parse as a number
    term_text = chunk['term'].str.strip(' ').str.replace('months', '')
    chunk['term'] = pd.to_numeric(term_text)

    # revol_util and int_rate: strip the percent sign, then parse as a number
    for pct_col in ('revol_util', 'int_rate'):
        chunk[pct_col] = pd.to_numeric(chunk[pct_col].str.strip('%'))

# dtypes of the last chunk processed
chunk.dtypes

id                                    object
member_id                            float64
loan_amnt                            float64
funded_amnt                          float64
funded_amnt_inv                      float64
term                                 float64
int_rate                             float64
installment                          float64
grade                               category
sub_grade                           category
emp_title                             object
emp_length                          category
home_ownership                      category
annual_inc                           float64
verification_status                 category
issue_d                       datetime64[ns]
loan_status                         category
pymnt_plan                          category
purpose                             category
title                                 object
zip_code                              object
addr_state                            object
dti       

In [25]:
# Measure the memory footprint of the whole data set after the dtype
# optimizations: dates parsed on read, low-cardinality strings read as
# category, and term / revol_util / int_rate converted to numbers.
chunk_iter = pd.read_csv('loans_2007.csv',
                         chunksize=3000,
                         parse_dates=col_to_date,
                         dtype=col_to_category)

total_size = 0

for chunk in chunk_iter:
    # convert term, revol_util and int_rate into numeric
    chunk['term'] = pd.to_numeric(chunk['term'].str.strip(' ').str.replace('months', ''))
    chunk['revol_util'] = pd.to_numeric(chunk['revol_util'].str.strip('%'))
    chunk['int_rate'] = pd.to_numeric(chunk['int_rate'].str.strip('%'))

    # accumulate the deep memory usage of this chunk, in MB
    total_size += chunk.memory_usage(deep=True).sum()/(1024**2)

# The value is the sum over all chunks, so it is labelled as a total
# rather than as the size of a single chunk.
print('Total size:', total_size, 'MB')

Chunk size: 23.910348892211914 MB


We reduced the memory size from 66.2 MB to 23.9 MB by changing the column types.