# Analysis of Dublin Bus Big Data [Optimizing Memory]

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

## Reading RT_TRIPS DATA

In [3]:
trips_df = pd.read_csv('rt_trips_full.csv')

In [4]:
trips_df.head(10)

Unnamed: 0,datasource,dayofservice,tripid,lineid,routeid,direction,plannedtime_arr,plannedtime_dep,actualtime_arr,actualtime_dep,basin,tenderlot,suppressed,justificationid,lastupdate,note
0,DB,09-FEB-16 00:00:00,2824642,145,145_105,2,45668,40800,45375.0,40782.0,BasDef,,,,12-APR-16 09:35:20,",2426901,"
1,DB,09-FEB-16 00:00:00,2826717,9,9_7,2,65553,60600,66258.0,60582.0,BasDef,,,,12-APR-16 09:35:20,",2589266,"
2,DB,09-FEB-16 00:00:00,2826730,54A,54A_12,2,65951,62100,66053.0,62078.0,BasDef,,,,12-APR-16 09:35:20,",1717180,"
3,DB,09-FEB-16 00:00:00,2826743,7,7_51,1,54763,50400,54854.0,50383.0,BasDef,,,,12-APR-16 09:35:20,",2057499,"
4,DB,09-FEB-16 00:00:00,2812908,39,39_20,1,27375,22920,27318.0,22931.0,BasDef,,,,12-APR-16 09:35:20,",2422848,"
5,DB,09-FEB-16 00:00:00,2813717,56A,56A_30,2,65940,62100,66737.0,62486.0,BasDef,,,,12-APR-16 09:35:20,",1724138,"
6,DB,09-FEB-16 00:00:00,2815965,37,37_14,1,83363,80100,82954.0,80164.0,BasDef,,,,12-APR-16 09:35:20,",2167457,"
7,DB,09-FEB-16 00:00:00,2815967,11,11_40,1,31658,27000,33997.0,27009.0,BasDef,,,,12-APR-16 09:35:20,",2426206,"
8,DB,09-FEB-16 00:00:00,2815970,11,11_42,2,49346,44700,50104.0,44720.0,BasDef,,,,12-APR-16 09:35:20,",2426209,"
9,DB,09-FEB-16 00:00:00,2818038,63,63_18,1,81659,79800,82199.0,79802.0,BasDef,,,,12-APR-16 09:35:20,",1749203,"


In [5]:
trips_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2041415 entries, 0 to 2041414
Data columns (total 16 columns):
datasource         object
dayofservice       object
tripid             int64
lineid             object
routeid            object
direction          int64
plannedtime_arr    int64
plannedtime_dep    int64
actualtime_arr     float64
actualtime_dep     float64
basin              object
tenderlot          float64
suppressed         float64
justificationid    float64
lastupdate         object
note               object
dtypes: float64(5), int64(4), object(7)
memory usage: 1.0 GB


In [6]:
# Analysis of the memory usage by the data
# sns.set_style("whitegrid")
dtype_df = pd.DataFrame()
dttype=[]
size_c = []
for dtype in ['float','int64','object']:
    selected_dtype = trips_df.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    dttype.append(dtype)
    size_c.append(mean_usage_mb)
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))
dtype_df['Type']=dttype
dtype_df['Size Consumed'] = size_c

Average memory usage for float columns: 12.98 MB
Average memory usage for int64 columns: 12.46 MB
Average memory usage for object columns: 111.99 MB


In [7]:
sns.barplot(x='Type',y='Size Consumed',data=dtype_df)
plt.savefig('sizeConsumptionByType.png',transparent=True)

NameError: name 'sns' is not defined

We can see that int has no effect on our data consumption. The major effect is float and object columns. We will drop those columns that aren't of any use and will downcast the rest columns.

### Int Columns

In [8]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [9]:
trips_df_int = trips_df.select_dtypes(include=['int64'])
converted_int = trips_df_int.apply(pd.to_numeric,downcast='unsigned')
print(mem_usage(trips_df_int))
print(mem_usage(converted_int))

62.30 MB
25.31 MB


We can see a drop of 40% memory usage

In [10]:
compare_int = pd.concat([trips_df_int.dtypes,converted_int.dtypes],axis=1)
compare_int.columns = ['before','after']
compare_int.apply(pd.Series.value_counts)

Unnamed: 0,before,after
uint8,,1.0
uint32,,3.0
int64,4.0,


### Float Columns

In [11]:
trips_df_float = trips_df.select_dtypes(include=['float'])
converted_float = trips_df_float.apply(pd.to_numeric,downcast='float')
print(mem_usage(trips_df_float))
print(mem_usage(converted_float))

77.87 MB
38.94 MB


We can see a drop of 50% consumption by just downcasting the columns

In [12]:
compare_floats = pd.concat([trips_df_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

Unnamed: 0,before,after
float32,,5.0
float64,5.0,


In [13]:
optimized_trips_df = trips_df.copy()
optimized_trips_df[converted_int.columns] = converted_int
optimized_trips_df[converted_float.columns] = converted_float

In [14]:
print(mem_usage(trips_df))
print(mem_usage(optimized_trips_df))

1036.10 MB
960.18 MB


While the size of our numeric columns were reduced at a rate of approx 45%.. the overall size of the df has not quite been affected that much. This is due to the object columns.

### Object Columns

In [15]:
df_trips_object = trips_df.select_dtypes(include=['object']).copy()

In [16]:
df_trips_object.describe().T

Unnamed: 0,count,unique,top,freq
datasource,2041415,1,DB,2041415
dayofservice,2041415,334,27-FEB-17 00:00:00,7101
lineid,2041415,125,46A,72464
routeid,2041415,545,46A_74,35060
basin,2041415,1,BasDef,2041415
lastupdate,2041415,334,16-MAR-17 11:13:36,7101
note,2041415,32562,",2421044,",230


Pandas categorical type columns consume less space as compared to object columns. Any column that has less than 50% unique values should be converter to category

In [17]:
converted_obj = pd.DataFrame()

for col in df_trips_object.columns:
    num_unique_values = len(df_trips_object[col].unique())
    num_total_values = len(df_trips_object[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = df_trips_object[col].astype('category')
    else:
        converted_obj.loc[:,col] = df_trips_object[col]

In [18]:
print(mem_usage(df_trips_object))
print(mem_usage(converted_obj))

895.93 MB
24.91 MB


We can see here a drop in memory usage of about 97%

In [21]:
compare_obj = pd.concat([df_trips_object.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

Unnamed: 0,before,after
object,7.0,
category,,1.0
category,,1.0
category,,1.0
category,,1.0
category,,1.0
category,,1.0
category,,1.0


In [22]:
optimized_trips_df[converted_obj.columns] = converted_obj

In [20]:
print(mem_usage(optimized_trips_df))
print(mem_usage(trips_df))

89.16 MB
1036.10 MB


In [21]:
optimized_trips_df.columns

Index(['group', 'datasource', 'dayofservice', 'tripid', 'lineid', 'routeid',
       'direction', 'plannedtime_arr', 'plannedtime_dep', 'actualtime_arr',
       'actualtime_dep', 'basin', 'tenderlot', 'suppressed', 'justificationid',
       'lastupdate', 'note'],
      dtype='object')

Saving the dtypes for future loads

In [22]:
dtypes = optimized_trips_df.dtypes
dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]
column_types = dict(zip(dtypes_col, dtypes_type))
preview = first2pairs = {key:value for key,value in list(column_types.items())[:]}
import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{   'actualtime_arr': 'float32',
    'actualtime_dep': 'float32',
    'basin': 'category',
    'datasource': 'category',
    'dayofservice': 'category',
    'direction': 'uint8',
    'group': 'uint16',
    'justificationid': 'float32',
    'lastupdate': 'category',
    'lineid': 'category',
    'note': 'category',
    'plannedtime_arr': 'uint32',
    'plannedtime_dep': 'uint32',
    'routeid': 'category',
    'suppressed': 'float32',
    'tenderlot': 'float32',
    'tripid': 'uint32'}


In [23]:
type(column_types)

dict

In [24]:
import json
with open('rt_trips_type.json','w') as fout:
    json.dump(column_types,fout)

## RT Leave Times data

In [25]:
lT_df = pd.read_csv('rt_leavetimes_full.csv')

In [26]:
lt_df_int = lT_df.select_dtypes(include=['int64'])
converted_int = lt_df_int.apply(pd.to_numeric,downcast='unsigned')
lt_df_float = lT_df.select_dtypes(include=['float64'])
converted_float = lt_df_float.apply(pd.to_numeric,downcast='float')

In [30]:
lT_df.head(10)

Unnamed: 0,group,datasource,dayofservice,tripid,progrnumber,stoppointid,plannedtime_arr,plannedtime_dep,actualtime_arr,actualtime_dep,vehicleid,passengers,passengersin,passengersout,distance,suppressed,justificationid,lastupdate,note
0,2016,DB,31-JAN-16 00:00:00,2811044,63,3390,41820,41820,42920,42920,1000897,,,,,,,12-APR-16 05:30:20,
1,2016,DB,31-JAN-16 00:00:00,2820852,1,7158,69300,69300,69320,69320,1000903,,,,,,,12-APR-16 05:30:20,
2,2016,DB,31-JAN-16 00:00:00,2820852,5,7017,69498,69498,69532,69532,1000903,,,,,,,12-APR-16 05:30:20,
3,2016,DB,31-JAN-16 00:00:00,2820852,10,1893,69711,69711,69858,69858,1000903,,,,,,,12-APR-16 05:30:20,
4,2016,DB,31-JAN-16 00:00:00,2820853,16,1648,76822,76822,76636,76657,1000903,,,,,,,12-APR-16 05:30:20,
5,2016,DB,31-JAN-16 00:00:00,2820853,21,1805,76989,76989,76826,76826,1000903,,,,,,,12-APR-16 05:30:20,
6,2016,DB,31-JAN-16 00:00:00,2820853,36,1854,77792,77792,77700,77715,1000903,,,,,,,12-APR-16 05:30:20,
7,2016,DB,31-JAN-16 00:00:00,2820853,55,1899,78676,78676,78609,78609,1000903,,,,,,,12-APR-16 05:30:20,
8,2016,DB,31-JAN-16 00:00:00,2820853,60,7240,78798,78798,78844,78844,1000903,,,,,,,12-APR-16 05:30:20,
9,2016,DB,31-JAN-16 00:00:00,2820854,7,7030,80378,80378,80352,80352,1000903,,,,,,,12-APR-16 05:30:20,


In [27]:
print(mem_usage(lt_df_int))
print(mem_usage(converted_int))

6323.43 MB
2546.94 MB


In [29]:
compare_int = pd.concat([lt_df_int.dtypes,converted_int.dtypes],axis=1)
compare_int.columns = ['before','after']
compare_int.apply(pd.Series.value_counts)

Unnamed: 0,before,after
uint8,,1.0
uint16,,2.0
uint32,,6.0
int64,9.0,


In [28]:
print(mem_usage(lt_df_float))
print(mem_usage(converted_float))

4918.22 MB
2459.11 MB


In [32]:
df_lt_object = lT_df.select_dtypes(include=['object']).copy()

In [33]:
df_lt_object.describe().T

Unnamed: 0,count,unique,top,freq
datasource,92091588,1,DB,92091588
dayofservice,92091588,333,13-JAN-16 00:00:00,378931
lastupdate,92091587,334,02-MAR-16 19:12:56,378931


In [34]:
converted_obj = pd.DataFrame()
for col in df_lt_object.columns:
    converted_obj.loc[:,col] = df_lt_object[col].astype('category')

In [35]:
print(mem_usage(df_lt_object))
print(mem_usage(converted_obj))

18355.50 MB
439.19 MB


In [39]:
converted_float.dtypes

passengers         float32
passengersin       float32
passengersout      float32
distance           float32
suppressed         float32
justificationid    float32
note               float32
dtype: object

In [40]:
converted_int.dtypes

group              uint16
tripid             uint32
progrnumber         uint8
stoppointid        uint16
plannedtime_arr    uint32
plannedtime_dep    uint32
actualtime_arr     uint32
actualtime_dep     uint32
vehicleid          uint32
dtype: object

In [42]:
converted_obj.dtypes

datasource      category
dayofservice    category
lastupdate      category
dtype: object

In [None]:
# Whats the accurate memory usage of the dataframe
lT_df.info(memory_usage='deep')

In [None]:
# Analysis of the memory usage by the data
sns.set_style("whitegrid")
dtype_df = pd.DataFrame()
dttype=[]
size_c = []
for dtype in ['float64','int64','object']:
    selected_dtype = lT_df.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    dttype.append(dtype)
    size_c.append(mean_usage_mb)
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))
dtype_df['Type']=dttype
dtype_df['Size Consumed'] = size_c

In [None]:
sns.barplot(x='Type',y='Size Consumed',data=dtype_df)
plt.savefig('sizeConsumptionByTypeLeaveTimes.png',transparent=True)

### Int Columns

In [None]:
lt_df_int = lT_df.select_dtypes(include=['int64'])
converted_int = lt_df_int.apply(pd.to_numeric,downcast='unsigned')
print(mem_usage(lt_df_int))
print(mem_usage(converted_int))

In [40]:
compare_int = pd.concat([lt_df_int.dtypes,converted_int.dtypes],axis=1)
compare_int.columns = ['before','after']
compare_int.apply(pd.Series.value_counts)

Unnamed: 0,before,after
uint8,,1.0
uint16,,2.0
uint32,,6.0
int64,9.0,


### Float Columns

In [None]:
lt_df_float = lT_df.select_dtypes(include=['float'])
converted_float = lt_df_float.apply(pd.to_numeric,downcast='float')
print(mem_usage(lt_df_float))
print(mem_usage(converted_float))

We can see a drop of 50% consumption by just downcasting the columns

In [None]:
compare_floats = pd.concat([lt_df_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

In [None]:
optimized_lt_df = lT_df.copy()
optimized_lt_df[converted_int.columns] = converted_int
optimized_lt_df[converted_float.columns] = converted_float

In [None]:
print(mem_usage(lT_df))
print(mem_usage(optimized_lt_df))

### Object

In [43]:
lT_df.columns

Index(['group', 'datasource', 'dayofservice', 'tripid', 'progrnumber',
       'stoppointid', 'plannedtime_arr', 'plannedtime_dep', 'actualtime_arr',
       'actualtime_dep', 'vehicleid', 'passengers', 'passengersin',
       'passengersout', 'distance', 'suppressed', 'justificationid',
       'lastupdate', 'note'],
      dtype='object')