In [4]:
m = pd.read_csv('train.csv')
m['Upc'] = m['Upc'].astype('str')
m.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929.0,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538815980.0,1,SHOES,8931.0
2,30,7,Friday,7410811099.0,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238403510.0,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006613744.0,2,PAINT AND ACCESSORIES,1017.0


# Memory usage Overall

In [5]:
m.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647054 entries, 0 to 647053
Data columns (total 7 columns):
TripType                 647054 non-null int64
VisitNumber              647054 non-null int64
Weekday                  647054 non-null object
Upc                      647054 non-null object
ScanCount                647054 non-null int64
DepartmentDescription    645693 non-null object
FinelineNumber           642925 non-null float64
dtypes: float64(1), int64(3), object(3)
memory usage: 145.1 MB


# Avg Memory for each type

In [11]:
m.select_dtypes(include=[int]).memory_usage() / 1024**2

Index          0.000076
TripType       4.936630
VisitNumber    4.936630
ScanCount      4.936630
dtype: float64

In [6]:
for dtype in ['float', 'int', 'object']:
    selected_dtype = m.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Avg memory usage for {} col: {:03.2f} MB".format(dtype, mean_usage_mb))
    

Avg memory usage for float col: 2.47 MB
Avg memory usage for int col: 3.70 MB
Avg memory usage for object col: 31.35 MB


# Subtype for the most common pandas types
- unit: unsigned integers -> storing positive values
- int: signed integers

In [16]:
data = {'memory usage':['1b','2b','4b','8b','variable'],
'float':['','float16','float32','float64', ''],
'int':['int8', 'int16', 'int32','int64',''],        
'unit':['unit8', 'unit16', 'unit32', 'uni64', ''],}
pd.DataFrame(data).set_index('memory usage')

Unnamed: 0_level_0,float,int,unit
memory usage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1b,,int8,unit8
2b,float16,int16,unit16
4b,float32,int32,unit32
8b,float64,int64,uni64
variable,,,


# numpy.iinfo

In [30]:
int_types = ['int8', 'int16']
for it in int_types:
    print(np.iinfo(it))

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------



# Optimizing Numeric Col: int
- pd.to_numeric: downcast='unsigned'

In [49]:
# memory check
def mem_usage(pd_obj):
    if isinstance(pd_obj, pd.DataFrame):
        usage_b = pd_obj.memory_usage(deep=True).sum()
    else:
        usage_b = pd_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2
    return "{:03.2f} MB".format(usage_mb)


In [35]:
# integer 
m_int = m.select_dtypes(include=['int'])
converted_int = m_int.apply(pd.to_numeric, downcast='unsigned')

print('original', mem_usage(m_int))
print('converted',mem_usage(converted_int))

compare_ints = pd.concat([m_int.dtypes, converted_int.dtypes], axis=1)
compare_ints.columns = ['before', 'after']
compare_ints.apply(pd.Series.value_counts)

original 14.81 MB
converted 8.64 MB


Unnamed: 0,before,after
uint16,,1
uint32,,1
int64,3.0,1


# Optimizing Numeric Col: float
- downcast = 'float'

In [37]:
# float
m_float = m.select_dtypes(include=['float'])
converted_float = m_float.apply(pd.to_numeric, downcast='float')

print('original', mem_usage(m_float))
print('converted',mem_usage(converted_float))

compare_float = pd.concat([m_float.dtypes, converted_float.dtypes], axis=1)
compare_float.columns = ['before', 'after']
compare_float.apply(pd.Series.value_counts)


original 4.94 MB
converted 2.47 MB


Unnamed: 0,before,after
float32,,1.0
float64,1.0,


In [50]:
optimized_m = m.copy()

optimized_m[converted_int.columns] = converted_int
optimized_m[converted_float.columns] = converted_float

print(mem_usage(m))
print(mem_usage(optimized_m))

145.14 MB
136.50 MB


# about str
- the size of original strings = one in a pandas series

In [52]:
from sys import getsizeof

s1 = 'get out'
s2 = 'you get out'
s3 = 'oh my goodness!'

for s in [s1, s2, s3]:
    print(getsizeof(s))

56
60
64


In [53]:
pd.Series(['get out',
           'you get out',
           'oh my goodness!',
]).apply(getsizeof)

0    56
1    60
2    64
dtype: int64

# Optimizing Object col
- conver to category

In [56]:
m_obj = m.select_dtypes(include=['object']).copy()
m_obj.describe()

Unnamed: 0,Weekday,Upc,DepartmentDescription
count,647054,647054.0,645693
unique,7,97715.0,68
top,Sunday,4011.0,GROCERY DRY GOODS
freq,133975,7657.0,70402


In [57]:
week = m_obj.Weekday
print(week.head())

week_cat = week.astype('category')
print(week_cat.head())

0    Friday
1    Friday
2    Friday
3    Friday
4    Friday
Name: Weekday, dtype: object
0    Friday
1    Friday
2    Friday
3    Friday
4    Friday
Name: Weekday, dtype: category
Categories (7, object): [Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday]


In [58]:
week_cat.head().cat.codes

0    0
1    0
2    0
3    0
4    0
dtype: int8

In [59]:
print(mem_usage(week))
print(mem_usage(week_cat))

39.51 MB
0.62 MB


# conver object to category type

In [61]:
converted_obj = pd.DataFrame()

for col in m_obj.columns:
    num_unique_values = len(m_obj[col].unique())
    num_total_values = len(m_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:, col] = m_obj[col].astype('category')
    else:
        converted_obj.loc[:, col] = m_obj[col]

In [65]:
print(mem_usage(m_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([m_obj.dtypes, converted_obj.dtypes], axis=1)
compare_obj.columns = ['before', 'after']
compare_obj.apply(pd.Series.value_counts)

125.39 MB
12.68 MB


Unnamed: 0,before,after
object,3.0,
category,,1.0
category,,1.0
category,,1.0


# selective reading
- dtype=column_types

In [80]:
dtypes = optimized_m.dtypes
dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

preview = first2pairs = {key: val for key, val in list(column_types.items())}

import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{   'DepartmentDescription': 'object',
    'FinelineNumber': 'float32',
    'ScanCount': 'int64',
    'TripType': 'uint16',
    'Upc': 'object',
    'VisitNumber': 'uint32',
    'Weekday': 'object'}


In [83]:
read_and_optimized = pd.read_csv('train.csv', dtype=column_types)

print(mem_usage(read_and_optimized))
read_and_optimized.head()

135.16 MB


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538815980,1,SHOES,8931.0
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017.0
