In [1]:
import numpy as np
import pandas as pd

### Function

In [3]:
"""
Reducer could readuce your Pandas DataFrame

This process doesn't change data into a DataFrame,
it only replaces datatypes and sets necessary datatypes for each columns
"""

def mem_usage(obj):
    """
    Function chech current size of a DataFrame
    
    input: pd.DataFrame | pd.Series
    
    return: sting
    """
    #if DataFrame
    if isinstance(obj, pd.DataFrame):
        usage_b = obj.memory_usage(deep = True).sum()
    #if Series
    else:
        usage_b = obj.memory_usage(deep = True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)



def replace_cat(obj, min_value=0.10, features=None):
    """
    Function converts objects to numveric types
    (only columns which are less unique then min_value % is)
    
    input: pd.DataFrame | pd.Series
    input: hasattr(features, '__iter__')
    input: .0 =< min_value >= 1.
    
    return pd.DataFrame | pd.Series
    """
    #it's only with DataFrame
    if isinstance(obj, pd.DataFrame):
        converted_obj = pd.DataFrame()
        if not features: features = obj.columns
        #observe of columns 
        for col in features:
            num_unique_values = len(obj[col].unique())
            num_total_values = len(obj[col])
            #if % of a unique feature less the min_value
            if num_unique_values / num_total_values < min_value:
                converted_obj.loc[:,col] = obj[col].astype('category')
            else:
                converted_obj.loc[:,col] = obj[col]
        return converted_obj
    #if Series
    else:
        num_unique_values = len(obj[col].unique())
        num_total_values = len(obj[col])
        #if % of a unique feature less the min_value
        if num_unique_values / num_total_values < min_value:
            return obj[col].astype('category')
        else:
            return obj[col]

        
        
def as_dict(frame):
    """
    Function transforms the frame to the dictionary
    
    input: pd.DataFrame
    
    return dictionary
    """
    #process for transforming data into a dictionary
    dtypes = frame.dtypes
    dtypes_col = dtypes.index
    dtypes_type = [i.name for i in dtypes.values]
    column_types = dict(zip(dtypes_col, dtypes_type))
    return {key:value for key,value in list(column_types.items())}



def weight_reducer(frame, min_value=0.10, get_dict = None):
    """
    Main function reduces a data frame weight
    
    input: pd.DataFrame
    input: .0 =< min_value >= 1.
    input: get_dict == bool
    
    return pd.DataFrame  
    """
    #only if it is frame
    if isinstance(frame, pd.DataFrame):
        before = mem_usage(frame)
        for dtype in set(frame.dtypes):
            if 'float' in str(dtype):
                cols = train_df[[col for col in train_df.columns if 'float' in str(train_df[col].dtypes)]].select_dtypes(include = [dtype]).columns
                frame[cols] = frame[cols].select_dtypes(include = [dtype]). \
                              apply(pd.to_numeric, downcast='float')
            if 'int' in str(dtype): 
                cols = train_df[[col for col in train_df.columns if 'int' in str(train_df[col].dtypes)]].select_dtypes(include = [dtype]).columns
                frame[cols] = frame[cols].select_dtypes(include = [dtype]).\
                              apply(pd.to_numeric, downcast='unsigned')
            frame= replace_cat(frame, min_value)
        after = mem_usage(frame)
        print('Columns - {}, early size - {}, replaced size - {}'.format(frame.shape[1], before, after))
        if get_dict: frame = as_dict(frame)
        return frame

### Explanation

In [26]:
### Download your data set ;)

In [31]:
frame = train_df.copy()

In [36]:
#check size of the data set
frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31052 entries, 0 to 17526
Columns: 1275 entries, SK_SUBS_ID to LIFT_ARPU
dtypes: float64(1107), object(168)
memory usage: 302.3+ MB


In [37]:
#but! it't not true (302 MB, what does a plus mean?)
#go deeper
frame.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31052 entries, 0 to 17526
Columns: 1275 entries, SK_SUBS_ID to LIFT_ARPU
dtypes: float64(1107), object(168)
memory usage: 441.7 MB


In [38]:
#WOW! Difference is more the 130 MB
#let's use reducing function
reduced_frame = weight_reducer(frame)

Columns - 1275, early size - 441.70 MB, replaced size - 69.04 MB


In [39]:
#Not bad, isn't it?
#The function reduces our data set from 441 MB to 69 MB
#Surprisingly!!!

In [40]:
#Let's check where data hid
print(mem_usage(frame))
print(mem_usage(reduced_frame))
comparator = pd.concat([frame.dtypes, reduced_frame.dtypes],axis=1)
comparator.columns = ['before','after']
comparator.apply(pd.Series.value_counts)

313.98 MB
69.04 MB


Unnamed: 0,before,after
float32,1107.0,152
object,168.0,2
category,,1121


In [46]:
#unfortunately, it takes time ;(
#How can we struggle with this probleb?
#It would be better, if we was able to download reduced data
#There is the way for downloading reduced data -> enable get_dict = True
reduced_dict = weight_reducer(frame, get_dict = True)
{k:reduced_dict[k] for k in frame.columns[:10]}

Columns - 1275, early size - 313.98 MB, replaced size - 69.04 MB


{'DATE_KEY': 'category',
 'END_DATE': 'category',
 'END_WEEK_DATE': 'category',
 'FILIAL_ID': 'category',
 'SK_SUBS_ID': 'object',
 'START_DATE': 'category',
 'START_WEEK_DATE': 'category',
 'TM_NONE_ALLOT': 'category',
 'TM_NONE_CNT_3MNTH': 'category',
 'TM_NONE_CNT_6MNTH': 'category'}

In [None]:
#reduced_dict includes a dictionary, we use the dictionary during downloading a data set
#due to DTYPE parametr we could download
pd.read_csv(link, sep='\t', encoding='utf-8', dtype=reduced_dict)