In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc # garbage collector
import psutil

# write classes, test cases, use __name__ == "__main__", args, kwargs
# write bash scripts

In [None]:
"""
Tips for Reducing Memory:

no longer using the variable:
    del temp
    
remove garbage after a transformation:
    gc.collect()
    
presetting datatypes:
    dtypes = {
            'ip'            : 'uint32',
            'app'           : 'uint16',
            'device'        : 'uint16',
            'os'            : 'uint16',
            'channel'       : 'uint16',
            'is_attributed' : 'uint8',
            }

    train = pd.read_csv('../input/train_sample.csv', dtype=dtypes)
    
import select rows:
    train = pd.read_csv('../input/train.csv', nrows=10000, dtype=dtypes)
    
import select columns:
    columns = ['ip', 'click_time', 'is_attributed']
    dtypes = {
            'ip'            : 'uint32',
            'is_attributed' : 'uint8',
            }

    ips_df = pd.read_csv('../input/train.csv', usecols=columns, dtype=dtypes)
"""

In [10]:
%%time
train_X = pd.read_csv('data/train.csv')
train_y = pd.read_csv('data/train_labels.csv')

CPU times: user 517 ms, sys: 57.1 ms, total: 574 ms
Wall time: 581 ms


In [36]:
train_X.dtypes.values

array([dtype('int32'), dtype('float64'), dtype('<M8[ns]'), category,
       dtype('int16'), category, dtype('float64'), dtype('float64'),
       dtype('O'), dtype('int16'), category, category, category,
       dtype('int8'), dtype('int8'), category, category, dtype('int16'),
       category, category, category, category, category, dtype('int16'),
       category, category, category, category, category, category,
       category, category, category, category, category, category,
       category, category, category, category], dtype=object)

In [None]:
train_X.info(memory_usage='deep')

In [44]:
train_X.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [None]:
print("size before:", train_X["date_recorded"].memory_usage(deep=True) * 1e-6)
train_X["date_recorded"] = pd.to_datetime(train_X["date_recorded"])
print("size after: ", train_X["date_recorded"].memory_usage(deep=True) * 1e-6)

In [11]:
def convert_obj_columns_to_cat(df, exclude_cols):
    """
    Convert the datatype of object columns to category columns.

    :param df: Dataframe of the data
    :type  df: pandas.core.frame.DataFrame
    :param exclude_cols: set of columns to exclude from conversion
    :type  exclude_cols: set
    :returns: dataframe
    :rtype:   pandas.core.frame.DataFrame
    """

    column_list = df.select_dtypes(include=['object']).columns
    column_list = [col for col in column_list if col not in exclude_cols]
    for col in column_list:
        print("converting", col.ljust(30),
              "size: ", round(df[col].memory_usage(deep=True)*1e-6, 2), end="\t")
        df[col] = df[col].astype("category")
        print("->\t", round(df[col].memory_usage(deep=True)*1e-6, 2))
    return df

#train_X = convert_obj_columns_to_cat(train_X, {'wpt_name'})

In [None]:
# to manage objects in memory and specifically removing unneeded objects
# After doing larger transformations, object creations/deletions or generally anything else that runs 
# for more than a few seconds,, it can help to free up memory by calling the garbage collector directly.

In [None]:
print("available RAM:", psutil.virtual_memory())
gc.collect()
print("available RAM:", psutil.virtual_memory())

In [12]:
def downcast_df_int_columns(df):
    """
    Change integer types to decrease memory usage.

    :param df: Dataframe of the data
    :type  df: pandas.core.frame.DataFrame
    :returns: dataframe
    :rtype:   pandas.core.frame.DataFrame
    """

    list_of_columns = list(df.select_dtypes(
        include=["int32", "int64"]).columns)

    if len(list_of_columns) >= 1:
        # finds max string length for better status printing
        max_string_length = max([len(col) for col in list_of_columns])
        print("\ndowncasting integers for:", list_of_columns, "\n")

        for col in list_of_columns:
            print("reduced memory usage for:  ", col.ljust(max_string_length+2)[:max_string_length+2],
                  "from", str(round(df[col].memory_usage(deep=True)*1e-6, 2)).rjust(8), "to", end=" ")
            df[col] = pd.to_numeric(df[col], downcast="integer")
            print(str(round(df[col].memory_usage(deep=True)*1e-6, 2)).rjust(8))
    else:
        print("no columns to downcast")

    gc.collect()

    return df

#train_X = downcast_df_int_columns(train_X)

In [None]:
train_X.info(memory_usage='deep')

In [None]:
#train_X['wpt_name'] = train_X['wpt_name'].apply(lambda x: int(x,16))
#train_X.memory_usage(deep=True)*1e-6

In [13]:
def compress_labels(df):
    df['status_group'] = df['status_group'].astype("category")
    df['id'] = pd.to_numeric(df['id'], downcast="integer")
    return df

def compress_X(df):
    
    memory = df.memory_usage(index=True).sum()*1e-6
    
    print ("memory used before preprocess: ", memory)
    # Compress date time
    print("\ndate time size before:", df["date_recorded"].memory_usage(deep=True) * 1e-6)
    df["date_recorded"] = pd.to_datetime(df["date_recorded"])
    print("date time size after: ", df["date_recorded"].memory_usage(deep=True) * 1e-6, '\n')
    
    # convert and compress objects to categories
    df = convert_obj_columns_to_cat(df, {'wpt_name'})
    
    # compress integer values
    df = downcast_df_int_columns(df)
    
    print("\navailable RAM:", psutil.virtual_memory())
    gc.collect()
    print("available RAM:", psutil.virtual_memory())
    
    memory = df.memory_usage(index=True).sum()*1e-6
    print ("\nmemory used after preprocess: ", memory)
    
    return df
    
    
train_X = compress_X(train_X)

memory used before preprocess:  19.00808

date time size before: 3.9798799999999996
date time size after:  0.47528 

converting funder                         size:  3.86	->	 0.33
converting installer                      size:  3.64	->	 0.34
converting basin                          size:  4.03	->	 0.06
converting subvillage                     size:  3.85	->	 2.03
converting region                         size:  3.78	->	 0.06
converting lga                            size:  3.83	->	 0.07
converting ward                           size:  3.83	->	 0.34
converting public_meeting                 size:  2.1	->	 0.06
converting recorded_by                    size:  4.75	->	 0.06
converting scheme_management              size:  3.55	->	 0.06
converting scheme_name                    size:  3.15	->	 0.4
converting permit                         size:  2.06	->	 0.06
converting extraction_type                size:  3.84	->	 0.06
converting extraction_type_group          size:  3.85	->	 0.06
con