In [1]:
import os
import json
import pandas as pd
import numpy as np

# Remember the directory the notebook was started from the first time this cell
# runs, so that re-running the cell resolves the relative data path against the
# same starting point instead of walking two more levels up from the data folder.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../../data/'))


# https://www.kaggle.com/code/gemartin/load-data-reduce-memory-usage -- slightly modified version
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to the smallest suitable dtype.

    The frame is modified in place and also returned. Memory usage before and
    after the conversion is printed in MB.

    Conversion rules, applied per column:
      * object / string columns   -> ``category``
      * signed integer columns    -> smallest of int8 / int16 / int32 that
                                     holds the column's min and max
      * unsigned integer columns  -> smallest of uint8 / uint16 / uint32
      * float columns             -> float16 (optional) or float32 when the
                                     value range fits
      * everything else (bool, datetime, timedelta, categorical and other
        extension dtypes) is left untouched.

    A column is never converted to a wider type than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. This only checks the
        value *range*, not precision, so the conversion is lossy (float16
        carries roughly three significant decimal digits). Pass ``False`` to
        stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns are stored as categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable ints, tz-aware datetimes, ...)
        # have no plain NumPy representation to downcast to.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            candidates = signed_types
        elif col_type.kind == 'u':
            candidates = unsigned_types
        elif col_type.kind == 'f':
            candidates = float_types
        else:
            # bool, datetime64, timedelta64, complex, ...: nothing to do.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by size: once they are no narrower than
            # the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = np.finfo(candidate) if col_type.kind == 'f' else np.iinfo(candidate)
            # Inclusive bounds: the limits themselves are representable.
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # which leaves the column unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Load data

In [2]:
# Read the four raw CSV files from the current working directory, in the same
# order as before, and bind each resulting frame to its own name.
transactions_df, customers_df, articles_df, submissions_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'transactions_train.csv',
        'customers.csv',
        'articles.csv',
        'sample_submission.csv',
    )
)

## Unique customers

In this part, the unique `customer_id` values are collected from the three files in which they occur. From them an index is built that assigns every id a consecutive integer, starting at 0.
Replacing the long string ids with these integers drastically reduces the memory footprint of the files (shown later). The index is saved at the end of the notebook so that the mapping stays consistent.

In [3]:
# Distinct customer ids as they appear in each of the three files.
trans_cust = transactions_df['customer_id'].unique()
cust_cust = customers_df['customer_id'].unique()
subm_cust = submissions_df['customer_id'].unique()

# np.unique de-duplicates the combined ids and returns them sorted, so the
# position of an id in this array is its integer index.
all_unique_customers = np.unique(np.concatenate([trans_cust, cust_cust, subm_cust]))

print('Total of unique customers: ', len(all_unique_customers))

# id -> consecutive integer, starting at 0
cust_id_to_idx = dict(zip(all_unique_customers, range(len(all_unique_customers))))

# The sample submission was only needed for its customer ids.
del submissions_df

Total of unique customers:  1371980


## Transactions train reduction

Reduction of file <b>transactions_train.csv</b>. 

DataFrame size reduced from <b>6426 MB </b> to <b>576 MB</b>.

In [4]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in display() would also render a stray "None".
transactions_df.info(memory_usage='deep')
display(transactions_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 6.3 GB


None

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [5]:
# Shrink transactions_df step by step, reporting the memory footprint (deep,
# i.e. including the Python string objects) after every step.
start_memory = transactions_df.memory_usage(deep=True).sum()
current_memory = start_memory

print('Starting memory : ', start_memory // 1024**2, 'MB')

# Step 1: date strings -> datetime64.
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])
current_memory = transactions_df.memory_usage(deep=True).sum()

print('\nMemory after datetime coversion: ', current_memory // 1024**2, 'MB', '. Total memory reduction: ', (start_memory - current_memory) // 1024**2, 'MB', f'({(start_memory - current_memory) / start_memory * 100:.2f}%)')

# Step 2: customer id strings -> integer index. Series.map performs the
# dictionary lookup in bulk instead of calling a Python lambda once per row.
# Unlike a direct dict lookup, map() yields NaN for unknown keys, so missing
# ids are checked explicitly to keep the failure loud (this also triggers if
# the cell is re-run after the column has already been converted).
customer_idx = transactions_df['customer_id'].map(cust_id_to_idx)
if customer_idx.isna().any():
    raise KeyError('transactions_df.customer_id contains values that are not keys of cust_id_to_idx')
transactions_df['customer_id'] = customer_idx.astype('int64')
del customer_idx
current_memory = transactions_df.memory_usage(deep=True).sum()

print('\nMemory after customer_id mapping: ', current_memory // 1024**2, 'MB', '. Total memory reduction: ', (start_memory - current_memory) // 1024**2, 'MB', f'({(start_memory - current_memory) / start_memory * 100:.2f}%)')

# Step 3: downcast the remaining numeric columns.
print('\nRunning reduce_mem_usage function...\n')
transactions_df = reduce_mem_usage(transactions_df)
current_memory = transactions_df.memory_usage(deep=True).sum()

print('\nMemory after reduce_mem_usage function: ', current_memory // 1024**2, 'MB', '. Total memory reduction: ', (start_memory - current_memory) // 1024**2, 'MB', f'({(start_memory - current_memory) / start_memory * 100:.2f}%)')

Starting memory :  6426 MB

Memory after datetime coversion:  4638 MB . Total memory reduction:  1788 MB (27.83%)

Memory after customer_id mapping:  1212 MB . Total memory reduction:  5214 MB (81.13%)

Running reduce_mem_usage function...

Memory usage of dataframe is 1212.63 MB
Memory usage after optimization is: 576.00 MB
Decreased by 52.5%

Memory after reduce_mem_usage function:  575 MB . Total memory reduction:  5850 MB (91.04%)


In [7]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in display() would also render a stray "None".
transactions_df.info(memory_usage='deep')
display(transactions_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       int32         
 2   article_id        int32         
 3   price             float16       
 4   sales_channel_id  int8          
dtypes: datetime64[ns](1), float16(1), int32(2), int8(1)
memory usage: 576.0 MB


None

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,2,663713001,0.050842,2
1,2018-09-20,2,541518023,0.030487,2
2,2018-09-20,7,505221004,0.015236,2
3,2018-09-20,7,685687003,0.016937,2
4,2018-09-20,7,685687004,0.016937,2


## Customers file reduction

Reduction of file <b>customers.csv</b>. 

DataFrame size reduced from <b>512 MB </b> to <b>20 MB</b>. 

In [8]:
# Same idea as for customer_id, but postal codes occur in a single file only,
# so the index is built directly from that column's unique values.
postal_code_to_idx = {
    code: position
    for position, code in enumerate(customers_df['postal_code'].unique())
}

In [9]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in display() would also render a stray "None".
customers_df.info(memory_usage='deep')
display(customers_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355969 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 512.3 MB


None

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [10]:
# Shrink customers_df step by step, reporting the memory footprint (deep,
# i.e. including the Python string objects) after every step.
start_memory = customers_df.memory_usage(deep=True).sum()
current_memory = start_memory

print('Starting memory : ', start_memory // 1024**2, 'MB')

# Step 1: customer id strings -> integer index. Series.map performs the
# dictionary lookup in bulk instead of calling a Python lambda once per row.
# map() yields NaN for unknown keys, so missing ids are checked explicitly to
# keep the failure loud (this also triggers if the cell is re-run after the
# column has already been converted).
customer_idx = customers_df['customer_id'].map(cust_id_to_idx)
if customer_idx.isna().any():
    raise KeyError('customers_df.customer_id contains values that are not keys of cust_id_to_idx')
customers_df['customer_id'] = customer_idx.astype('int64')
del customer_idx
current_memory = customers_df.memory_usage(deep=True).sum()

print('\nMemory after customer_id mapping: ', current_memory // 1024**2, 'MB', '. Total memory reduction: ', (start_memory - current_memory) // 1024**2, 'MB', f'({(start_memory - current_memory) / start_memory * 100:.2f}%)')

# Step 2: postal code strings -> integer index, with the same validation.
postal_idx = customers_df['postal_code'].map(postal_code_to_idx)
if postal_idx.isna().any():
    raise KeyError('customers_df.postal_code contains values that are not keys of postal_code_to_idx')
customers_df['postal_code'] = postal_idx.astype('int64')
del postal_idx
current_memory = customers_df.memory_usage(deep=True).sum()

print('\nMemory after postal_code mapping: ', current_memory // 1024**2, 'MB', '. Total memory reduction: ', (start_memory - current_memory) // 1024**2, 'MB', f'({(start_memory - current_memory) / start_memory * 100:.2f}%)')

# Step 3: downcast numeric columns and turn the remaining text columns into
# categoricals.
print('\nRunning reduce_mem_usage function...\n')
customers_df = reduce_mem_usage(customers_df)
current_memory = customers_df.memory_usage(deep=True).sum()

print('\nMemory after reduce_mem_usage function: ', current_memory // 1024**2, 'MB', '. Total memory reduction: ', (start_memory - current_memory) // 1024**2, 'MB', f'({(start_memory - current_memory) / start_memory * 100:.2f}%)')

Starting memory :  512 MB

Memory after customer_id mapping:  364 MB . Total memory reduction:  147 MB (28.86%)

Memory after postal_code mapping:  216 MB . Total memory reduction:  295 MB (57.72%)

Running reduce_mem_usage function...

Memory usage of dataframe is 216.59 MB
Memory usage after optimization is: 20.94 MB
Decreased by 90.3%

Memory after reduce_mem_usage function:  20 MB . Total memory reduction:  491 MB (95.91%)


In [11]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in display() would also render a stray "None".
customers_df.info(memory_usage='deep')
display(customers_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype   
---  ------                  --------------    -----   
 0   customer_id             1371980 non-null  int32   
 1   FN                      476930 non-null   float16 
 2   Active                  464404 non-null   float16 
 3   club_member_status      1365918 non-null  category
 4   fashion_news_frequency  1355969 non-null  category
 5   age                     1356119 non-null  float16 
 6   postal_code             1371980 non-null  int32   
dtypes: category(2), float16(3), int32(2)
memory usage: 20.9 MB


None

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,0,,,ACTIVE,NONE,49.0,0
1,1,,,ACTIVE,NONE,25.0,1
2,2,,,ACTIVE,NONE,24.0,2
3,3,,,ACTIVE,NONE,54.0,3
4,4,1.0,1.0,ACTIVE,Regularly,52.0,4


## Articles file reduction

Reduction of file <b>articles.csv</b>. 

DataFrame size reduced from <b>117 MB </b> to <b>19 MB</b>. Further reduction possible by eliminating duplicate (integer vs object) columns.

In [12]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in display() would also render a stray "None".
articles_df.info(memory_usage='deep')
display(articles_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

None

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [13]:
# Footprint of the raw articles frame (deep: string contents are counted too).
start_memory = articles_df.memory_usage(deep=True).sum()
current_memory = start_memory

print('Starting memory : ', start_memory // 1024**2, 'MB')

# No id re-indexing is needed for this file; the generic downcast is enough.
articles_df = reduce_mem_usage(articles_df)
current_memory = articles_df.memory_usage(deep=True).sum()
saved_bytes = start_memory - current_memory

print(
    '\nMemory after reduce_mem_usage function: ', current_memory // 1024**2, 'MB',
    '. Total memory reduction: ', saved_bytes // 1024**2, 'MB',
    f'({saved_bytes / start_memory * 100:.2f}%)',
)

Starting memory :  117 MB
Memory usage of dataframe is 117.58 MB
Memory usage after optimization is: 18.98 MB
Decreased by 83.9%

Memory after reduce_mem_usage function:  18 MB . Total memory reduction:  98 MB (83.86%)


In [14]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in display() would also render a stray "None".
articles_df.info(memory_usage='deep')
display(articles_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype   
---  ------                        --------------   -----   
 0   article_id                    105542 non-null  int32   
 1   product_code                  105542 non-null  int32   
 2   prod_name                     105542 non-null  category
 3   product_type_no               105542 non-null  int16   
 4   product_type_name             105542 non-null  category
 5   product_group_name            105542 non-null  category
 6   graphical_appearance_no       105542 non-null  int32   
 7   graphical_appearance_name     105542 non-null  category
 8   colour_group_code             105542 non-null  int8    
 9   colour_group_name             105542 non-null  category
 10  perceived_colour_value_id     105542 non-null  int8    
 11  perceived_colour_value_name   105542 non-null  category
 12  perceived_colour_master_id    

None

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


## Save files

 - Save compressed files to .pkl objects, to enable easy loading in later phases of project. 
 - Save index dictionaries for submission purposes.

In [15]:
# Persist the reduced frames as pickles (relative to the current working
# directory). makedirs(exist_ok=True) creates the folder when needed and is a
# no-op when it already exists, replacing the separate exists()/mkdir() pair.
os.makedirs('compressed_data', exist_ok=True)

transactions_df.to_pickle('compressed_data/transactions_train.pkl')
customers_df.to_pickle('compressed_data/customers.pkl')
articles_df.to_pickle('compressed_data/articles.pkl')

# Persist the id -> integer index dictionaries as JSON so the integer ids can
# be translated back to the original ids later.
os.makedirs('mappings', exist_ok=True)

with open('mappings/cust_id_to_idx.json', 'w') as f:
    json.dump(cust_id_to_idx, f)

with open('mappings/postal_code_to_idx.json', 'w') as f:
    json.dump(postal_code_to_idx, f)

print('\nDone!')


Done!
