In [88]:
import os
import pandas as pd
import glob
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import tqdm
pd.set_option('max_columns', None)

In [2]:
input_dir = os.path.join('..', 'input')
file_csv = [file for file in os.listdir(input_dir)\
            if file.endswith('.csv')]
print(file_csv)

['historical_transactions.csv', 'merchants.csv', 'new_merchant_transactions.csv', 'sample_submission.csv', 'train.csv', 'test.csv']


In [3]:
## Pandas@reduce dataframe memory
def reduce_mem_usage(props, fillna=False, fillval=None):
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ", col)
            print("dtype before: ", props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                if not fillna:
                    print('[*]skip {} cause of NA value'.format(col))
                    continue
                NAlist.append(col)
                props[col].fillna(fillval,inplace=True)  
            
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < pow(2, 8)-1:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < pow(2, 16)-1:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < pow(2, 32)-1:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [4]:
trans_df = pd.read_csv(os.path.join(input_dir, 'historical_transactions.csv'))
new_trans_df = pd.read_csv(os.path.join(input_dir, 'new_merchant_transactions.csv'))

trans_df, _ = reduce_mem_usage(trans_df, fillna=False, fillval=-999)
new_trans_df, _ = reduce_mem_usage(new_trans_df, fillna=False, fillval=-999)

Memory usage of properties dataframe is : 3109.5357055664062  MB
******************************
Column:  city_id
dtype before:  int64
dtype after:  int16
******************************
******************************
Column:  installments
dtype before:  int64
dtype after:  int16
******************************
******************************
Column:  merchant_category_id
dtype before:  int64
dtype after:  int16
******************************
******************************
Column:  month_lag
dtype before:  int64
dtype after:  int8
******************************
******************************
Column:  purchase_amount
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  category_2
dtype before:  float64
[*]skip category_2 cause of NA value
******************************
Column:  state_id
dtype before:  int64
dtype after:  int8
******************************
******************************
Column:  subsector_id
dtype before:  int64

## Subject
<br/>
Let's calculate 'basement purchasement date' used for month_lag and purchase_date features<br/>
basement_date = purchase_date - month_lag<br/>
<br/>

In [5]:
def add_year_month(df):
    df['purchase_year'] = df['purchase_date'].apply(lambda x: x[0:4]).astype(int)
    df['purchase_month'] = df['purchase_date'].apply(lambda x: x[5:7]).astype(int)

In [6]:
def change_lag_type(df):
    df['month_lag'] = df['month_lag'].astype(int)

In [7]:
add_year_month(trans_df)
add_year_month(new_trans_df)
print('done')

done


In [8]:
change_lag_type(trans_df)
change_lag_type(new_trans_df)
print('done')

done


In [36]:
def get_basement_date(df):
    df['base_purchase_month'] = df['purchase_month'] - df['month_lag']
    df['base_purchase_year'] = df['purchase_year']
    over_month_idx = df[df['base_purchase_month'] > 12].index.values
    df.loc[over_month_idx,'base_purchase_month'] = df.loc[over_month_idx, 'base_purchase_month'] - 12
    df.loc[over_month_idx,'base_purchase_year'] = df.loc[over_month_idx, 'purchase_year'] + 1
    under_month_idx = df[df['base_purchase_month'] < 1].index.values
    df.loc[under_month_idx,'base_purchase_month'] = df.loc[under_month_idx, 'base_purchase_month'] + 12
    df.loc[under_month_idx,'base_purchase_year'] = df.loc[under_month_idx, 'purchase_year'] - 1

In [37]:
get_basement_date(trans_df)
get_basement_date(new_trans_df)
print('done')

done


In [94]:
common_id = list(set.intersection(set(trans_df.card_id), set(new_trans_df.card_id)))
print('[*]len(common_id) : ', len(common_id))

[*]len(common_id) :  290001


In [92]:
trans_df[trans_df.card_id == common_id[0]][['base_purchase_year', 'base_purchase_month']].drop_duplicates()

Unnamed: 0,base_purchase_year,base_purchase_month
25317594,2017,10


In [93]:
new_trans_df[new_trans_df.card_id == common_id[0]][['base_purchase_year', 'base_purchase_month']].drop_duplicates()

Unnamed: 0,base_purchase_year,base_purchase_month
160969,2017,10


<br/>
I think base date is same between historical_transactions and new_merchant_transactions<br/>
<br/>