In [1]:
#General imports
import pandas as pd
import numpy as np
import scipy
import re

import os, sys, gc, time, warnings, pickle, psutil, random

from sklearn import preprocessing, metrics
import gc

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

In [2]:
TARGET = 'sales'         # Our main target
END_TRAIN = 1913         # Last day in train set
MAIN_INDEX = ['id','d']  # We can identify item by these columns

In [4]:
train = pd.read_csv('sales_train_validation.csv')

In [4]:
train_ca = train[train['state_id']=='CA']#break up by state in order to compute faster
train_tx = train[train['state_id']=='TX']
train_wi = train[train['state_id']=='WI']

In [5]:
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
train_ca = pd.melt(train_ca, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)
train_tx = pd.melt(train_tx, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)
train_wi = pd.melt(train_wi, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)

Let's create some temporary df's for the purposes of engineering lag features.

In [6]:
'''Given our size of our data, it might be wise to record how long each function takes so we know where
our bottlenecks are.'''

from functools import wraps

def timer(func):
    """A decorator that prints how long a function took to run."""

    # Define the wrapper function to return.
    @wraps #preserve the metadata of our function.
    def wrapper(*args, **kwargs):
        # When wrapper() is called, get the current time.
        t_start = time.time()

        # Call the decorated function and store the result.
        result = func(*args, **kwargs)

        # Get the total time it took to run, and print it.
        t_total = time.time() - t_start

        print('{} took {}s'.format(func.__name__, t_total))        
        return result
    return wrapper

In [8]:
'''Since our data is already sorted by 'd' values, we can easily shift() values
as we as aggregate values.'''

lags = [col for col in range(15,36,2)]

start_time = time.time()
ca_shifts = train_ca[['id','d',TARGET]]
ca_shifts = ca_shifts.assign(**{
        '{}_lag_{}'.format(col, l): ca_shifts.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in lags
        for col in [TARGET]
    })

print('%0.2f min: Time for bulk shift' % ((time.time() - start_time) / 60))

2.62 min: Time for bulk shift


In [9]:
tx_shifts = train_tx[['id','d',TARGET]]
tx_shifts = tx_shifts.assign(**{
        '{}_lag_{}'.format(col, l): tx_shifts.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in lags
        for col in [TARGET]
    })

In [7]:
wi_shifts = train_wi[['id','d',TARGET]]
wi_shifts = wi_shifts.assign(**{
        '{}_lag_{}'.format(col, l): wi_shifts.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in lags
        for col in [TARGET]
    })

Now that we generated lag features, we can focus on creating smoothing/rolling features. Afterwards, we can focus on dealing with the NaNs created from these features.

In [10]:
'''Rolling averages with different time frames'''

ca_rolls = train_ca[['id','d','sales']]

for i in [14,30,60, 90,180]:
    print('Rolling period:', i)
    ca_rolls['rolling_mean_'+str(i)] = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    ca_rolls['rolling_std_'+str(i)]  = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())
    ca_rolls['rolling_max_'+str(i)]  = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).max())
    ca_rolls['rolling_min_'+str(i)]  = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).min())

Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 90
Rolling period: 180


In [6]:
'''Rolling averages with different time frames'''

tx_rolls = train_tx[['id','d','sales']]

for i in [14,30,60, 90,180]:
    print('Rolling period:', i)
    tx_rolls['rolling_mean_'+str(i)] = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    tx_rolls['rolling_std_'+str(i)]  = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())
    tx_rolls['rolling_max_'+str(i)]  = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).max())
    tx_rolls['rolling_min_'+str(i)]  = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).min())

Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 90
Rolling period: 180


In [6]:
'''Rolling averages with different time frames'''

wi_rolls = train_wi[['id','d','sales']]

for i in [14,30,60, 90,180]:
    print('Rolling period:', i)
    wi_rolls['rolling_mean_'+str(i)] = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    wi_rolls['rolling_std_'+str(i)]  = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())
    wi_rolls['rolling_max_'+str(i)]  = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).max())
    wi_rolls['rolling_min_'+str(i)]  = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).min())

Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 90
Rolling period: 180


Now that we've created lagged and smoothing features, to finish our feature engineering we can create features based on day - whether its a holiday, weekend, etc.

Later, we will label encode these.

In [2]:
cal = pd.read_csv('calendar.csv', parse_dates = ['date'])

In [5]:
event_list=[i for i in cal.event_name_1.fillna(0).unique() if i != 0] 

#Extract all the days an event has in the span of 1916 days
day_event_list=[cal[cal.event_name_1==i].d.tolist() for i in event_list]

#Create the Event_df dataframe which we will use throughout the notebook
event_df=pd.DataFrame({'Event Name' : event_list, 'Event day':day_event_list})
restricted_day= set(['d_'+ str(i) for i in np.arange(1916,1970)])
quantity=[]

for i in day_event_list:
    # Making sure that we exclude all the days thats are not in the training set
    clean_i=list(set(i)-restricted_day)
    temp=train[clean_i].sum().sum() #Adding columns and then rows
    quantity.append(temp)

event_df['Quantity']=quantity

all_events = event_df['Event day'].values
all_events = np.concatenate(all_events, axis=0)
all_events = all_events.astype(str)

In [6]:
all_events

array(['d_9', 'd_373', 'd_737', 'd_1101', 'd_1465', 'd_1836', 'd_17',
       'd_382', 'd_748', 'd_1113', 'd_1478', 'd_1843', 'd_24', 'd_388',
       'd_752', 'd_1116', 'd_1480', 'd_1844', 'd_40', 'd_390', 'd_747',
       'd_1132', 'd_1482', 'd_1839', 'd_47', 'd_397', 'd_754', 'd_1139',
       'd_1489', 'd_1846', 'd_48', 'd_414', 'd_779', 'd_1144', 'd_1509',
       'd_1875', 'd_51', 'd_405', 'd_758', 'd_1143', 'd_1497', 'd_1882',
       'd_86', 'd_443', 'd_828', 'd_1535', 'd_1920', 'd_88', 'd_442',
       'd_795', 'd_1180', 'd_1534', 'd_1919', 'd_97', 'd_463', 'd_1193',
       'd_1558', 'd_1924', 'd_100', 'd_471', 'd_835', 'd_1199', 'd_1563',
       'd_1927', 'd_122', 'd_486', 'd_850', 'd_1214', 'd_1578', 'd_1949',
       'd_123', 'd_501', 'd_860', 'd_1224', 'd_1588', 'd_1952', 'd_135',
       'd_510', 'd_874', 'd_1234', 'd_1600', 'd_1969', 'd_142', 'd_506',
       'd_870', 'd_1605', 'd_157', 'd_523', 'd_888', 'd_1253', 'd_1618',
       'd_185', 'd_539', 'd_893', 'd_1248', 'd_1602', 'd_

In [20]:
ca['event'] = ca['d'].apply(lambda x: 1 if x in all_events else 0)
tx['event'] = tx['d'].apply(lambda x: 1 if x in all_events else 0)
wi['event'] = wi['d'].apply(lambda x: 1 if x in all_events else 0)

In [11]:
ca.head()

Unnamed: 0,id,d,bollinger,rrg_bench,rrg_item,ATR,ann_vol,entropy,beta,info_ratio,triple_exp,relative_vol,RS,gap_size,gap_cat,gap_start,number_of_gaps,rolling_max_14,rolling_max_180,rolling_max_30,rolling_max_60,rolling_max_90,rolling_mean_14,rolling_mean_180,rolling_mean_30,rolling_mean_60,rolling_mean_90,rolling_min_14,rolling_min_180,rolling_min_30,rolling_min_60,rolling_min_90,rolling_std_14,rolling_std_180,rolling_std_30,rolling_std_60,rolling_std_90,sales_lag_15,sales_lag_17,sales_lag_19,sales_lag_21,sales_lag_23,sales_lag_25,sales_lag_27,sales_lag_29,sales_lag_31,sales_lag_33,sales_lag_35,event
1,HOBBIES_1_001_CA_1_validation,d_1201,0.0,5.0,5.0,3.0,18.234375,1.646484,0.0,0.010735,0.0,0.0,0.0,921.0,beginning,4.0,1.0,1.0,3.0,2.0,2.0,2.0,0.214233,0.427734,0.300049,0.350098,0.366699,0.0,0.0,0.0,0.0,0.0,0.425781,0.660156,0.535156,0.546875,0.589355,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
3,HOBBIES_1_002_CA_1_validation,d_1201,0.0,5.0,5.0,3.0,18.234375,1.111328,0.0,-0.001097,0.0,0.0,0.0,921.0,beginning,4.0,1.0,2.0,2.0,2.0,2.0,2.0,0.214233,0.166626,0.266602,0.199951,0.166626,0.0,0.0,0.0,0.0,0.0,0.579102,0.41626,0.520996,0.443359,0.403564,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
5,HOBBIES_1_003_CA_1_validation,d_1201,0.0,5.0,5.0,3.0,18.234375,1.176758,0.0,-0.000176,0.0,0.0,0.0,921.0,beginning,4.0,1.0,2.0,3.0,2.0,3.0,3.0,0.285645,0.194458,0.233276,0.350098,0.333252,0.0,0.0,0.0,0.0,0.0,0.611328,0.529785,0.503906,0.65918,0.618164,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
7,HOBBIES_1_004_CA_1_validation,d_1201,2.0,5.0,5.0,3.0,18.234375,2.8125,0.0,0.041687,2.0,0.0,0.0,921.0,beginning,4.0,1.0,14.0,14.0,14.0,14.0,14.0,2.5,2.339844,2.267578,2.349609,2.388672,0.0,0.0,0.0,0.0,0.0,3.414062,2.417969,2.449219,2.433594,2.607422,1.0,1.0,2.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,0.0,0
9,HOBBIES_1_005_CA_1_validation,d_1201,2.0,5.0,5.0,3.0,18.234375,2.099609,0.0,0.023849,2.0,0.0,0.0,921.0,beginning,4.0,1.0,3.0,5.0,3.0,3.0,3.0,0.643066,0.833496,0.799805,0.799805,0.799805,0.0,0.0,0.0,0.0,0.0,0.928711,1.027344,0.924805,0.935059,0.901855,2.0,0.0,1.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,1.0,0


In [21]:
#changing our categoricals to numeric as we prepare for modeling.
ca.loc[ca['gap_cat']=='beginning', 'gap_placement'] = 0
ca.loc[ca['gap_cat']=='middle', 'gap_placement'] = 1
ca.loc[ca['gap_cat']=='end', 'gap_placement'] = 2


tx.loc[tx['gap_cat']=='beginning', 'gap_placement'] = 0
tx.loc[tx['gap_cat']=='middle', 'gap_placement'] = 1
tx.loc[tx['gap_cat']=='end', 'gap_placement'] = 2

wi.loc[wi['gap_cat']=='beginning', 'gap_placement'] = 0
wi.loc[wi['gap_cat']=='middle', 'gap_placement'] = 1
wi.loc[wi['gap_cat']=='end', 'gap_placement'] = 2

In [22]:
ca.drop('gap_cat', axis=1, inplace=True)
tx.drop('gap_cat', axis=1, inplace=True)
wi.drop('gap_cat', axis=1, inplace=True)

In [24]:
ca.to_pickle('ca_final.pkl')
tx.to_pickle('tx_final.pkl')
wi.to_pickle('wi_final.pkl')

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif 'datetime' not in col_type.name:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
   
    return df



sell_prices_df = pd.read_csv('sell_prices.csv')
sell_prices_df = reduce_mem_usage(sell_prices_df)
print('Sell prices has {} rows and {} columns'.format(sell_prices_df.shape[0], sell_prices_df.shape[1]))

calendar_df = pd.read_csv('calendar.csv')
calendar_df = reduce_mem_usage(calendar_df)
print('Calendar has {} rows and {} columns'.format(calendar_df.shape[0], calendar_df.shape[1]))

sales_train_validation_df = pd.read_csv('sales_train_validation.csv')
sales_train_validation_df = reduce_mem_usage(sales_train_validation_df)
print('Sales train validation has {} rows and {} columns'.format(sales_train_validation_df.shape[0], sales_train_validation_df.shape[1]))

submission_df = pd.read_csv('sample_submission.csv')

Memory usage of dataframe is 208.77 MB
Memory usage after optimization is: 45.77 MB
Decreased by 78.1%
Sell prices has 6841121 rows and 4 columns
Memory usage of dataframe is 0.21 MB
Memory usage after optimization is: 0.22 MB
Decreased by -5.7%
Calendar has 1969 rows and 14 columns
Memory usage of dataframe is 446.40 MB
Memory usage after optimization is: 95.42 MB
Decreased by 78.6%
Sales train validation has 30490 rows and 1919 columns


In [3]:
NUM_ITEMS = sales_train_validation_df.shape[0]  # 30490
DAYS_PRED = 28
nrows = 365 * 2 * NUM_ITEMS

gc.collect()

20

In [4]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical(df, cols):
    for col in cols:
        # Leave NaN as it is.
        le = LabelEncoder()
        not_null = df[col][df[col].notnull()]
        df[col] = pd.Series(le.fit_transform(not_null), index=not_null.index)

    return df


calendar_df = encode_categorical(calendar_df, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]).pipe(reduce_mem_usage)
sales_train_validation_df = encode_categorical(sales_train_validation_df, ["item_id", "dept_id", "cat_id", "store_id", "state_id"]).pipe(reduce_mem_usage)
sell_prices_df = encode_categorical(sell_prices_df, ["item_id", "store_id"]).pipe(reduce_mem_usage)    
    
gc.collect()

Memory usage of dataframe is 0.27 MB
Memory usage after optimization is: 0.23 MB
Decreased by 16.5%
Memory usage of dataframe is 95.72 MB
Memory usage after optimization is: 95.32 MB
Decreased by 0.4%
Memory usage of dataframe is 78.29 MB
Memory usage after optimization is: 45.67 MB
Decreased by 41.7%


77

In [5]:
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    
    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)
    
    sales_train_validation = sales_train_validation.iloc[-nrows:,:]  
    
    # seperate test dataframes
    test1_rows = [row for row in submission['id'] if 'validation' in row]
    test2_rows = [row for row in submission['id'] if 'evaluation' in row]
    test1 = submission[submission['id'].isin(test1_rows)]
    test2 = submission[submission['id'].isin(test2_rows)]
    
    # change column names
    test1.columns = ['id', 'd_1914', 'd_1915', 'd_1916', 'd_1917', 'd_1918', 'd_1919', 'd_1920', 'd_1921', 'd_1922', 'd_1923', 'd_1924', 'd_1925', 'd_1926', 'd_1927', 'd_1928', 'd_1929', 'd_1930', 'd_1931', 
                      'd_1932', 'd_1933', 'd_1934', 'd_1935', 'd_1936', 'd_1937', 'd_1938', 'd_1939', 'd_1940', 'd_1941']
    test2.columns = ['id', 'd_1942', 'd_1943', 'd_1944', 'd_1945', 'd_1946', 'd_1947', 'd_1948', 'd_1949', 'd_1950', 'd_1951', 'd_1952', 'd_1953', 'd_1954', 'd_1955', 'd_1956', 'd_1957', 'd_1958', 'd_1959', 
                      'd_1960', 'd_1961', 'd_1962', 'd_1963', 'd_1964', 'd_1965', 'd_1966', 'd_1967', 'd_1968', 'd_1969']
    
    
    # get product table
    product = sales_train_validation[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()
    
    # merge with product table
    test2['id'] = test2['id'].str.replace('_evaluation','_validation')
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2 = test2.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation','_evaluation')
    
    # 
    test1 = pd.melt(test1, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')
    
    
    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'
    
    data = pd.concat([sales_train_validation, test1, test2], axis = 0)
    
    del sales_train_validation, test1, test2
    
    print(data.shape)
    
    # get only a sample for fst training
#     data = data.loc[nrows:]
    
    # drop some calendar features
    calendar.drop(['weekday', 'wday', 'month', 'year'], inplace = True, axis = 1)
    
    # delete test2 for now
    data = data[data['part'] != 'test2']
    
    if merge:
        # notebook crash with the entire dataset (maybee use tensorflow, dask, pyspark xD)
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data.drop(['day'], inplace = True, axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
    else: 
        pass
    
    gc.collect()
    
    return data 

In [6]:
nrows = 27500000
data = melt_and_merge(calendar_df, sell_prices_df, sales_train_validation_df, submission_df, nrows = nrows, merge = True)    
data = data[5800861:]

Melted sales train validation has 58327370 rows and 8 columns
Memory usage of dataframe is 1002.74 MB
Memory usage after optimization is: 669.08 MB
Decreased by 33.3%
(29207440, 9)
Our final dataset to train has 28353720 rows and 19 columns


In [7]:
def transform(data):
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        data[feature].fillna('unknown', inplace = True)
        
    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
    return data

def simple_fe(data):
    
    # rolling demand features
    
    for val in [28, 29, 30]:
        data[f"shift_t{val}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(val))
    for val in [7, 30, 60, 90, 180]:
        data[f"rolling_std_t{val}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(28).rolling(val).std())
    for val in [7, 30, 60, 90, 180]:
        data[f"rolling_mean_t{val}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(28).rolling(val).mean())

    data["rolling_skew_t30"] = data.groupby(["id"])["demand"].transform( lambda x: x.shift(28).rolling(30).skew())
    data["rolling_kurt_t30"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(28).rolling(30).kurt())
    
        
    # time features
    data['date'] = pd.to_datetime(data['date'])
    attrs = ["year", "quarter", "month", "week", "day", "dayofweek", "is_year_end", "is_year_start", "is_quarter_end", \
        "is_quarter_start", "is_month_end","is_month_start",
    ]

    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        data[attr] = getattr(data['date'].dt, attr).astype(dtype)
    data["is_weekend"] = data["dayofweek"].isin([5, 6]).astype(np.int8)
    
    gc.collect()
    
    return data    

In [8]:
data = simple_fe(data)

In [11]:
cols = ['id', 'd', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'demand',
       'part', 'date', 'wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'rolling_skew_t30', 'rolling_kurt_t30', 'year', 'quarter',
       'month', 'week', 'dayofweek', 'is_year_end', 'is_year_start',
       'is_quarter_end', 'is_quarter_start', 'is_month_end', 'is_month_start',
       'is_weekend']

data = data[cols]

In [2]:
train = pd.read_pickle('final.pkl')

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif 'datetime' not in col_type.name:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
   
    return df

train = reduce_mem_usage(train)

Memory usage of dataframe is 3947.36 MB
Memory usage after optimization is: 3199.88 MB
Decreased by 18.9%


In [4]:
train['rolling_skew_t30'].fillna(0, inplace=True)
train['rolling_kurt_t30'].fillna(0, inplace=True)

In [5]:
train.to_pickle('train.pkl')