In [1]:
import numpy as np 
import pandas as pd 

import os

# Walk the Kaggle input directory and print the full path of every file found,
# so the available input files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Directory the later cells read their CSV files from.
# Note: the value already ends with "/", and later cells append another "/"
# when building file paths (the resulting "//" is harmless on POSIX paths).
PATH = "/kaggle/input/m5-forecasting-accuracy/"
print(os.listdir(PATH))

/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
['sell_prices.csv', 'sample_submission.csv', 'calendar.csv', 'sales_train_validation.csv']


In [2]:
import os
import gc
import warnings
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
# Silence every warning for the rest of the session (this also hides pandas
# and scikit-learn deprecation notices).
warnings.filterwarnings("ignore")
# Let pandas display up to 100 columns and 100 rows before truncating output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# Register pandas' date converters with matplotlib, then apply seaborn's default theme.
register_matplotlib_converters()
sns.set()

In [3]:
# Report the number of lines (header line included) in each input file.
# Lines are counted by streaming over the file handle instead of calling
# readlines(), so a multi-million-line CSV is never held in memory at once.
# os.path.join avoids the doubled "/" that plain concatenation produces when
# PATH already ends with a separator.
for f in os.listdir(PATH):
    with open(os.path.join(PATH, f)) as file:
        print("File Name {} => Rows {}".format(f, sum(1 for _line in file)))

File Name sell_prices.csv => Rows 6841122
File Name sample_submission.csv => Rows 60981
File Name calendar.csv => Rows 1970
File Name sales_train_validation.csv => Rows 30491


In [4]:
def downcast_data(df):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    Columns selected by ``select_dtypes(include=['float'])`` are passed through
    ``pd.to_numeric(..., downcast='float')`` and columns selected by
    ``select_dtypes(include=['int'])`` through ``downcast='integer'``.
    All other columns are left untouched.

    The frame is modified in place and the same object is returned, which
    makes the function usable with ``DataFrame.pipe``.
    """
    # Resolve both column groups up front, before any column is rewritten.
    plan = {
        "float": df.select_dtypes(include=['float']).columns,
        "integer": df.select_dtypes(include=['int']).columns,
    }

    # Floats are processed first, then integers.
    for target, names in plan.items():
        for name in names:
            df[name] = pd.to_numeric(df[name], downcast=target)

    return df

In [5]:

def loading_and_downcasting():
    """Read the four input CSV files from ``PATH`` and downcast each frame.

    Every file is read with ``pd.read_csv`` and passed through
    ``downcast_data``; once all four are loaded, their shapes are printed in
    the order calendar, sell prices, sales, submission.

    Returns:
        tuple: ``(calendar, sell_price, sales, submission)`` DataFrames.
    """
    file_names = (
        "calendar.csv",
        "sell_prices.csv",
        "sales_train_validation.csv",
        "sample_submission.csv",
    )

    # Load everything first; shapes are reported only after all reads succeed.
    frames = [downcast_data(pd.read_csv(f"{PATH}/{name}")) for name in file_names]
    for frame in frames:
        print(frame.shape)

    calendar, sell_price, sales, submission = frames
    return calendar, sell_price, sales, submission

In [6]:
# Load the four input CSVs as downcast DataFrames; the helper prints each frame's shape.
calendar, sell_price, sales, submission = loading_and_downcasting()

(1969, 14)
(6841121, 4)
(30490, 1919)
(60980, 29)


In [7]:
def label_encoder(df,columns):
    """Replace the values of the given columns with integer labels.

    For each column a fresh ``LabelEncoder`` is fitted on the non-null entries
    only. The encoded values are written back aligned on the original index,
    so rows that were null stay null after encoding.

    The frame is modified in place and also returned.
    """
    for name in columns:
        # Keep only the rows that actually hold a value for this column.
        present = df[name].notnull()
        known = df[name][present]

        codes = LabelEncoder().fit_transform(known)

        # Index alignment leaves the previously-null rows as NaN.
        df[name] = pd.Series(codes, index=known.index)

    return df

In [8]:
# Integer-encode the event columns of the calendar, then downcast again.
# label_encoder skips null entries, so rows without a value stay null.
calendar = label_encoder(
    calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
).pipe(downcast_data)

# Integer-encode the categorical id columns of the sales table ("id" itself is left as is).
sales = label_encoder(
    sales, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
).pipe(downcast_data)

# NOTE(review): item_id / store_id are encoded here with encoders fitted
# independently of the ones used for `sales`; the integer codes only agree
# between the two frames if both contain the same set of ids -- confirm
# before joining on these columns.
sell_price = label_encoder(sell_price, ["item_id", "store_id"]).pipe(downcast_data)

In [9]:
def extract_num(ser):
    """Pull the first run of digits out of each string and cast it to int16.

    ``Series.str.extract`` is used with its default ``expand`` setting, so the
    result is a one-column DataFrame (column label ``0``), not a Series.
    """
    digits = ser.str.extract(r"(\d+)")
    return digits.astype(np.int16)

In [11]:
# Identifier columns that stay fixed while the per-day columns are unpivoted.
id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
    
# Build the forecast-horizon rows from the submission template: keep the ids
# ending in "validation", rename the 28 value columns to d_1914 .. d_1941,
# attach the id columns from `sales`, and unpivot to one row per (id, day).
temp = sales[id_columns]
validation = submission[submission["id"].str.endswith("validation")]
validation.columns = ["id"] + [f"d_{x}" for x in range(1914,1914+28)]
validation = validation.merge(temp,how="left",on="id")
validation = validation.melt(id_vars = id_columns, var_name='d',value_name= 'demand')

# Unpivot the wide sales table the same way: every non-id column becomes a
# row with its column label in `d` and its value in `demand`.
sales = sales.melt(id_vars=id_columns,var_name='d',value_name='demand')

# Stack the history rows and the horizon rows; both frames keep their own
# row labels, so the combined index is not unique.
data = pd.concat([sales, validation],axis = 0)


# Replace the day labels with the integer they contain (int16).
data["d"] = extract_num(data["d"])
# The long-format sales frame is now contained in `data`; release it.
del sales
gc.collect()

12

In [12]:
def merge_calendar(calendar,data):
    """Left-join the calendar onto ``data`` using the ``d`` column.

    The ``weekday``, ``wday``, ``month`` and ``year`` calendar columns are
    left out of the join. They are removed from a copy, so the caller's
    ``calendar`` frame is not modified and the function can be called more
    than once with the same frame; columns that are already absent are
    ignored instead of raising ``KeyError``.

    Args:
        calendar: DataFrame with a ``d`` column plus calendar attributes.
        data: DataFrame with a ``d`` column to enrich.

    Returns:
        A new DataFrame with one row per row of ``data`` (for a calendar with
        unique ``d`` values) and the remaining calendar columns appended.
    """
    redundant = ["weekday", "wday", "month", "year"]
    # Non-mutating drop: the original version used inplace=True, which
    # silently altered the caller's frame and broke on a second call.
    trimmed = calendar.drop(columns=redundant, errors="ignore")
    return data.merge(trimmed,how="left",on='d')

def merge_sell_price(sell_price,data):
    """Left-join ``sell_price`` onto ``data``.

    Rows are matched on ``store_id``, ``item_id`` and ``wm_yr_wk``; rows of
    ``data`` without a match keep their place and get nulls in the price
    columns.
    """
    join_keys = ["store_id", "item_id", "wm_yr_wk"]
    return pd.merge(data, sell_price, how="left", on=join_keys)

In [13]:
# Convert the calendar's day labels to integers so they match data["d"],
# then join the calendar columns onto the long-format data.
calendar["d"] = extract_num(calendar["d"])
data =  merge_calendar(calendar,data)
# The calendar frame is not used again after the merge; release it.
del calendar
gc.collect()

20

In [14]:
# Downcast again: the merge may have brought in columns with wider dtypes.
data = downcast_data(data)

In [15]:
# Remove the four event columns from `data` in place; they are not used as
# features below (this also discards the label encoding applied to them earlier).
data.drop(['event_name_1','event_type_1','event_name_2','event_type_2'],axis=1,inplace=True)

In [16]:
# Sanity check: count the null entries per column of the merged frame.
data.isnull().sum()

id          0
item_id     0
dept_id     0
cat_id      0
store_id    0
state_id    0
d           0
demand      0
date        0
wm_yr_wk    0
snap_CA     0
snap_TX     0
snap_WI     0
dtype: int64

# Fitting RandomForest On Whole Data

## Functions for Preparing data

In [27]:
def timeseries_to_supervised(data,n_in=28,n_out=28,dropnan=True,input_list=list(),predict_list=list()):
    """Reframe a time-ordered frame as a supervised-learning table.

    The result holds, side by side:

    * every original column of ``data``;
    * the ``input_list`` columns shifted by ``n_in, n_in-1, ..., 1`` rows,
      named ``var<k> (t-<lag>)``;
    * the ``predict_list`` columns shifted by ``0, -1, ..., -n_out`` rows,
      named ``var<k> (t)`` and ``var<k> (t+<step>)`` -- that is ``n_out + 1``
      groups of target columns.

    Args:
        data: DataFrame whose rows are in time order.
        n_in: number of lagged copies of the input columns.
        n_out: number of future steps; step 0 is included as well.
        dropnan: when true, rows containing any null value are removed.
        input_list: columns of ``data`` to lag.
        predict_list: columns of ``data`` to lead.

    Returns:
        DataFrame with the original, lagged and lead columns.
    """
    lag_source = data[input_list]
    target_source = data[predict_list]
    n_lag_vars = lag_source.shape[1]
    n_target_vars = target_source.shape[1]

    shifted_frames = []
    labels = list(data.columns)

    # Lagged inputs, oldest lag first.
    for lag in reversed(range(1, n_in + 1)):
        shifted_frames.append(lag_source.shift(lag))
        for k in range(n_lag_vars):
            labels.append("var%d (t-%d)" %(k+1,lag))

    # Targets for the current step and each future step.
    for step in range(n_out + 1):
        shifted_frames.append(target_source.shift(-step))
        for k in range(n_target_vars):
            if step == 0:
                labels.append("var%d (t)" % (k+1))
            else:
                labels.append("var%d (t+%d)" % (k+1,step))

    # Original columns first, followed by every shifted block.
    shifted = pd.concat(shifted_frames, axis=1)
    df = pd.concat([data, shifted], axis=1)
    df.columns = labels

    # Shifting introduces nulls at both ends; optionally discard those rows.
    if dropnan:
        df = df.dropna()

    return df

def forecast_acc(test, predictions,n_lag=28):
    """Sum the per-step RMSE over the first ``n_lag`` forecast steps.

    Args:
        test: DataFrame of actual values, one column per forecast step.
        predictions: 2-D array of predicted values with matching layout.
        n_lag: number of leading columns (forecast steps) to score.

    Returns:
        The sum (not the mean) of the column-wise root mean squared errors.
    """
    actuals = test.values
    return sum(
        np.sqrt(mean_squared_error(actuals[:, step], predictions[:, step]))
        for step in range(n_lag)
    )

def train_test_split(data,train_ratio,ignore_list=None,n_out=28):
    """Split a supervised frame chronologically into train/test inputs and targets.

    The distinct values of the ``d`` column are sorted and the first
    ``int(train_ratio * number_of_days)`` of them form the training period;
    all later days form the test period. Every row with a non-null ``d``
    lands in exactly one of the two parts.

    Compared with the previous implementation this fixes two problems:
    the boundary day used to be excluded from both parts (``<`` / ``>``), and
    the cutoff was a *count* of days compared against day *values*, which is
    only correct when ``d`` starts at 1 without gaps.

    Args:
        data: DataFrame containing a ``d`` column and, as its last ``n_out``
            columns (after ``ignore_list`` is removed), the target columns.
        train_ratio: fraction of the distinct days assigned to training.
        ignore_list: columns to drop from both parts before splitting into
            inputs and targets. ``None`` means "drop nothing".
        n_out: number of trailing columns treated as targets.

    Returns:
        tuple: ``(train_X, train_y, test_X, test_y)``.

    Raises:
        ValueError: if ``n_out`` is smaller than 1.
    """
    if n_out < 1:
        # A slice of ``-0`` would silently put every column into the targets.
        raise ValueError("n_out must be at least 1")
    ignore_list = [] if ignore_list is None else list(ignore_list)

    day = data["d"]
    unique_days = np.sort(day.dropna().unique())
    n_days = len(unique_days)
    # Clamp so that ratios outside [0, 1] cannot index out of range.
    n_train_days = min(max(int(train_ratio * n_days), 0), n_days)

    if n_train_days < n_days:
        # First day that belongs to the test period.
        cutoff = unique_days[n_train_days]
        train, test = data[day < cutoff], data[day >= cutoff]
    else:
        # Everything is training data; keep an empty frame with the same columns.
        train, test = data[day.notna()], data.iloc[0:0]

    # Drop the columns that must not be used as features or targets.
    train = train.drop(ignore_list,axis=1)
    test = test.drop(ignore_list,axis=1)

    # The trailing n_out columns are the targets, everything before is input.
    train_X, train_y = train.iloc[:,:-n_out], train.iloc[:,-n_out:]
    test_X, test_y = test.iloc[:,:-n_out], test.iloc[:,-n_out:]

    return train_X, train_y, test_X, test_y

## Fitting data on each item

As the available RAM is limited, we fit a separate model on each item and store<br/>
the predictions for each item.

In [34]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
# Fit one random forest per series id and collect its multi-step predictions
# for the held-out (most recent) part of that series.
RF_SEED = 42  # fixed seed so that repeated runs build identical forests

items = data["id"].unique()
predictions = list()
for it, i in enumerate(items, start=1):
    print("iteration {}".format(it))

    # Rows of this series only, reframed as lagged inputs plus future targets.
    temp = data[data["id"]==i]
    temp = timeseries_to_supervised(temp,input_list=["demand"],predict_list=["demand"])
    train_X, train_y, test_X, test_y = train_test_split(temp,0.8,["id","d","demand","date","wm_yr_wk"])

    # The supervised frame is no longer needed once it has been split.
    del temp
    gc.collect()

    # n_jobs=-1 builds the 1000 trees on all available cores; random_state
    # makes the fitted model (and therefore the predictions) reproducible.
    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=RF_SEED)
    model.fit(train_X,train_y)
    prediction = model.predict(test_X)
    predictions.append(prediction)

    # Release the per-item frames before moving on to the next series.
    del train_X,train_y,test_X,test_y
    gc.collect()

iteration 1
iteration 1


KeyboardInterrupt: 