In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error


In [89]:
train = pd.read_csv("./Dataset/train.csv", parse_dates=['date'])
stores = pd.read_csv('./Dataset/stores.csv')
transactions = pd.read_csv('./Dataset/transactions.csv', parse_dates=['date'])
oil = pd.read_csv('./Dataset/oil.csv', parse_dates=['date'])
events = pd.read_csv('./Dataset/holidays_events.csv', parse_dates=['date'])

transfer_dates = events.loc[events['type'] == 'Transfer', 'date']
holiday_dates  = events.loc[
    (events['type'] == 'Holiday') & (~events['transferred']),
    'date'
]
real_holidays = pd.concat([transfer_dates, holiday_dates]).drop_duplicates()


In [90]:
df = train.merge(stores, on = 'store_nbr', how = 'left').merge(transactions, on = ['store_nbr', 'date'], how = 'left') \
.merge(oil, on ='date', how = 'left').merge(events, on = 'date', how='left')

In [91]:

'''df = df.set_index('date').sort_index()
df['dcoilwtico'] = df['dcoilwtico'].interpolate(method='time')

monthly_means = df.groupby(df.index.month)['dcoilwtico'].transform('mean')
df['dcoilwtico'] = df['dcoilwtico'].fillna(monthly_means)'''

df['dcoilwtico'] = df['dcoilwtico'].ffill().bfill()



## Feature engineering

In [92]:
def create_features(df):
    df['dayofweek'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['weekends'] = df['dayofweek'].isin([5,6]).astype(int)

    df['promo']  = df['onpromotion'].astype(int)
    df['family']= df['family'].astype('category').cat.codes

    df['is_holiday'] = df['date'].isin(real_holidays).astype(int)
    df['is_bridge_day'] = (df[type] == 'Bridge').astype(int)
    df['is_work_day'] = (df['type'] == 'Work Day').astype(int)
    df['is_additional_holiday'] = (df['type'] == 'Additional').astype(int)

    df['day'] = df['date'].dt.day
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['is_payday'] = ((df['day'] == 15) | (df['is_month_end'] == 1)).astype(int)

    quake_start = pd.to_datetime('2014-06-16')
    quake_end = quake_start + pd.Timedelta(days=30)
    df['post_earthquake'] = (df['date'] >= quake_start & df['date'] <= quake_end).astype(int)

    df['transactions'] = df['transactions'].fillna(0)
    df['oil_missing'] = df['dcoilwtico'].isna().astype(int)
    df['is_holiday'] = df['date'].isin(real_holidays).astype(int)

    return df


    

## EDA

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 13 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            int64         
 1   date          datetime64[ns]
 2   store_nbr     int64         
 3   family        object        
 4   sales         float64       
 5   onpromotion   int64         
 6   city          object        
 7   state         object        
 8   type          object        
 9   cluster       int64         
 10  transactions  float64       
 11  dcoilwtico    float64       
 12  oil_missing   int64         
dtypes: datetime64[ns](1), float64(3), int64(5), object(4)
memory usage: 297.6+ MB


In [73]:
df.shape

(3000888, 13)

In [74]:
df.head(50)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,transactions,dcoilwtico,oil_missing
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
5,5,2013-01-01,1,BREAD/BAKERY,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
6,6,2013-01-01,1,CELEBRATION,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
7,7,2013-01-01,1,CLEANING,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
8,8,2013-01-01,1,DAIRY,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1
9,9,2013-01-01,1,DELI,0.0,0,Quito,Pichincha,D,13,0.0,93.14,1


In [75]:
df.iloc[:,3:].describe()

Unnamed: 0,sales,onpromotion,cluster,transactions,dcoilwtico,oil_missing
count,3000888.0,3000888.0,3000888.0,3000888.0,3000888.0,3000888.0
mean,357.7757,2.60277,8.481481,1555.808,67.9249,0.3093824
std,1101.998,12.21888,4.649735,1033.367,25.66913,0.4622391
min,0.0,0.0,1.0,0.0,26.19,0.0
25%,0.0,0.0,4.0,930.0,46.3775,0.0
50%,11.0,0.0,8.5,1331.0,53.41,0.0
75%,195.8473,0.0,13.0,1976.25,95.72,1.0
max,124717.0,741.0,17.0,8359.0,110.62,1.0


In [None]:
df['dcoilwtico'].mean()

np.float64(67.88419301164721)

In [59]:
first_valid = df['dcoilwtico'].first_valid_index()
print(first_valid)

first_valid = df['transactions'].first_valid_index()
print(first_valid)

mask = df['dcoilwtico'].isna()

first_invalid = mask.idxmax() if mask.any() else None

print("First null at:", first_invalid)




2013-01-01 00:00:00
2013-01-01 00:00:00
First null at: None


In [None]:
transfer_dates = events.loc[events['type'] == 'Transfer', 'date']
holiday_dates  = events.loc[
    (events['type'] == 'Holiday') & (~events['transferred']),
    'date'
]
real_holidays = pd.concat([transfer_dates, holiday_dates]).drop_duplicates()
