In [None]:
!pip install jovian opendatasets xgboost graphviz lightgbm scikit-learn xgboost lightgbm --upgrade --quiet

In [None]:
import os 
import pandas as pd
import numpy as np
import sklearn
import opendatasets as od
# Raise pandas' display limits so frames with up to 120 columns / 120 rows
# are rendered in full instead of being truncated.
pd.set_option("display.max_columns", 120)
pd.set_option("display.max_rows", 120)

In [None]:
# Fetch the Rossmann Store Sales competition data from Kaggle.
# NOTE(review): presumably this prompts for Kaggle credentials on first use — confirm.
od.download('https://www.kaggle.com/c/rossmann-store-sales')

In [None]:
# List the contents of the 'rossmann-store-sales' directory that the cells below read from.
os.listdir('rossmann-store-sales')

In [None]:
# Load the four competition CSV files from the download directory.
# low_memory=False makes pandas read train.csv in one pass instead of in chunks.
ross_df = pd.read_csv('./rossmann-store-sales/train.csv', low_memory=False)
store_df = pd.read_csv('./rossmann-store-sales/store.csv')
test_df = pd.read_csv('./rossmann-store-sales/test.csv')
submission_df = pd.read_csv('./rossmann-store-sales/sample_submission.csv')

In [None]:
# Preview the frame loaded from train.csv.
ross_df

In [None]:
# Preview the frame loaded from test.csv.
test_df

In [None]:
# Preview the frame loaded from store.csv.
store_df

In [None]:
# Attach the per-store columns from store_df to every train/test row.
# A left join on 'Store' keeps every row of the left-hand frame.
merged_df = pd.merge(ross_df, store_df, on='Store', how='left')
merged_test_df = pd.merge(test_df, store_df, on='Store', how='left')

In [None]:
# Preview the merged training frame.
merged_df

In [None]:
# Preview the merged test frame.
merged_test_df

In [None]:
# Column dtypes and non-null counts of the merged training frame.
merged_df.info()

In [None]:
# Number of missing values per column.
merged_df.isna().sum()

In [None]:
def split_date(df):
    """Parse the ``Date`` column and add calendar-part columns, in place.

    ``df['Date']`` is converted with ``pd.to_datetime`` and the columns
    ``Year``, ``Month``, ``Day`` and ``WeekOfYear`` (ISO calendar week) are
    added to ``df``. The frame is modified in place; nothing is returned.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    date_parts = df['Date'].dt
    df['Year'] = date_parts.year
    df['Month'] = date_parts.month
    df['Day'] = date_parts.day
    df['WeekOfYear'] = date_parts.isocalendar().week
    

In [None]:
# Add the date-part columns to both frames (split_date modifies them in place).
split_date(merged_df)
split_date(merged_test_df)

In [None]:
# Preview the training frame with the new date-part columns.
merged_df

In [None]:
# Distribution of Sales on rows where the store was closed (Open == 0).
merged_df[merged_df.Open == 0].Sales.value_counts()

In [None]:
# Keep only rows for open stores; .copy() gives an independent frame so the
# column assignments made by later cells do not act on a slice of the original.
# Note: this rebinds merged_df, so the closed-store rows are no longer available afterwards.
merged_df = merged_df[merged_df.Open == 1].copy()

In [None]:
# Size of the training frame after dropping closed-store rows.
merged_df.shape

In [None]:
def comp_duration(df):
    """Add ``Comp_months``: months since the nearest competitor opened, in place.

    The duration is the difference between the row's date (``Year``/``Month``)
    and the competitor's opening date (``CompetitionOpenSinceYear`` /
    ``CompetitionOpenSinceMonth``), expressed in months:

        12 * (Year - CompetitionOpenSinceYear) + (Month - CompetitionOpenSinceMonth)

    Negative durations (competitor not yet open) and missing opening dates
    are both mapped to 0. The frame is modified in place; nothing is returned.
    """
    months_open = (
        12 * (df['Year'] - df['CompetitionOpenSinceYear'])
        # Month *difference*: the opening month must be subtracted from the
        # row's month, not added on its own.
        + (df['Month'] - df['CompetitionOpenSinceMonth'])
    )
    # clip() leaves NaN untouched, so missing opening dates are zeroed by fillna().
    df['Comp_months'] = months_open.clip(lower=0).fillna(0)

In [None]:
# Add the Comp_months column to both frames (comp_duration modifies them in place).
comp_duration(merged_df)
comp_duration(merged_test_df)

In [None]:
# Preview the training frame with the new Comp_months column.
merged_df

In [None]:
# Month number -> abbreviation as matched against the entries of PromoInterval.
# Built once at definition time instead of once per row.
_PROMO_MONTH_ABBREVIATIONS = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec',
}


def check_promo_month(row):
    """Return 1 if the row's month is one of its Promo2 interval months, else 0.

    Parameters
    ----------
    row : mapping-like (``pandas.Series`` or ``dict``)
        Read via ``row.get`` for the keys ``'PromoInterval'`` (comma-separated
        month abbreviations), ``'Promo2Open'`` and ``'Month'`` (1-12).

    Returns
    -------
    int
        1 when ``PromoInterval`` is a string containing the abbreviation of
        ``Month`` and ``Promo2Open`` is truthy; 0 otherwise, including when
        ``PromoInterval`` is missing/NaN or ``Month`` is not in 1-12.
    """
    interval = row.get('PromoInterval')
    # A missing PromoInterval arrives as NaN (a float), not as a string.
    if not isinstance(interval, str):
        return 0

    month_name = _PROMO_MONTH_ABBREVIATIONS.get(row.get('Month'))
    if month_name not in interval.split(','):
        return 0

    try:
        return 1 if row.get('Promo2Open') else 0
    except TypeError:
        # pd.NA has no truth value; treat it as "promotion not running".
        return 0

def promo_cols(df):
    """Add the Promo2-derived columns ``Promo2Open`` and ``IsPromo2Month``, in place.

    ``Promo2Open`` is the number of months since the store's Promo2 start
    (``Promo2SinceYear`` / ``Promo2SinceWeek``), with missing values set to 0,
    negatives raised to 0, and the result multiplied by the ``Promo2`` flag.

    ``IsPromo2Month`` is the per-row result of ``check_promo_month``,
    multiplied by the ``Promo2`` flag.

    The frame is modified in place; nothing is returned.
    """
    # Months elapsed since Promo2 began: whole years plus weeks converted to months.
    year_gap = df['Year'] - df['Promo2SinceYear']
    week_gap = df['WeekOfYear'] - df['Promo2SinceWeek']
    months_running = 12 * year_gap + week_gap * 7 / 30.5
    months_running = months_running.fillna(0).map(lambda months: max(months, 0))
    df['Promo2Open'] = months_running * df['Promo2']

    # Flag rows whose month falls inside the store's promo interval.
    # check_promo_month reads Promo2Open, so this must run after the assignment above.
    df['IsPromo2Month'] = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = df['IsPromo2Month'].fillna(0).astype(int) * df['Promo2']

In [None]:
# Add Promo2Open and IsPromo2Month to both frames (promo_cols modifies them in place).
promo_cols(merged_df)
promo_cols(merged_test_df)

In [None]:
# List all columns now available on the training frame.
merged_df.columns

In [None]:
# Columns used as model inputs, grouped by where they come from.
input_cols = [
    # per-row store/day columns
    'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
    # store attributes and competition
    'StoreType', 'Assortment', 'CompetitionDistance', 'Comp_months',
    # date parts
    'Day', 'Month', 'Year', 'WeekOfYear',
    # Promo2 columns
    'Promo2', 'Promo2Open', 'IsPromo2Month',
]

# Column(s) to predict.
target_cols = ['Sales']

In [None]:
# Show the selected input columns.
input_cols

In [None]:
# Take independent copies so the preprocessing below does not modify merged_df / merged_test_df.
# targets is the 'Sales' column as a Series.
inputs = merged_df[input_cols].copy()
targets = merged_df['Sales'].copy()
test_inputs = merged_test_df[input_cols].copy()

In [None]:
# Preview the training inputs.
inputs

In [None]:
# Preview the training targets.
targets

In [None]:
# Input columns handled as numeric values.
numeric_cols = [
    'Store', 'Promo', 'SchoolHoliday',
    'CompetitionDistance', 'Comp_months',
    'Promo2', 'Promo2Open', 'IsPromo2Month',
    'Day', 'Month', 'Year', 'WeekOfYear',
]

# Input columns handled as categories.
cat_cols = ['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment']

In [None]:
# Count missing values in each numeric input column.
inputs[numeric_cols].isna().sum()

In [None]:
# Largest observed competition distance in the training inputs;
# used below as the basis for the missing-value fill.
maxi = inputs['CompetitionDistance'].max()

In [None]:
maxi

In [None]:
# Fill missing CompetitionDistance with twice the largest training distance,
# i.e. a value beyond every observed one.
# Assign the result back to the column: calling fillna(inplace=True) on a selected
# column is chained assignment, which is not guaranteed to update the parent frame.
inputs['CompetitionDistance'] = inputs['CompetitionDistance'].fillna(maxi*2)
test_inputs['CompetitionDistance'] = test_inputs['CompetitionDistance'].fillna(maxi*2)

In [None]:
# Re-check missing values in the numeric input columns.
inputs[numeric_cols].isna().sum()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the training inputs only; the same fitted scaler is reused for the test inputs.
scaler = MinMaxScaler().fit(inputs[numeric_cols])

In [None]:
# Overwrite the numeric columns with their scaled values.
# Note: this cell is not idempotent — re-running it would scale already-scaled data.
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [None]:
# Preview the training inputs after scaling.
inputs

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoded_cols = list(encoder.get_feature_names_out(cat_cols))

In [None]:
encoder = OneHotEncoder(handle_unknown='ignore').fit(inputs[cat_cols])


In [None]:
inputs[encoded_cols] = encoder.transform(inputs[cat_cols])