In [25]:
import os
import numpy as np
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from mpl_toolkits import mplot3d
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

In [3]:
# Load the three raw CSV files. `header=0` combined with `names=` discards each
# file's own header row and substitutes the column names listed here.
dataset = pd.read_csv(
    "train.csv",
    names=['Store', 'Dept', 'Date', 'weeklySales', 'isHoliday'],
    sep=',', header=0, parse_dates=['Date'],
)
# IsHoliday is dropped from the features file; `dataset` already has an
# isHoliday column from train.csv.
features = pd.read_csv(
    "features.csv", sep=',', header=0,
    names=['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
           'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday'],
    parse_dates=['Date'],
).drop(columns=['IsHoliday'])
stores = pd.read_csv("stores.csv", names=['Store', 'Type', 'Size'], sep=',', header=0)

# Join keys are spelled out instead of relying on "whatever columns happen to
# be shared", so adding a column to one file cannot silently change the join.
# validate='many_to_one' makes pandas raise MergeError if the right-hand table
# has duplicate keys, which would otherwise multiply sales rows unnoticed.
dataset = (
    dataset
    .merge(stores, how='left', on='Store', validate='many_to_one')
    .merge(features, how='left', on=['Store', 'Date'], validate='many_to_one')
)
dataset.dtypes

Store                    int64
Dept                     int64
Date            datetime64[ns]
weeklySales            float64
isHoliday                 bool
Type                    object
Size                     int64
Temperature            float64
Fuel_Price             float64
MarkDown1              float64
MarkDown2              float64
MarkDown3              float64
MarkDown4              float64
MarkDown5              float64
CPI                    float64
Unemployment           float64
dtype: object

In [4]:
# Apply the Fahrenheit -> Celsius formula to Temperature and derive the
# calendar month from Date.
# NOTE: Temperature is overwritten in place, so running this cell a second
# time on the same frame applies the formula again.
temperature_before = dataset['Temperature']
dataset['Temperature'] = (temperature_before - 32) * 5/9
sale_dates = dataset['Date']
dataset['Month'] = sale_dates.dt.month
# Per-column count of missing values.
dataset.isna().sum()

Store                0
Dept                 0
Date                 0
weeklySales          0
isHoliday            0
Type                 0
Size                 0
Temperature          0
Fuel_Price           0
MarkDown1       270889
MarkDown2       310322
MarkDown3       284479
MarkDown4       286603
MarkDown5       270138
CPI                  0
Unemployment         0
Month                0
dtype: int64

In [5]:
# Replace missing values in MarkDown1..MarkDown5 with 0.
markdown_columns = ['MarkDown%d' % number for number in range(1, 6)]
dataset[markdown_columns] = dataset[markdown_columns].fillna(0)
# Re-check the per-column missing-value counts after the fill.
dataset.isna().sum()

Store           0
Dept            0
Date            0
weeklySales     0
isHoliday       0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
Month           0
dtype: int64

In [6]:
# Mean weekly sales over the rows dated 2010-02-12 (Super Bowl week).
is_super_bowl_week = dataset["Date"] == '2010-02-12'
dataset.loc[is_super_bowl_week, "weeklySales"].mean()

16352.05603179975

In [None]:
# Mean weekly sales over the rows dated 2010-09-10 (Labor Day week).
is_labor_day_week = dataset["Date"] == '2010-09-10'
dataset.loc[is_labor_day_week, "weeklySales"].mean()

15537.758883214183

In [None]:
# Mean weekly sales over the rows dated 2010-11-26 (Thanksgiving week).
is_thanksgiving_week = dataset["Date"] == '2010-11-26'
dataset.loc[is_thanksgiving_week, "weeklySales"].mean()

22403.33670524169

In [None]:
# Mean weekly sales over the rows dated 2010-12-23 or 2010-12-24 (Christmas week).
on_dec_24 = dataset["Date"] == '2010-12-24'
on_dec_23 = dataset["Date"] == '2010-12-23'
dataset.loc[on_dec_24 | on_dec_23, "weeklySales"].mean()

27378.69269282817

In [None]:
# Mean weekly sales over every row whose Date falls in 2010 (yearly baseline).
in_2010 = dataset['Date'].dt.year == 2010
dataset.loc[in_2010, "weeklySales"].mean()

16270.275737033313

In [7]:
# For 2010, count per date the rows whose weekly sales reached at least 240000.
in_2010 = dataset['Date'].dt.year == 2010
is_high_sale = dataset["weeklySales"] >= 240000
dataset.loc[in_2010 & is_high_sale, 'Date'].value_counts()

2010-11-26    23
2010-12-24    14
2010-12-17     2
2010-02-05     1
Name: Date, dtype: int64

In [8]:
# For 2011, count per date the rows whose weekly sales reached at least 240000.
in_2011 = dataset['Date'].dt.year == 2011
is_high_sale = dataset["weeklySales"] >= 240000
dataset.loc[in_2011 & is_high_sale, 'Date'].value_counts()

2011-11-25    22
2011-12-23     9
2011-08-26     1
Name: Date, dtype: int64

In [9]:
# One-hot encode the store type and the holiday flag; get_dummies replaces each
# listed column with indicator columns named "<column>_<value>".
categorical_columns = ["Type", "isHoliday"]
dataset = pd.get_dummies(dataset, columns=categorical_columns)

In [None]:
# Preview the first five rows after the Type / isHoliday encoding.
dataset.head(n=5)

In [10]:
# Flag rows whose Date equals one of the listed Black Friday / pre-Christmas
# dates with "Yes" (everything else "No"), then one-hot encode both flags.
black_friday_dates = [pd.Timestamp(2010, 11, 26), pd.Timestamp(2011, 11, 25)]
pre_christmas_dates = [
    pd.Timestamp(2010, 12, 23), pd.Timestamp(2010, 12, 24),
    pd.Timestamp(2011, 12, 23), pd.Timestamp(2011, 12, 24),
]
is_black_friday = dataset['Date'].isin(black_friday_dates)
is_pre_christmas = dataset['Date'].isin(pre_christmas_dates)
dataset['Black_Friday'] = np.where(is_black_friday, "Yes", "No")
dataset['Pre_Christmas'] = np.where(is_pre_christmas, "Yes", "No")
flag_columns = ["Black_Friday", "Pre_Christmas"]
dataset = pd.get_dummies(dataset, columns=flag_columns)

In [None]:
# Preview the first five rows after adding the Black_Friday / Pre_Christmas indicators.
dataset.head(n=5)

In [43]:
# Final feature preparation before modelling.
#
# This cell removes columns from `dataset` in place, including 'Date' and
# 'MarkDown3', which the cell itself also reads. Each step is therefore guarded
# so that executing the cell again is a no-op instead of a KeyError.

# Fill any remaining gaps in the MarkDown columns that are still present.
markdown_present = [
    column
    for column in ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
    if column in dataset.columns
]
dataset[markdown_present] = dataset[markdown_present].fillna(0)

# Month can only be derived while Date is still in the frame.
if 'Date' in dataset.columns:
    dataset['Month'] = pd.to_datetime(dataset['Date']).dt.month

# errors='ignore' skips columns that a previous run of this cell already removed.
dataset = dataset.drop(
    columns=["Date", "CPI", "Fuel_Price", 'Unemployment', 'MarkDown3'],
    errors='ignore',
)

# Columns handed to the model.
selector=['Store', 'Dept', 'Size', 'Temperature',
        'MarkDown1', 'MarkDown2', 'MarkDown4',
       'MarkDown5', 'Month', 'Type_A', 'Type_B',
       'Type_C', 'isHoliday_False', 'isHoliday_True', 'Black_Friday_No',
       'Black_Friday_Yes', 'Pre_Christmas_No', 'Pre_Christmas_Yes']
##we will edit selectors based on EDA analysis

# Fail here with a readable message rather than later inside the split cell.
missing_from_dataset = [column for column in selector if column not in dataset.columns]
assert not missing_from_dataset, "selector columns missing from dataset: %s" % missing_from_dataset

In [44]:
# Hold out 20% of the rows for validation. Both NumPy's global seed and the
# splitter's own random_state are fixed at 42.
np.random.seed(42)
model_inputs = dataset[selector]
model_target = dataset['weeklySales']
X_train, X_val, y_train, y_val = train_test_split(
    model_inputs,
    model_target,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,Store,Dept,Size,Temperature,MarkDown1,MarkDown2,MarkDown4,MarkDown5,Month,Type_A,Type_B,Type_C,isHoliday_False,isHoliday_True,Black_Friday_No,Black_Friday_Yes,Pre_Christmas_No,Pre_Christmas_Yes
138466,15,3,123737,-0.922222,0.0,0.0,0.0,0.0,4,0,1,0,1,0,1,0,1,0
289214,30,25,42988,28.616667,0.0,0.0,0.0,0.0,6,0,0,1,1,0,1,0,1,0
52351,6,27,202505,26.355556,0.0,0.0,0.0,0.0,6,1,0,0,1,0,1,0,1,0
203504,21,49,140167,9.288889,3389.1,43.0,325.35,8623.67,12,0,1,0,1,0,1,0,1,0
233606,24,55,203819,0.477778,7325.68,25367.9,1745.2,3261.35,1,1,0,0,1,0,1,0,1,0


In [56]:
from sklearn.metrics import mean_absolute_error


def calculate_error(y_val, y_pred):
    """Return the mean absolute error between the true values and the predictions.

    Args:
        y_val: true target values.
        y_pred: predicted target values, aligned with `y_val`.

    Returns:
        The value computed by sklearn's `mean_absolute_error(y_val, y_pred)`.
    """
    error = mean_absolute_error(y_val, y_pred)
    return error


# Fit a 10-nearest-neighbour regressor on the training split and score it on
# the validation split.
# NOTE(review): the selector columns are passed in unscaled (e.g. Size is in
# the tens of thousands while the indicator columns are 0/1), and KNN is
# distance-based. Consider standardising the features; confirm the effect on
# the error before changing.
knn = KNeighborsRegressor(n_neighbors=10).fit(X_train, y_train)
y_pred = knn.predict(X_val)
calculate_error(y_val, y_pred)

9668.212736947598