In [1]:
# Import libraries

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from patsy import dmatrix
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Pretty display for notebooks

%matplotlib inline


# Allows the use of display() for DataFrames
from IPython.display import display 

# Ignore the warnings
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
# train = pd.read_csv("../asset/train.csv")
# test = pd.read_csv("../asset/test.csv")
weather = pd.read_csv("./asset/weather+holyday_201807041434.csv")
# key = pd.read_csv("../asset/key.csv")
submission_example = pd.read_csv("../asset/sampleSubmission.csv")

# Success - Display the first record


# import pickle
import awesome_functions as cf

# 원본을 유지하기 위해서 카피
# df_train = train.copy()
df_weather = weather.copy()
# df_key = key.copy()
# df_test = test.copy()
df_submission = submission_example.copy()

In [2]:
df_submission.head()

Unnamed: 0,id,units
0,2_1_2013-04-01,0
1,2_2_2013-04-01,0
2,2_3_2013-04-01,0
3,2_4_2013-04-01,0
4,2_5_2013-04-01,0


In [3]:
df_submission.dtypes

id       object
units     int64
dtype: object

In [4]:
def store(a):
    return a.split("_")[0]

def item(a):
    return a.split("_")[1]

def date(a):
    return a.split("_")[2]

In [5]:
df_submission["store_nbr"] = df_submission["id"].apply(store)

In [6]:
df_submission["item_nbr"] = df_submission["id"].apply(item)

In [7]:
df_submission["date"] = df_submission["id"].apply(date)

In [8]:
df_submission.tail()

Unnamed: 0,id,units,store_nbr,item_nbr,date
526912,45_107_2014-10-26,0,45,107,2014-10-26
526913,45_108_2014-10-26,0,45,108,2014-10-26
526914,45_109_2014-10-26,0,45,109,2014-10-26
526915,45_110_2014-10-26,0,45,110,2014-10-26
526916,45_111_2014-10-26,0,45,111,2014-10-26


In [9]:
df_submission = df_submission.drop(columns=["id"])

In [10]:
df_submission.head()

Unnamed: 0,units,store_nbr,item_nbr,date
0,0,2,1,2013-04-01
1,0,2,2,2013-04-01
2,0,2,3,2013-04-01
3,0,2,4,2013-04-01
4,0,2,5,2013-04-01


In [11]:
df_weather.columns

Index(['station_nbr', 'date', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'sunrise', 'sunset', 'codesum', 'snowfall',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
       'avgspeed', 'year', 'month', 'day', 'weekday', 'holiday'],
      dtype='object')

In [12]:
df = df_weather

In [13]:
# date 관련 column 만들어주기 
# weekday, friday, weekend 값 변수로 넣기
import datetime
def isweekend(dt) : 
    temp = str(dt)
    day, month, year = (int(x) for x in temp.split('-'))  
    ans = datetime.date(day, month, year)
    p = datetime.date.weekday(ans)
    if p <= 4 : 
        return 'weekday'
    elif (p == 5) :
        return 'Friday'
    else : 
        return 'weekend'
df['is_weekend'] = df['date'].apply(isweekend)

# holiday 추가해주기 
import re 
with open('./asset/holidays.txt') as file:  
    data = file.readlines()
    holidays = []
    BF = []
    months = ["", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    for holiday in data : 
        if re.findall("BlackFriday", holiday) != [] :
            BF.append(holiday.split(" ")[:3])
        elif re.findall('Thanksgiving',holiday) != []:
            BF.append(holiday.split(" ")[:3])
        else :
            holidays.append(holiday.split(" ")[:3])
    for i in range(len(holidays)): 
        month_idx = months.index(holidays[i][1])
        if month_idx < 10:
            holidays[i][1] = "0" + str(month_idx)
        else:
            holidays[i][1] = str(month_idx)
    for z in range(len(holidays)) : 
        if len(holidays[z][2]) == 1 : 
            holidays[z][2] = '0' + holidays[z][2]
    for z in range(len(BF)) :
        BF[z][1] = '11'
		
		
# holiday 구분하기 
def isholiday(dt): 
    temp = str(dt)
    if temp.split("-") in BF :
        return "black friday"
    elif temp.split("-") in holidays :
        return "holiday"
    else :
        return "regular day"

df['is_holiday'] = df['date'].apply(isholiday)

In [14]:
df.head()

Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,resultspeed,resultdir,avgspeed,year,month,day,weekday,holiday,is_weekend,is_holiday
0,1,2012-01-01,52,31,41,11,36,40,24,0,...,3.6,20.0,4.6,2012,1,1,6,holiday,weekend,holiday
1,1,2012-01-02,50,31,40,10,26,35,25,0,...,9.8,24.0,10.3,2012,1,2,0,regular day,weekday,holiday
2,1,2012-01-03,32,11,21,-8,4,18,44,0,...,10.8,31.0,11.6,2012,1,3,1,regular day,weekday,regular day
3,1,2012-01-04,28,9,18,-11,-1,14,47,0,...,6.3,27.0,8.3,2012,1,4,2,regular day,weekday,regular day
4,1,2012-01-05,38,25,31,1,13,25,34,0,...,6.9,25.0,7.8,2012,1,5,3,regular day,weekday,regular day


In [15]:
key = pd.read_csv("./asset/key.csv")

In [16]:
df_key = key.copy()

In [17]:
df_key

Unnamed: 0,store_nbr,station_nbr
0,1,1
1,2,14
2,3,7
3,4,9
4,5,12
5,6,14
6,7,6
7,8,4
8,9,17
9,10,12


In [18]:
df_key.dtypes

store_nbr      int64
station_nbr    int64
dtype: object

In [19]:
df_submission["store_nbr"] = df_submission["store_nbr"].astype(int)

In [20]:
df_submission.dtypes

units         int64
store_nbr     int32
item_nbr     object
date         object
dtype: object

In [25]:
df_submission = df_submission.merge(df_key, on="store_nbr")

In [26]:
df_weather

Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,resultspeed,resultdir,avgspeed,year,month,day,weekday,holiday,is_weekend,is_holiday
0,1,2012-01-01,52,31,41,11,36,40,24,0,...,3.6,20.0,4.6,2012,1,1,6,holiday,weekend,holiday
1,1,2012-01-02,50,31,40,10,26,35,25,0,...,9.8,24.0,10.3,2012,1,2,0,regular day,weekday,holiday
2,1,2012-01-03,32,11,21,-8,4,18,44,0,...,10.8,31.0,11.6,2012,1,3,1,regular day,weekday,regular day
3,1,2012-01-04,28,9,18,-11,-1,14,47,0,...,6.3,27.0,8.3,2012,1,4,2,regular day,weekday,regular day
4,1,2012-01-05,38,25,31,1,13,25,34,0,...,6.9,25.0,7.8,2012,1,5,3,regular day,weekday,regular day
5,1,2012-01-06,46,25,35,5,21,29,30,0,...,0.3,1.0,2.4,2012,1,6,4,regular day,weekday,regular day
6,1,2012-01-07,57,28,42,12,28,36,23,0,...,3.0,26.0,3.6,2012,1,7,5,regular day,Friday,regular day
7,1,2012-01-08,45,27,36,6,22,32,29,0,...,5.1,30.0,6.6,2012,1,8,6,regular day,weekend,regular day
8,1,2012-01-09,39,23,31,1,12,25,34,0,...,2.3,23.0,4.0,2012,1,9,0,regular day,weekday,regular day
9,1,2012-01-10,47,29,38,8,24,32,27,0,...,5.4,25.0,6.5,2012,1,10,1,regular day,weekday,regular day


In [28]:
df_submission = df_submission.merge(df_weather, on=["station_nbr","date"])

In [29]:
df_submission

Unnamed: 0,units,store_nbr,item_nbr,date,station_nbr,tmax,tmin,tavg,depart,dewpoint,...,resultspeed,resultdir,avgspeed,year,month,day,weekday,holiday,is_weekend,is_holiday
0,0,2,1,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
1,0,2,2,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
2,0,2,3,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
3,0,2,4,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
4,0,2,5,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
5,0,2,6,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
6,0,2,7,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
7,0,2,8,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
8,0,2,9,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
9,0,2,10,2013-04-01,14,71,42,56,1,41,...,9.3,4.0,11.0,2013,4,1,0,regular day,weekday,regular day
