In [1]:
#Library Imports
import random
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

# pd.options.display.max_rows = 200

In [2]:
train = pd.read_csv('./data/train.csv')
submission =pd.read_csv('./data/sample_submission.csv')

## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## 전처리

In [4]:
train_list = []
test_list = []

In [5]:
for i in tqdm(range(len(train))):
    train_list.append(train.iloc[[i], 6:].T.reset_index().rename(columns={'index': 'date_time', i: 'sales'}))
    train_list[i]['date_time'] = pd.to_datetime(train_list[i]['date_time'])
    train_list[i]['year'] = train_list[i]['date_time'].dt.year
    train_list[i]['month'] = train_list[i]['date_time'].dt.month
    train_list[i]['week'] = train_list[i]['date_time'].dt.weekofyear
    train_list[i]['day'] = train_list[i]['date_time'].dt.weekday
    train_list[i]['holiday'] = train_list[i].apply(lambda x : 0 if x['day']<5 else 1, axis = 1)


    test_list.append(submission.iloc[[i],1:].T.reset_index().rename(columns={'index': 'date_time', i: 'sales'}))
    test_list[i]['date_time'] = pd.to_datetime(test_list[i]['date_time'])
    test_list[i]['year'] = test_list[i]['date_time'].dt.year
    test_list[i]['month'] = test_list[i]['date_time'].dt.month
    test_list[i]['week'] = test_list[i]['date_time'].dt.weekofyear
    test_list[i]['day'] = test_list[i]['date_time'].dt.weekday
    test_list[i]['holiday'] = test_list[i].apply(lambda x : 0 if x['day']<5 else 1, axis = 1)

100% 15890/15890 [02:55<00:00, 90.57it/s]


In [6]:
y_train_list = []
y_test_list = []

In [7]:
for i in tqdm(range(len(train))):
    y_train_list.append(train_list[i]['sales'])
    y_test_list.append(test_list[i]['sales'])

    train_list[i] = train_list[i].drop(columns='sales')
    test_list[i] = test_list[i].drop(columns='sales')

    train_list[i] = train_list[i].drop(columns='date_time')
    test_list[i] = test_list[i].drop(columns='date_time')

100% 15890/15890 [00:21<00:00, 753.59it/s]


In [8]:
import lightgbm as lgb
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 

In [9]:
model = lgb.LGBMRegressor(objective='mean_absolute_error', metric ='mean_absolute_error', random_state=42)

In [10]:
pred_list = []

In [11]:
for i in tqdm(range(len(train_list))):
    kfold = KFold(n_splits=17, shuffle = False)
    scores = cross_val_score(model, train_list[i], y_train_list[i], cv=kfold, scoring='neg_mean_absolute_error')    
    model.fit(train_list[i], y_train_list[i])
    predictions = model.predict(test_list[i])
    predictions[predictions < 0] = 0
    predictions = np.round(predictions)
    pred_list.append(predictions)

100% 15890/15890 [3:11:40<00:00,  1.38it/s]  


In [12]:
submit =pd.read_csv('./data/sample_submission.csv')

In [13]:
for i in tqdm(range(len(train_list))):
    submit.iloc[i,1:] = pred_list[i]

100% 15890/15890 [00:01<00:00, 13576.51it/s]


In [17]:
#submit.to_csv('submission_version_04.csv', index=False)

In [18]:
submit

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,4,5,5,5,5,4,4,4,5,...,5,4,4,4,5,5,5,5,4,4
2,2,2,1,1,1,1,3,3,3,3,...,2,2,2,3,3,2,2,2,2,2
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,0,0,1,2,2,0,0,0,0,...,2,3,3,4,7,10,10,10,3,3
15886,15886,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15887,15887,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15888,15888,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
