In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Work from the project's code directory on the mounted Drive so that the
# relative paths used later (e.g. '../configs/config.yaml') resolve.
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
! pip install japanize-matplotlib
! pip install shap

In [None]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
import os
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Add the parent, grandparent and great-grandparent directories to the import
# path — presumably so project-local modules can be imported; confirm.
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # 日本語対応
import seaborn as sns
# pandas display options: show up to 300 columns/rows, allow long cell text,
# and print floats with three decimals.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # lightweight helpers for running work in parallel
from tqdm import tqdm, tqdm_notebook # progress bars
from PIL import Image
# Enable tqdm's pandas integration.
tqdm.pandas()

# Automatically reload external modules so that edits to imported .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project settings are read from a YAML file, located relative to the current
# working directory.
CONFIG_FILE = '../configs/config.yaml'

# yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
# raises TypeError on PyYAML >= 6. safe_load parses plain YAML without
# constructing arbitrary Python objects.
with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)

# Directory prefixes; later cells concatenate them directly with file names,
# so each is expected to end with a path separator.
MODEL_DIR_NAME = yml['SETTING']['MODEL_DIR_NAME']
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME']
RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME']

## 生データ確認

In [None]:
# Read the raw train/test CSV files from the directory given by RAW_DIR_NAME.
train_csv_path = RAW_DIR_NAME + 'train.csv'
train = pd.read_csv(train_csv_path)
test_csv_path = RAW_DIR_NAME + 'test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Name of the timestamp column; the next cell reuses `col`.
col = 'time'
# Parse the column into pandas datetimes (read_csv above was called without
# date parsing).
# NOTE(review): only `train` is converted here; `test` keeps its original dtype.
train[col] = pd.to_datetime(train[col])

In [None]:
# Minutes elapsed since midnight of the same day, as a float.
timestamps = train[col]
elapsed_since_midnight = timestamps - timestamps.dt.floor('D')
train['accum_minutes'] = elapsed_since_midnight.dt.total_seconds() / 60

In [None]:
# Show how many distinct minute-of-day values exist, then the values themselves.
minute_values = train['accum_minutes'].unique()
print(len(minute_values))
minute_values

In [None]:
# 720 minutes = 12 hours: fold minutes-since-midnight onto a 12-hour clock by
# shifting every value at or past noon back by half a day.
half_day = 720
at_or_past_noon = train['accum_minutes'] >= half_day
train.loc[at_or_past_noon, 'accum_minutes'] = (
    train.loc[at_or_past_noon, 'accum_minutes'] - half_day
)
# Convert the float minutes to integers.
train['accum_minutes'] = train['accum_minutes'].map(int)

In [None]:
# Re-check the distinct values after folding onto the 12-hour clock.
folded_minute_values = train['accum_minutes'].unique()
print(len(folded_minute_values))
folded_minute_values

## datasets確認

In [None]:
def load_datasets_train(feats):
    """Load the training pickle of every named feature and join them column-wise.

    Parameters
    ----------
    feats : iterable of str
        Feature names; each is read from ``FEATURE_DIR_NAME + '<name>_train.pkl'``.

    Returns
    -------
    The objects read from the pickles, concatenated along ``axis=1``.
    """
    frames = []
    for feat_name in feats:
        frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_train.pkl'))
    return pd.concat(frames, axis=1)

def load_datasets_both(feats):
    """Load the train and test pickles of every named feature.

    Parameters
    ----------
    feats : sequence of str
        Feature names; each is read from ``FEATURE_DIR_NAME + '<name>_train.pkl'``
        and ``FEATURE_DIR_NAME + '<name>_test.pkl'``. It is iterated twice.

    Returns
    -------
    tuple
        ``(X_train, X_test)``, each the column-wise (``axis=1``) concatenation
        of the corresponding pickles.
    """
    # All train files are read and joined before any test file is touched.
    train_frames = []
    for feat_name in feats:
        train_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_train.pkl'))
    X_train = pd.concat(train_frames, axis=1)

    test_frames = []
    for feat_name in feats:
        test_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_test.pkl'))
    X_test = pd.concat(test_frames, axis=1)

    return X_train, X_test

# Summarise missing values per column
def missing_values_table(data):
    """Build a per-column summary of missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. Duplicated column labels are supported.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``Total`` (number of
        nulls), ``Percent`` (nulls as a percentage of the row count) and
        ``Types`` (the column dtype as a string).
    """
    null_counts = data.isnull().sum()
    null_percent = null_counts / len(data) * 100
    # Read dtypes from `data.dtypes` instead of `data[col].dtype`: with
    # duplicated column labels `data[col]` is a DataFrame and has no `.dtype`.
    dtype_names = [str(dtype) for dtype in data.dtypes]
    # Build from plain arrays with an explicit index so that no label alignment
    # takes place (alignment is what breaks on duplicated labels).
    summary = pd.DataFrame(
        {
            'Total': null_counts.to_numpy(),
            'Percent': null_percent.to_numpy(),
            'Types': dtype_names,
        },
        index=data.columns,
    )
    return summary.T

In [None]:
# Names of the feature sets to load. Each name is expanded to
# '<name>_train.pkl' / '<name>_test.pkl' under FEATURE_DIR_NAME by the loaders.
features = [
    'diff_days',
    'datetime_element',
    'accum_minutes',
    'congestion',
    'x_y_direction'
    ]

In [None]:
# Load the selected feature pickles and join them column-wise for train and test.
train_x, test_x = load_datasets_both(features)

In [None]:
# Preview the first 72 rows after ordering by date, location/direction and
# time of day; the 'x_y_direction' column is used for sorting only and is
# dropped from the displayed frame.
(
    train_x
    .sort_values(['month', 'day', 'x_y_direction', 'pm', 'accum_minutes'])
    .drop(['x_y_direction'], axis=1)
    .head(72)
)

## モデルデータ確認

In [None]:
# Display the pickled output of run 'lgb_0306_1211' — presumably its saved
# predictions, judging by the '-pred' file name; confirm.
pd.read_pickle(MODEL_DIR_NAME + 'lgb_0306_1211/lgb_0306_1211-pred.pkl')

In [None]:
# display(train.shape, test.shape)

In [None]:
# Bar chart of `pca.explained_variance_ratio_`, one bar per entry, with
# 1-based positions on the x axis.
# NOTE(review): `pca` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel — confirm where the (presumably
# fitted PCA) object should come from, or remove the cell.
plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)

In [None]:
# display(train.head(), train.tail(), train.shape)
# len(train[train['pca4'] > 0.01])
# Show the per-column maxima of `train`.
train.max()

In [None]:
# Summary statistics of the train and test frames, rendered one after the other.
display(train.describe(), test.describe())

In [None]:
# Check the missing values of each dataset (null count, percentage and dtype
# per column).
display(
    missing_values_table(train),
    missing_values_table(test)
)

## create submission

In [None]:
# Silence all warnings in the notebook kernel (repeats the filter already set
# in the setup cell).
# NOTE(review): the script launched in the next cell runs through a shell
# escape, so this filter presumably does not apply to it — confirm.
warnings.filterwarnings("ignore")

In [None]:
# Run the project's 20_run.py script through the shell, using whichever
# `python` is on the shell PATH.
# TODO(review): presumably this trains the model and writes the submission
# file — confirm against 20_run.py.
! python ../code/20_run.py