In [None]:
# Mount Google Drive under /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Make the project's code directory the working directory.
# NOTE(review): this is a hard-coded, machine-specific Drive path.
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
! pip install japanize-matplotlib
! pip install shap

In [None]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
import os
# Suppress all warnings.
warnings.filterwarnings("ignore")
# Add the parent, grandparent and great-grandparent directories to the import path.
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # Japanese text support for matplotlib
import seaborn as sns
# pandas display options
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # module for quick and easy parallel processing
from tqdm import tqdm, tqdm_notebook # progress bars
from PIL import Image
tqdm.pandas()

# Automatically reload external modules
%load_ext autoreload
%autoreload 2

# Path to the project settings file, relative to the working directory.
CONFIG_FILE = '../configs/config.yaml'

# Parse the YAML document once; `yml` holds the full parsed content.
with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)

# Directory names taken from the SETTING section of the config.
MODEL_DIR_NAME, FEATURE_DIR_NAME, RAW_DIR_NAME = (
    yml['SETTING'][key]
    for key in ('MODEL_DIR_NAME', 'FEATURE_DIR_NAME', 'RAW_DIR_NAME')
)

## 生データ確認

In [None]:
# Read the raw train and test CSV files from RAW_DIR_NAME.
train = pd.read_csv(RAW_DIR_NAME + 'train.csv')
test = pd.read_csv(RAW_DIR_NAME + 'test.csv')

In [None]:
# Convert the 'time' column of train to pandas datetimes.
col = 'time'
train[col] = pd.to_datetime(train[col])

In [None]:
# Minutes elapsed since midnight: the timestamp minus the start of its day,
# expressed in seconds and then divided by 60.
train['accum_minutes'] = (
    train[col]
    .sub(train[col].dt.floor('D'))
    .dt.total_seconds()
    .div(60)
)

In [None]:
# train.loc[train['accum_minutes']>=720, 'accum_minutes'].unique()
# Print how many distinct accum_minutes values exist, then show the values.
print(len(train['accum_minutes'].unique()))
train['accum_minutes'].unique()

In [None]:
# Values of 720 minutes (12:00) or more are shifted down by 720 so that
# afternoon slots share the 0-719 range with morning slots; the column is
# then converted to int.
train['accum_minutes'] = (
    train['accum_minutes']
    .where(train['accum_minutes'] < 720, train['accum_minutes'] - 720)
    .map(int)
)

In [None]:
# Print how many distinct accum_minutes values exist, then show the values.
print(len(train['accum_minutes'].unique()))
train['accum_minutes'].unique()

#### test

In [None]:
def add_accum_minutes(df, col='time'):
    """Parse ``df[col]`` as datetimes and add a minutes-since-midnight column.

    The frame is modified in place: ``col`` is converted with
    ``pd.to_datetime`` and an ``accum_minutes`` column is (re)created holding
    the minutes elapsed since 00:00 of the same day.

    Args:
        df: DataFrame containing the timestamp column ``col``.
        col: Name of the timestamp column.

    Returns:
        None. Nothing is returned so that calling this as the last statement
        of a cell does not render the whole frame.
    """
    df[col] = pd.to_datetime(df[col])
    df['accum_minutes'] = (df[col] - df[col].dt.floor('D')).dt.total_seconds() / 60


col = 'time'
# Apply the identical transformation to both splits.
add_accum_minutes(train, col)
add_accum_minutes(test, col)

# # The test data only covers the afternoon, so distinguish AM from PM
# train['pm'] = 0
# train.loc[train['accum_minutes']>=720, 'pm'] = 1
# train.loc[train['accum_minutes']>=720, 'accum_minutes'] = train.loc[train['accum_minutes']>=720, 'accum_minutes'] - 720
# train['accum_minutes'] = train['accum_minutes'].map(int)

# test['pm'] = 0
# test.loc[test['accum_minutes']>=720, 'pm'] = 1
# test.loc[test['accum_minutes']>=720, 'accum_minutes'] = test.loc[test['accum_minutes']>=720, 'accum_minutes'] - 720
# test['accum_minutes'] = test['accum_minutes'].map(int)



In [None]:
# Composite location key of the form "<x>_<y>_<direction>".
train['x-y-direction'] = train['x'].map(str) + '_' + train['y'].map(str) + '_' + train['direction']

# Rows per accum_minutes value. display() is required here because a notebook
# only renders a bare expression when it is the last statement of the cell.
display(pd.DataFrame(train['accum_minutes'].value_counts()).sort_index())

print(len(train))                              # number of rows
print(train['x-y-direction'].nunique())        # number of distinct location keys
print(len(train['time'].unique()))             # number of distinct timestamps
# Distinct calendar days: floor, not round — round('D') would move every
# timestamp after noon into the following day.
print(len(train['time'].dt.floor('D').unique()))
print(train['time'].min(), train['time'].max())
print(len(test['time'].unique()))
print(test['time'].min(), test['time'].max())

In [None]:
# 欠損行の補完
unique_time = pd.DataFrame(train['time'].unique())

max_time = unique_time.max()[0]
min_time = unique_time.min()[0]
print(min_time, max_time)

absolute_time_space = np.meshgrid(np.array(pd.DataFrame(pd.date_range(start='1991-04-01 00:00:00', end='1991-09-30 11:40:00', freq='20min'))), train['x-y-direction'].unique())

In [None]:
# Flatten both halves of the meshgrid into long 1-D columns.
absolute_time = pd.Series(absolute_time_space[0].flatten())
# Split the "<x>_<y>_<direction>" key back into its three parts.
absolute_space = (
    pd.Series(absolute_time_space[1].flatten())
    .str.split('_', expand=True)
    .rename(columns={0: 'x', 1: 'y', 2: 'direction'})
)

# One row per (time, x, y, direction) combination. x and y come out of the
# split as strings and are converted to int.
df_absolute_time_space = (
    pd.concat([absolute_time, absolute_space], axis=1)
    .rename(columns={0: 'time'})
    .assign(
        x=lambda grid: grid['x'].map(int),
        y=lambda grid: grid['y'].map(int),
    )
)

In [None]:
# Outer-join train onto the full grid on the columns the two frames share, so
# that every (time, x, y, direction) combination has a row; combinations
# missing from train get NaN in the other columns.
train_imputation = pd.merge(
    train,
    df_absolute_time_space,
    how='outer',
    on=['time', 'x', 'y', 'direction'],
)
# Write only the first six columns (selected by position) to CSV.
train_imputation.iloc[:, :6].to_csv(RAW_DIR_NAME + 'train_imputation.csv')

In [None]:
# Lag feature: congestion at the same location `i` days earlier.
train_imputation['time'] = pd.to_datetime(train_imputation['time'])
# for i in [1, 2, 3]:
i = 1
# pd.Timedelta is used for the shift; the notebook imports the `datetime`
# module but no bare `timedelta` name.
train_imputation['diff'] = train_imputation['time'] + pd.Timedelta(days=i)

# Shared "<x>_<y>_<direction>" suffix for both string keys.
location_suffix = (
    train_imputation['x'].map(lambda x: str(x) + '_')
    + train_imputation['y'].map(lambda x: str(x) + '_')
    + train_imputation['direction']
)
# Key for the row's own timestamp, and key for its timestamp shifted by i days.
train_imputation['time_categorical'] = train_imputation['time'].map(lambda x: str(x) + '_') + location_suffix
train_imputation['diff_categorical'] = train_imputation['diff'].map(lambda x: str(x) + '_') + location_suffix

# A row at time T is matched with the row whose shifted key equals T, i.e. the
# observation taken i days before T at the same location.
# NOTE(review): the positional assignment below assumes the keys are unique so
# the left merge keeps one output row per input row — confirm.
df = pd.merge(train_imputation[['time_categorical', 'congestion']], train_imputation[['diff_categorical', 'congestion']], left_on='time_categorical', right_on='diff_categorical', how='left')
train_imputation[f'diff_{i}days'] = df['congestion_y']
train_imputation['time']

In [None]:
# Display the imputed frame.
train_imputation

## datasets確認

In [None]:
def load_datasets_train(feats):
    """Load the training-split feature pickles and join them column-wise.

    Args:
        feats: Iterable of feature-set names; each is read from
            ``FEATURE_DIR_NAME + '<name>_train.pkl'``.

    Returns:
        The loaded objects concatenated along ``axis=1``, in ``feats`` order.
    """
    frames = []
    for name in feats:
        frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_train.pkl'))
    return pd.concat(frames, axis=1)

def load_train_y(target):
    """Load the training target as a Series.

    Args:
        target: Name of the target; the pickle
            ``FEATURE_DIR_NAME + '<target>_train.pkl'`` is read and its
            ``target`` column is returned.

    Returns:
        ``pd.Series`` built from the ``target`` column of the loaded object.
    """
    target_path = FEATURE_DIR_NAME + f'{target}_train.pkl'
    target_frame = pd.read_pickle(target_path)
    return pd.Series(target_frame[target])

def load_datasets_both(feats):
    """Load the train and test feature pickles for the given feature sets.

    Args:
        feats: Feature-set names; for each one the files
            ``FEATURE_DIR_NAME + '<name>_train.pkl'`` and
            ``FEATURE_DIR_NAME + '<name>_test.pkl'`` are read.

    Returns:
        Tuple ``(X_train, X_test)``, each the column-wise concatenation
        (``axis=1``) of the corresponding pickles in ``feats`` order.
    """
    def _read_split(split):
        # Read every feature set of one split and join them side by side.
        frames = [pd.read_pickle(FEATURE_DIR_NAME + f'{name}_{split}.pkl') for name in feats]
        return pd.concat(frames, axis=1)

    return _read_split('train'), _read_split('test')

# 欠損値の確認
def missing_values_table(data):
    """Summarise missing values per column.

    Args:
        data: DataFrame to inspect.

    Returns:
        A transposed table with one column per column of ``data`` and three
        rows: ``Total`` (number of nulls), ``Percent`` (nulls as a percentage
        of the row count) and ``Types`` (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    # count() on the boolean mask gives the number of rows for every column.
    null_percent = null_total / null_mask.count() * 100
    summary = pd.concat([null_total, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return np.transpose(summary)

In [None]:
# Names of the feature sets to load, and the name of the target column.
features = [
    'diff_days',
    'datetime_element',
    'accum_minutes',
    'x_y_direction'
    ]

target = 'congestion'

In [None]:
# Load the selected feature sets for both splits, and the training target.
train_x, test_x = load_datasets_both(features)
train_y = load_train_y(target)

In [None]:
# Build a "<month>_<day>" key for every row.
train_x['date'] = train_x['month'].map(str) + '_' + train_x['day'].map(str)
# NOTE: despite its name, keep_index flags the rows to exclude — those with a
# missing 1-day lag feature or a missing target.
keep_index = (train_x['diff_1days'].isna()) | (train_y.isna())
# Every date that contains at least one flagged row is removed as a whole.
drop_dates = train_x.loc[keep_index, 'date'].unique()
# One-hot encode the remaining rows after removing the helper 'date' column.
t = pd.get_dummies(train_x.query('date not in @drop_dates').drop(['date'], axis=1))
t
# train_y[t.index]

In [None]:
# Row count per accum_minutes value, ordered by the value.
pd.DataFrame(train_x['accum_minutes'].value_counts()).sort_index()

In [None]:
# Sort by month, day, location key, pm flag and time slot, drop the location
# key, and renumber the rows from zero.
df = (
    train_x
    .sort_values(['month', 'day', 'x_y_direction', 'pm', 'accum_minutes'])
    .drop(['x_y_direction'], axis=1)
    .reset_index(drop=True)
)

In [None]:
# Rows with pm == 1; reset_index() keeps the previous index as a column.
df_pm = df.query('pm==1').reset_index()
print(len(df_pm))
# Show only the rows that have no missing value in any column.
df_pm.loc[df_pm.notnull().all(axis=1)]

In [None]:
# accum_minutes distribution among the rows without any missing value.
df_pm.loc[df_pm.notnull().all(axis=1), 'accum_minutes'].value_counts()

In [None]:
# Target values for the rows of train_x where pm == 1, selected by index label.
train_y[train_x.query('pm==1').index]

In [None]:
# First 72 rows after sorting by month, day, location key, pm flag and time slot.
train_x.sort_values(['month', 'day', 'x_y_direction', 'pm', 'accum_minutes']).drop(['x_y_direction'], axis=1).head(72)

## モデルデータ確認

In [None]:
# Load the 'lgb_0306_1211-pred.pkl' pickle of run lgb_0306_1211 from MODEL_DIR_NAME
# (presumably that run's predictions — confirm).
pd.read_pickle(MODEL_DIR_NAME + 'lgb_0306_1211/lgb_0306_1211-pred.pkl')

In [None]:
# display(train.shape, test.shape)

In [None]:
# NOTE(review): `pca` is not defined anywhere in the visible notebook; on a
# fresh kernel this cell raises NameError unless it is created elsewhere — confirm.
# Bar chart of pca.explained_variance_ratio_, with bars numbered from 1.
plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)

In [None]:
# display(train.head(), train.tail(), train.shape)
# len(train[train['pca4'] > 0.01])
# Column-wise maximum of train.
train.max()

In [None]:
# Summary statistics of train and test.
display(train.describe(), test.describe())

In [None]:
# Check the missing values in each dataset
display(
    missing_values_table(train),
    missing_values_table(test)
)

## create submission

In [None]:
# Suppress all warnings.
warnings.filterwarnings("ignore")

In [None]:
# Run ../code/20_run.py with `python` as a shell command.
! python ../code/20_run.py