In [None]:
# Mount Google Drive at /content/drive so the project files below are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Work from the project's code directory on the mounted Drive so that the
# relative paths used later (e.g. '../configs/config.yaml') resolve from there.
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
# Install the extra packages this notebook needs on a fresh runtime.
# NOTE(review): versions are not pinned; consider `%pip install pkg==X.Y.Z` so
# the install targets the kernel's environment and re-runs are reproducible.
! pip install japanize-matplotlib
! pip install shap
! pip install umap-learn
! pip install git+https://github.com/pfnet-research/xfeat.git

In [None]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
import os
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Add the parent directories (relative to the working directory) to the import path.
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # Japanese text support
import seaborn as sns
# pandas display options
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # module for running parallel jobs quickly and easily
from tqdm import tqdm, tqdm_notebook # progress-bar display
from PIL import Image
# Register the tqdm progress_apply helpers on pandas objects.
tqdm.pandas()

# Automatically reload external modules
%load_ext autoreload
%autoreload 2

# Project settings file, relative to the code directory.
CONFIG_FILE = '../configs/config.yaml'

# Read the directory names used throughout the notebook from the SETTING section.
with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)
MODEL_DIR_NAME = yml['SETTING']['MODEL_DIR_NAME']
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME']
RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME']

In [None]:
import os
# Same working-directory switch as the cell near the top of the notebook;
# re-running it resets the cwd to the code directory.
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
# ! python 1_generate_feature.py

## 生データ確認

In [None]:
# Load the raw competition files from RAW_DIR_NAME.
# train_imputation.csv is presumably the output of to_imputation() defined
# further down (its save line there is commented out) — confirm.
train = pd.read_csv(RAW_DIR_NAME + 'train.csv')
train_imputation = pd.read_csv(RAW_DIR_NAME + 'train_imputation.csv')
test = pd.read_csv(RAW_DIR_NAME + 'test.csv')

### memo(探索的データ分析)

In [None]:
# 表形式特徴量算出
# train['time'] = pd.to_datetime(train['time'])
# display(train.describe(), train.describe(exclude='number'))
# #csvで保存するようにする？

# 単変数
# x = 'direction'
# df = train.query('x==2 and y==3')
# target = 'congestion'
# func = sns.distplot

# def plot_mono_variables(df, x, target, func):
#   unique_variables = train[x].unique()
#   n_xaxis = len(unique_variables)
#   fig, axes = plt.subplots(1, n_xaxis, figsize=(6*n_xaxis, 6))
#   print(unique_variables, x)
#   for i, v in enumerate(unique_variables):
#     df = df.loc[(df[x]==v), :]
#     func(df[target], ax=axes[i])
#     axes[i].set_title(f'{x}={v}')

#   plt.subplots_adjust(hspace=0.4)
#   plt.show()

# plot_mono_variables(df, x, target, func)

# 変数間のグラフ
# カテゴリカル変数　× 数値変数
# fig, ax = plt.subplots(1, 3, figsize=(20, 10))
# sns.boxplot(data=train, x='direction', y='congestion', ax=ax[0])
# sns.boxplot(data=train, x='x', y='congestion', ax=ax[1])
# sns.boxplot(data=train, x='y', y='congestion', ax=ax[2])

# train['direction'].unique()
# train['x'].unique()
# train['y'].unique()

# 2変量
def plot_di_variables(df, x, y, target, func):
  """Plot the distribution of ``target`` for every (x, y) value combination.

  One subplot is drawn per pair of unique values found in the ``x`` and ``y``
  columns. Grid columns follow the unique values of ``x`` and grid rows follow
  the unique values of ``y``, both in order of first appearance.

  Args:
    df: DataFrame holding the ``x``, ``y`` and ``target`` columns.
    x: Name of the column whose unique values define the grid columns.
    y: Name of the column whose unique values define the grid rows.
    target: Name of the column plotted inside each subplot.
    func: Plotting callable invoked as ``func(series, ax=ax)``.
  """
  x_values = df[x].unique()
  y_values = df[y].unique()
  n_cols = len(x_values)
  n_rows = len(y_values)

  # figsize is (width, height): width must grow with the number of columns and
  # height with the number of rows. squeeze=False keeps `axes` two-dimensional
  # even when one of the columns has a single unique value.
  fig, axes = plt.subplots(n_rows, n_cols, figsize=(10*n_cols, 10*n_rows), squeeze=False)

  for i, y_value in enumerate(y_values):
    for j, x_value in enumerate(x_values):
      ax = axes[i, j]
      data = df.loc[(df[x]==x_value) & (df[y]==y_value), :]
      # Not every (x, y) pair necessarily occurs in the data; leave such a
      # panel empty instead of handing an empty series to the plotter.
      if len(data) > 0:
        func(data[target], ax=ax)
      ax.set_title(f'{x}={x_value}, {y}={y_value}')

  plt.subplots_adjust(hspace=0.4)
  plt.show()

# Plot the congestion distribution for every (x, y) combination in train.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# consider sns.histplot when upgrading.
# df = train.query('x==2 and y==3')
x = 'x'
y = 'y'
target = 'congestion'
func = sns.distplot
plot_di_variables(train, x, y, target, func)

### memo(時系列データの欠損行の補完)


In [None]:
# Fill in the rows that are missing from the time series
def to_imputation(df, time_col, freq):
  """Return ``df`` extended with one row per (timestamp, x, y, direction).

  A complete timestamp range is built from the earliest to the latest value of
  ``time_col`` at step ``freq`` and crossed with every distinct
  (x, y, direction) triple present in ``df``. Combinations that do not occur
  in ``df`` are added by an outer merge, with NaN in all other columns.

  The input frame is left untouched: the datetime conversion is done on a copy
  and no helper column is added to it.

  Args:
    df: DataFrame with ``time_col``, 'x', 'y' and 'direction' columns.
    time_col: Name of the timestamp column (parsed with ``pd.to_datetime``).
    freq: Step of the complete time range, e.g. '20min'.

  Returns:
    DataFrame with the columns of ``df`` and one row per combination of
    timestamp and (x, y, direction). The resulting shape is printed.
  """
  key_cols = ['x', 'y', 'direction']

  # Work on a copy so the caller's frame is not modified.
  observed = df.copy()
  observed[time_col] = pd.to_datetime(observed[time_col])

  # Nothing to complete (and no min/max to build a range from) on empty input.
  if observed.empty:
    print(observed.shape)
    return observed

  full_times = pd.date_range(start=observed[time_col].min(), end=observed[time_col].max(), freq=freq)
  unique_keys = observed[key_cols].drop_duplicates().reset_index(drop=True)

  # Complete grid: every key triple repeated for every timestamp. Building it
  # from the key columns directly keeps their dtypes and avoids a string
  # join/split round trip.
  n_keys, n_times = len(unique_keys), len(full_times)
  full_grid = unique_keys.iloc[np.repeat(np.arange(n_keys), n_times)].reset_index(drop=True)
  full_grid[time_col] = full_times[np.tile(np.arange(n_times), n_keys)]

  # Merge on explicit keys, using time_col rather than a hard-coded 'time'.
  df_imputation = pd.merge(observed, full_grid, how='outer', on=[time_col] + key_cols)
  print(df_imputation.shape)
  return df_imputation

In [None]:
# Complete the train time series on a 20-minute grid and display the result.
time_col = 'time'
freq = '20min'

df = to_imputation(train, time_col, freq)
display(df)
# df.to_csv('/content/drive/Othercomputers/macbook-air/TPS-Mar2022/data/raw/train_imputation.csv', index=False, mode='w')

## datasets確認

In [None]:
def load_datasets_train(feats):
    """Load the train pickle of every feature in ``feats`` and join them column-wise.

    Each feature ``name`` is read from ``FEATURE_DIR_NAME + '<name>_train.pkl'``.
    """
    frames = []
    for name in feats:
        frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_train.pkl'))
    return pd.concat(frames, axis=1)

def load_train_y(target):
    """Read ``FEATURE_DIR_NAME + '<target>_train.pkl'`` and return its ``target`` column as a Series."""
    pickle_path = FEATURE_DIR_NAME + f'{target}_train.pkl'
    frame = pd.read_pickle(pickle_path)
    target_values = frame[target]
    return pd.Series(target_values)

def load_datasets_both(feats):
    """Load train and test pickles for every feature and join each split column-wise.

    Each feature ``name`` is read from ``FEATURE_DIR_NAME + '<name>_train.pkl'``
    and ``FEATURE_DIR_NAME + '<name>_test.pkl'``; all train files are read
    before any test file.

    Returns:
        Tuple ``(X_train, X_test)``.
    """
    train_frames = []
    for name in feats:
        train_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_train.pkl'))
    X_train = pd.concat(train_frames, axis=1)

    test_frames = []
    for name in feats:
        test_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_test.pkl'))
    X_test = pd.concat(test_frames, axis=1)

    return X_train, X_test

# Inspect missing values
def missing_values_table(data):
    """Summarise missing values per column.

    Returns a frame with one column per column of ``data`` and three rows:
    'Total' (number of nulls), 'Percent' (share of nulls in percent) and
    'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    null_percent = null_total / null_mask.count() * 100

    summary = pd.concat({'Total': null_total, 'Percent': null_percent}, axis=1)
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

In [None]:
# Feature sets to load; each name is turned into '<name>_train.pkl' /
# '<name>_test.pkl' by the loader functions. Commented-out names are disabled.
# NOTE(review): 'congestion' appears both here and as `target` — fine for
# inspection, but confirm it is not fed to a model as a feature.
features = [
      # "diff_3days",
      # "datetime_element",
      # "accum_minutes",
      # "x_y_direction_dummies",
      # "agg_shift_by_date"
      'rawdata',
      'accum_minutes',
      'congestion'
    ]

target = 'congestion'

In [None]:
# Re-read the feature directory from the config (same value as set in the
# setup cell), then load the selected features and the target.
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME']
train_x, test_x = load_datasets_both(features)
train_y = load_train_y(target)

In [None]:
# Display the loaded train feature frame.
train_x

## memo(移動平均)

In [None]:
# Settings for the rolling statistics: group keys, aggregations, target and time columns.
cols = ['accum_minutes', 'direction', 'x', 'y']
agg_cols = ['min', 'max', 'mean', 'median']
target_col = 'congestion'
time_col = 'time'

# accum_minutes = minutes elapsed since midnight of each row's timestamp.
train[time_col] = pd.to_datetime(train[time_col])
train['accum_minutes'] = (train[time_col] - train[time_col].dt.floor('D')).dt.total_seconds() / 60
n_train = len(train)

test[time_col] = pd.to_datetime(test[time_col])
test['accum_minutes'] = (test[time_col] - test[time_col].dt.floor('D')).dt.total_seconds() / 60
n_test = len(test)

# Stack train on top of test: the first n_train rows are train, the rest are test.
train_and_test = pd.concat([train, test]).reset_index(drop=True)

# Shift the target by one row within each group (author's note: so that past data is not included)
# NOTE(review): the visible effect is that each row holds the previous row's
# target from its own group — confirm this matches the intent.
train_and_test[target_col] = train_and_test.groupby(cols)[target_col].shift(1)
grp_df = train_and_test.groupby(cols)[target_col]

outputs = []
# One set of rolling statistics per window size (only a 50-row window here).
for i in [50]:
    rolling_df = grp_df.rolling(i).agg(agg_cols)
    rolling_df = pd.DataFrame(rolling_df).add_prefix(f'rolling{i}_')
    # Keep only the 5th index level — presumably the original row label that
    # follows the four group keys — so the rows can be put back in input order.
    rolling_df.index = rolling_df.index.map(lambda x: x[4])
    outputs.append(rolling_df.sort_index())

In [None]:
# Keep the rows after the first n_train (the test part of train_and_test)
# and round the rolling statistics to whole numbers.
df = pd.concat(outputs, axis=1).iloc[n_train:, :].reset_index(drop=True)
df = df.round()
# Submission candidate using the rolling mean as the congestion prediction.
submission_mean = df[['rolling50_mean']]
submission_mean.columns = ['congestion']
submission_mean = pd.concat([test, submission_mean], axis=1)
# save_dir = MODEL_DIR_NAME + '/mean-sub7/'
# submission_mean[['row_id', 'congestion']].to_csv(save_dir + 'submission.csv', index=False)

# Submission candidate using the rolling median instead.
submission_median = df[['rolling50_median']]
submission_median.columns = ['congestion']
submission_median = pd.concat([test, submission_median], axis=1)
# save_dir = MODEL_DIR_NAME + '/median-sub8/'
# submission_median[['row_id', 'congestion']].to_csv(save_dir + 'submission.csv', index=False)

In [None]:
# Row-wise difference between the median-based and mean-based predictions.
submission_median['congestion'] - submission_mean['congestion']

In [None]:
# Collect grouped lag/lead features: shifted values and differences for lags -3..3.
# NOTE(review): `group` and `feature_cols` are not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel — define
# them (or remove the cell) before relying on Restart & Run All.
outputs = [df]
grp_df = df.groupby(group)[feature_cols]

for lag in [-3, -2, -1, 1, 2, 3]:
   # shift
   outputs.append(grp_df.shift(lag).add_prefix(f'shift{lag}_'))
   # diff
   outputs.append(grp_df.diff(lag).add_prefix(f'diff{lag}_'))

## GCS（Google Cloud Storage）へ特徴量をアップロード

In [None]:
# Save features
import os
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)
# Make the working directory importable so the local gcs_client module is found.
sys.path.append('./')
from google.cloud import storage
from gcs_client import StorageClient

# Point the Google client library at the service-account key file.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../gcs-key.json'
BUCKET_NAME = 'kaggleops-bucket-msm'
BLOB_NAME = 'data'
directry_path = f'../data/features/'

client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)
# Presumably uploads everything under directry_path to the bucket under
# BLOB_NAME — confirm against gcs_client.StorageClient.
StorageClient.upload_gcs_from_directory(bucket, directry_path, BLOB_NAME)

## モデルデータ確認

In [None]:
# Display the prediction pickle saved for run 'lgb_0306_1211'.
pd.read_pickle(MODEL_DIR_NAME + 'lgb_0306_1211/lgb_0306_1211-pred.pkl')

In [None]:
# display(train.shape, test.shape)

In [None]:
# Bar chart of the explained-variance ratio per component (x axis starts at 1).
# NOTE(review): `pca` is not defined anywhere in the visible notebook; this
# cell fails on a fresh kernel unless a fitted PCA object is created first.
plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)

In [None]:
# display(train.head(), train.tail(), train.shape)
# len(train[train['pca4'] > 0.01])
# Column-wise maximum of train.
train.max()

In [None]:
# Summary statistics of train and test, shown one after the other.
display(train.describe(), test.describe())

In [None]:
# Check the missing values of each dataset
display(
    missing_values_table(train),
    missing_values_table(test)
)

## create submission

In [None]:
# Re-apply the blanket warning filter (also set in the setup cell).
warnings.filterwarnings("ignore")

In [None]:
# Run the project's 20_run.py script in a subprocess.
# Presumably this trains the model and writes the submission — confirm against the script.
! python ../code/20_run.py