In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project's code folder on the mounted Drive,
# so that the relative paths used below (e.g. '../configs/config.yaml') resolve.
import os
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
# Install the extra third-party packages into the runtime.
# NOTE(review): versions are not pinned, so results may change between sessions.
! pip install japanize-matplotlib
! pip install shap
! pip install umap-learn
! pip install git+https://github.com/pfnet-research/xfeat.git

In [None]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
import os
warnings.filterwarnings("ignore")
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # Japanese text support for matplotlib
import seaborn as sns
# pandas display options
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # lightweight module for quick parallel execution
from tqdm import tqdm, tqdm_notebook # progress bars
from PIL import Image
tqdm.pandas()

# Automatically reload external modules
%load_ext autoreload
%autoreload 2

# Path to the project's YAML config (relative to the working directory).
CONFIG_FILE = '../configs/config.yaml'

with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)

# Directory settings read from the SETTING section of the config.
RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME']
MODEL_DIR_NAME = yml['SETTING']['MODEL_DIR_NAME']
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME']

# Alternative settings using the *_IMP config keys (currently disabled).
# RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME_IMP']
# FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME_IMP']

In [None]:
# NOTE(review): repeats the working-directory setup already done in an earlier cell.
import os
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
# Feature generation (disabled)
# ! python 1_generate_feature.py

## 生データ確認

In [None]:
# Load the raw train/test CSVs from the directory configured as RAW_DIR_NAME.
train = pd.read_csv(RAW_DIR_NAME + 'train.csv')
test = pd.read_csv(RAW_DIR_NAME + 'test.csv')

In [None]:
# Check which row_id values correspond to a given date
# (the time column is converted to datetime in place on `train`).
train['time'] = pd.to_datetime(train['time'])
# train.query('time >= "1991-09-30"')

### memo(時系列データの欠損行の補完)


In [None]:
def impute_time_series(df, time_col, freq):
  """Add the rows that are missing from each (x, y, direction) time series.

  Builds the complete grid of every timestamp between the earliest and latest
  value of ``time_col`` (spaced by ``freq``) crossed with every
  (x, y, direction) combination that occurs in ``df``, then outer-merges that
  grid with ``df``. Rows that exist only in the grid carry NaN in every
  non-key column (e.g. 'congestion').

  Args:
    df: Frame containing ``time_col``, 'x', 'y', 'direction' and 'congestion'.
      The caller's frame is left untouched.
    time_col: Name of the timestamp column (any values ``pd.to_datetime``
      accepts).
    freq: Pandas frequency string giving the expected spacing, e.g. '20min'.

  Returns:
    A new frame sorted by time, x, y, direction and congestion, with a fresh
    0..n-1 index. The resulting shape is also printed.
  """
  key_cols = ['x', 'y', 'direction']

  # Work on a copy so the caller's frame neither gains helper columns nor has
  # its time column converted as a side effect.
  df = df.copy()
  df[time_col] = pd.to_datetime(df[time_col])

  # Complete timeline between the first and last observed timestamps.
  full_times = pd.DataFrame({
    time_col: pd.date_range(start=df[time_col].min(), end=df[time_col].max(), freq=freq)
  })
  # Only the combinations that actually occur, not the full x * y * direction product.
  series_keys = df[key_cols].drop_duplicates()

  # Complete (series x timestamp) grid. Crossing the key columns directly keeps
  # their dtypes and avoids joining/splitting them through a '_'-separated string.
  absolute_series = series_keys.merge(full_times, how='cross')

  # Merge on explicit keys so the result is correct for any `time_col` name.
  df_imputation = pd.merge(df, absolute_series, how='outer', on=[time_col] + key_cols)
  print(df_imputation.shape)
  return df_imputation.sort_values([time_col] + key_cols + ['congestion']).reset_index(drop=True)

In [None]:
# Complete the training time series on a 20-minute grid.
time_col = 'time'
freq = '20min'
df = impute_time_series(train, time_col, freq)

# # Fill missing values
# df['congestion'] = df['congestion'].fillna(df['congestion'].mean())
# Keep the original row_id in its own column, then renumber row_id from the new index.
df['original_row_id'] = df['row_id']
df['row_id'] = df.index
print(df.info())

# Output directory for the imputed raw data (the save itself is disabled).
RAW_DIR_NAME_IMP = yml['SETTING']['RAW_DIR_NAME_IMP']
# df.to_csv(RAW_DIR_NAME_IMP + 'train.csv', index=False)

In [None]:
col = 'direction'
target = 'congestion'
# Label-encode the column: `labels` holds the sorted unique values and
# `encorded_arr` the integer position of each row's value within `labels`.
labels, encorded_arr = np.unique(df[col].to_numpy(), return_inverse=True)
# Overwrites the column in place, so re-running this cell encodes the integer
# codes again and `labels` no longer holds the original values.
df[col] = encorded_arr

In [None]:
# Display the imputed frame after label encoding.
df

In [None]:
# Convert the frame to a NumPy record array
# sliding_window_view is otherwise imported only in a later cell, which makes
# this cell fail with NameError on a fresh kernel.
from numpy.lib.stride_tricks import sliding_window_view

arr = df.to_records(index=False)
print(arr[:3])

# Grouping: one column per label, 1.0 where the row carries that label and NaN elsewhere
n_labels = len(labels)
onehot = np.eye(arr[col].max()+1)[arr[col]]
onehot[onehot == 0] = np.nan
print(onehot.shape)

# Shape (n_labels, n_rows): the target value where the row has the label, NaN elsewhere
grp_arr =(onehot * arr[target].reshape(-1, 1)).transpose()
print(grp_arr.shape)

# Rolling median per label
def func(arr, i, window):
  """Return the NaN-ignoring median of every length-`window` run of label `i`.

  `arr` is one row of `grp_arr`; only the positions whose row carries label
  `i` (looked up in the module-level `onehot`) are kept before windowing.
  """
  indicies = np.where(~np.isnan(onehot.transpose()[i]))
  arr_by_label = arr[indicies]
  return np.nanmedian(sliding_window_view(arr_by_label, window), axis=1)

# The per-label results may differ in length; dtype=object keeps np.array from
# raising on such ragged input (an error on NumPy >= 1.24).
rolling_arr = np.array([func(row, i, 10) for i, row in enumerate(grp_arr)], dtype=object)

In [None]:
# Total number of rolling windows over all labels.
# A generator keeps the loop variable local, so the module-level `arr`
# (the record array) is not overwritten by this cell.
count = sum(len(label_windows) for label_windows in rolling_arr)
print(count)
# Each label yields len(series) - window + 1 windows, so the expected total is
# n_rows - (window - 1) * n_labels.
# Original note (figures not re-verified): 853100 = 854028 + window(=9) * n_labels(=8)

In [None]:
# Display the per-label target matrix.
grp_arr

In [None]:
# Preview the length-3 sliding windows along each row of grp_arr.
# sliding_window_view is otherwise imported only in a later cell.
from numpy.lib.stride_tricks import sliding_window_view

# Windowing along axis 1 gives the same (n_rows, n_windows, 3) result as applying
# a per-row function, without rebinding the name `func` used by the rolling-median cell.
sliding_window_view(grp_arr, 3, axis=1)

In [None]:
# Display the per-label rolling medians.
rolling_arr

In [None]:
from numpy.lib.stride_tricks import sliding_window_view

# Toy input for experimenting.
# NOTE(review): immediately overwritten by the next line, so it has no effect.
rolling_df = np.array([0, 1, np.nan, 3, 4, 5])
rolling_df = df['congestion']

# Prints the first length-3 window four times (index list [0,0,0,0]).
print(sliding_window_view(rolling_df, 3)[[0,0,0,0]])
# NaN-ignoring median of every length-3 window.
np.nanmedian(sliding_window_view(rolling_df, 3), axis=1)

## datasets確認

In [None]:
def load_datasets_train(feats):
    """Load the training pickle of every feature and join them column-wise.

    For each name in ``feats`` this reads
    ``FEATURE_DIR_NAME + '<name>_train.pkl'`` and concatenates the resulting
    frames along axis 1.
    """
    frames = []
    for f in feats:
        frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{f}_train.pkl'))
    return pd.concat(frames, axis=1)

def load_train_y(target):
    """Return the training target as a Series.

    Reads ``FEATURE_DIR_NAME + '<target>_train.pkl'`` and selects the column
    named ``target`` from it.
    """
    target_frame = pd.read_pickle(FEATURE_DIR_NAME + f'{target}_train.pkl')
    target_values = target_frame[target]
    return pd.Series(target_values)

def load_datasets_both(feats):
    """Load the train and test pickles of every feature.

    For each name in ``feats`` this reads ``'<name>_train.pkl'`` and
    ``'<name>_test.pkl'`` under ``FEATURE_DIR_NAME`` and joins each split
    column-wise.

    Returns:
        Tuple ``(X_train, X_test)``.
    """
    train_frames = []
    for f in feats:
        train_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{f}_train.pkl'))
    X_train = pd.concat(train_frames, axis=1)

    test_frames = []
    for f in feats:
        test_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{f}_test.pkl'))
    X_test = pd.concat(test_frames, axis=1)

    return X_train, X_test

# Missing-value summary
def missing_values_table(data):
    """Summarise the missing values of every column in ``data``.

    Returns a frame with one column per column of ``data`` and three rows:
    'Total' (number of nulls), 'Percent' (nulls as a percentage of the row
    count) and 'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    null_percent = null_total / null_mask.count() * 100
    summary = pd.concat([null_total, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

In [None]:
# Names of the feature sets to load; the loaders read '<name>_train.pkl' and
# '<name>_test.pkl' under FEATURE_DIR_NAME for each entry.
features = [
    "shift_3days",
    "datetime_element",
    'accum_minutes_half_day',
    'coordinate',
    'decompose_direction',
    "agg_shift_by_date",
    # "rolling_30days",
    "diff_3days",
    'is_weekend',
    'agg_by_am',
    ]

# Name of the target column.
target = 'congestion'

In [None]:
# Load the selected feature sets for both splits, plus the training target.
train_x, test_x = load_datasets_both(features)
train_y = load_train_y(target)

In [None]:
# Load the training row_id feature and select the rows listed in `keep_index`.
# NOTE(review): `keep_index` is not defined in any visible cell of this notebook,
# so this raises NameError on a fresh kernel — confirm where it should come from.
row_ids = pd.read_pickle(FEATURE_DIR_NAME + 'row_id' + '_train.pkl')
row_ids.loc[keep_index, :]

In [None]:
# Create a submission directly from the feature data (disabled)
# sub = test_x[['row_id', 'rolling50_median']]
# sub.columns = ['row_id', 'congestion']
# sub.to_csv(MODEL_DIR_NAME + 'moving-median-2/submission.csv', index=False)

In [None]:
# Save the feature data
# (changed to go through GCS)
train_x.to_csv(MODEL_DIR_NAME + 'kaggle-notebook1/train_x.csv', index=False)
test_x.to_csv(MODEL_DIR_NAME + 'kaggle-notebook1/test_x.csv', index=False)
train_y.to_csv(MODEL_DIR_NAME + 'kaggle-notebook1/train_y.csv', index=False)

## gcs

In [None]:
# Save the features to GCS
import os
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)
# Make modules in the working directory importable (gcs_client is imported below).
sys.path.append('./')
from google.cloud import storage
from gcs_client import StorageClient

# Point the Google client library at the service-account key file.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../gcs-key.json'
BUCKET_NAME = 'kaggleops-bucket-msm'
BLOB_NAME = 'data'
directry_path = f'../data/features/'

client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)
# Presumably uploads everything under `directry_path` to the bucket beneath the
# BLOB_NAME prefix — TODO confirm against gcs_client.StorageClient.
StorageClient.upload_gcs_from_directory(bucket, directry_path, BLOB_NAME)

## モデルデータ確認

In [None]:
# Display the pickled predictions saved for the run 'lgb_0306_1211'.
pd.read_pickle(MODEL_DIR_NAME + 'lgb_0306_1211/lgb_0306_1211-pred.pkl')

In [None]:
# display(train.shape, test.shape)

In [None]:
# Bar chart of the explained-variance ratio per component (components numbered from 1).
# NOTE(review): `pca` is not defined in any visible cell of this notebook, so this
# raises NameError on a fresh kernel — confirm where the fitted PCA should come from.
plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)

In [None]:
# display(train.head(), train.tail(), train.shape)
# len(train[train['pca4'] > 0.01])
# Column-wise maximum of the training frame.
train.max()

In [None]:
# Summary statistics of the train and test frames.
display(train.describe(), test.describe())

In [None]:
# Check the missing values in each dataset
display(
    missing_values_table(train),
    missing_values_table(test)
)

## create submission

In [None]:
# Suppress all warnings (the same filter is already set in the import cell).
warnings.filterwarnings("ignore")

In [None]:
# Run the project script in a shell; presumably it trains the model and writes the
# submission (per this section's title) — TODO confirm against 20_run.py.
! python ../code/20_run.py