In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the project
# directory used by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Absolute, machine-specific Drive path of the project's code directory.
# Relative paths used later (e.g. '../configs/config.yaml') are resolved against it.
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
! pip install japanize-matplotlib
! pip install shap
! pip install umap-learn
! pip install git+https://github.com/pfnet-research/xfeat.git

In [None]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
import os
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Extend the import search path with the parent, grandparent and great-grandparent
# directories, all expressed as relative paths.
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # 日本語対応
import seaborn as sns
# pandas display options: up to 300 columns/rows, long cell text, floats shown with 3 decimals
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # lightweight module for running work in parallel with little setup
from tqdm import tqdm, tqdm_notebook # displays progress bars
from PIL import Image
# Register tqdm with pandas so the progress_* variants (e.g. progress_apply) become available.
tqdm.pandas()

# Automatically reload external modules before each cell execution
%load_ext autoreload
%autoreload 2

# Settings are read from a YAML file located one level above the working directory.
CONFIG_FILE = '../configs/config.yaml'

with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)

RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME']
MODEL_DIR_NAME = yml['SETTING']['MODEL_DIR_NAME']
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME']

# NOTE: the two assignments below overwrite RAW_DIR_NAME and FEATURE_DIR_NAME set just above,
# so the rest of the notebook reads from the *_IMP locations; MODEL_DIR_NAME is left unchanged.
RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME_IMP']
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME_IMP']

In [None]:
import os
# Same working-directory switch as performed earlier in the notebook; repeated here
# so this cell can be run on its own.
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
# 特徴量作成
# ! python 1_generate_feature.py

In [None]:
from dataclasses import dataclass
@dataclass
class LstmParams:
    """Plain record of three LSTM-related settings.

    In the visible cells it is only mentioned in a commented-out
    ``isinstance`` check; no instance is created here.
    """
    # presumably identifies the kind of task being run — TODO confirm against the training code
    task_type: str
    # presumably the number of training epochs — TODO confirm
    epochs: int
    # presumably the mini-batch size — TODO confirm
    batch_size: int

In [None]:
# Scratch experiment: what happens when an array and then None are appended to an empty array.
memo = np.empty(0)
ar1 = np.array([1,2,3])
ar2 = None

memo = np.append(memo, ar1)
# NOTE(review): appending None is expected to turn memo into an object-dtype array
# holding the three numbers followed by None — confirm from the printed output.
memo = np.append(memo, ar2)
print(memo)

In [None]:
# Scratch experiment: re-bind memo to a zero-length integer array and display it.
memo = np.empty(0, int)
# memo[0] = 10
memo
# isinstance(ta, LstmParams)

## 生データ確認

In [None]:
# Read the raw train/test CSVs. RAW_DIR_NAME is joined by plain string concatenation,
# so it must already end with a path separator.
train = pd.read_csv(RAW_DIR_NAME + 'train.csv')
test = pd.read_csv(RAW_DIR_NAME + 'test.csv')

In [None]:
# Check which row_id values correspond to a given date
# Overwrites train['time'] in place with its datetime-parsed version.
train['time'] = pd.to_datetime(train['time'])
# train.query('time >= "1991-09-30"')

## datasets確認

In [None]:
def load_datasets_train(feats):
    """Load the pickled training frame of every feature and join them column-wise.

    Parameters
    ----------
    feats : iterable of str
        Feature names; each one is read from
        ``FEATURE_DIR_NAME + '<name>_train.pkl'``.

    Returns
    -------
    pandas.DataFrame
        The loaded frames concatenated along ``axis=1`` in the order given.
    """
    frames = []
    for feat_name in feats:
        frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_train.pkl'))
    return pd.concat(frames, axis=1)

def load_train_y(target):
    """Load the training target column as a Series.

    Reads ``FEATURE_DIR_NAME + '<target>_train.pkl'`` and returns the column
    named ``target`` from it, wrapped in ``pandas.Series``.
    """
    target_frame = pd.read_pickle(FEATURE_DIR_NAME + f'{target}_train.pkl')
    target_column = target_frame[target]
    return pd.Series(target_column)

def load_datasets_both(feats):
    """Load the pickled train and test frames of every feature.

    Parameters
    ----------
    feats : iterable of str
        Feature names; ``'<name>_train.pkl'`` and ``'<name>_test.pkl'`` are
        read from ``FEATURE_DIR_NAME``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X_train, X_test)``, each built by concatenating the per-feature
        frames along ``axis=1``. All train files are read before any test file.
    """
    train_parts = []
    for feat_name in feats:
        train_parts.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_train.pkl'))
    X_train = pd.concat(train_parts, axis=1)

    test_parts = []
    for feat_name in feats:
        test_parts.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_test.pkl'))
    X_test = pd.concat(test_parts, axis=1)

    return X_train, X_test

# Inspect missing values
def missing_values_table(data):
    """Summarise missing values and dtypes per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``Total`` (number of
        nulls), ``Percent`` (nulls as a percentage of the row count) and
        ``Types`` (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_percent = null_counts / null_mask.count() * 100
    summary = pd.concat([null_counts, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return np.transpose(summary)

In [None]:
# Names of the feature sets to load; each name maps to '<name>_train.pkl' / '<name>_test.pkl'
# under FEATURE_DIR_NAME. Commented-out entries are currently excluded.
features = [
    # "shift_3days",
    "datetime_element",
    'decompose_direction',
    "agg_shift_by_date",
    # "rolling_30days",
    "diff_3days",
    'is_weekend',
    'agg_by_am',
    'accum_minutes_half_day',
    ]

# Name of the target: used both as the pickle file prefix and as the column name in load_train_y.
target = 'congestion'

In [None]:
# Load the selected feature sets for train and test, plus the training target.
train_x, test_x = load_datasets_both(features)
train_y = load_train_y(target)

In [None]:
# Display the assembled test features (consider test_x.head() to keep the output small).
test_x

In [None]:
# Feature sets loaded below as the base frame for time-series inspection.
ts_base_feats = [
    'accum_minutes_half_day',
    'coordinate',
    'x_y_direction',
    'date_obj',
    'row_id'
]

# The order of these columns matters as well
base_cols = ['xydirection_re', 'pm', 'accum_minutes_half_day']

In [None]:
ts_col = 'date_obj'
# Same read-and-concatenate logic as load_datasets_train, applied to ts_base_feats.
dfs = [pd.read_pickle(FEATURE_DIR_NAME + f'{f}_train.pkl') for f in ts_base_feats]
df_ts_base_feats = pd.concat(dfs, axis=1)

# Convert the time-series column to a datetime dtype
df_ts_base_feats[ts_col] = pd.to_datetime(df_ts_base_feats[ts_col])

In [None]:
# 特徴量データからsubmissionを作成する
# sub = test_x[['row_id', 'rolling50_median']]
# sub.columns = ['row_id', 'congestion']
# sub.to_csv(MODEL_DIR_NAME + 'moving-median-2/submission.csv', index=False)

In [None]:
# 特徴量データを保存する
# gcsを経由するように変更
# train_x.to_csv(MODEL_DIR_NAME + 'kaggle-notebook1/train_x.csv', index=False)
# test_x.to_csv(MODEL_DIR_NAME + 'kaggle-notebook1/test_x.csv', index=False)
# train_y.to_csv(MODEL_DIR_NAME + 'kaggle-notebook1/train_y.csv', index=False)

## gcs

In [None]:
# Save features (pushes the local feature directory to Google Cloud Storage)
import os
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)
# presumably added so that gcs_client (imported below) is found in the working directory
# `sys` comes from the import cell near the top of the notebook.
sys.path.append('./')
from google.cloud import storage
from gcs_client import StorageClient

# Credentials are read from a key file located one level above the working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../gcs-key.json'
BUCKET_NAME = 'kaggleops-bucket-msm'
BLOB_NAME = 'data'
# NOTE(review): "directry" is a typo for "directory", and the f-prefix has no placeholders.
directry_path = f'../data/features/'

client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)
# presumably uploads the contents of directry_path under the BLOB_NAME prefix — confirm in gcs_client
StorageClient.upload_gcs_from_directory(bucket, directry_path, BLOB_NAME)

## モデルデータ確認

In [None]:
import pandas as pd

# Inspect a pickle stored under the model directory of one run.
# NOTE(review): the file name starts with a dot ('.lgb_...'); confirm this is how the run saves it.
# Unpickling executes arbitrary code, so only load files produced by this project.
pd.read_pickle(MODEL_DIR_NAME + 'lgb_0410_0541-debug/.lgb_0410_0541-debug-train.pkl')

## create submission

In [None]:
# Re-apply the global warning filter (the same call is made in the import cell).
warnings.filterwarnings("ignore")

In [None]:
# Run the 20_run.py script in a subshell. The earlier cells chdir into a directory named
# 'code', so '../code/' points back at the working directory itself.
! python ../code/20_run.py