In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# project files stored there are reachable through the filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import yaml
import sys
import pandas as pd
import numpy as np
import json
# Work from the project's code directory: the relative paths used further
# down (e.g. '../configs/config.yaml', '../models/...') resolve from here.
# NOTE(review): hard-coded, machine-specific Drive path — adjust it when the
# notebook is run from another account or machine.
dir_path = '/content/drive/Othercomputers/macbook-air/TPS-Mar2022/code'
os.chdir(dir_path)

In [None]:
! pip install japanize-matplotlib
! pip install shap
! pip install umap-learn
! pip install git+https://github.com/pfnet-research/xfeat.git

In [None]:
# Project settings file, relative to the working directory set above.
CONFIG_FILE = '../configs/config.yaml'

with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)

# Pull the directory settings out of the SETTING section in one pass.
RAW_DIR_NAME, MODEL_DIR_NAME, FEATURE_DIR_NAME, EDA_DIR_NAME = (
    yml['SETTING'][key]
    for key in ('RAW_DIR_NAME', 'MODEL_DIR_NAME', 'FEATURE_DIR_NAME', 'EDA_DIR_NAME')
)

# Alternative raw/feature directories (the *_IMP settings), currently disabled:
# RAW_DIR_NAME = yml['SETTING']['RAW_DIR_NAME_IMP']
# FEATURE_DIR_NAME = yml['SETTING']['FEATURE_DIR_NAME_IMP']

In [None]:
# ! python 1_generate_feature.py

## 生データ確認

In [None]:
# Read the raw train/test CSVs from the configured raw-data directory.
# RAW_DIR_NAME is concatenated directly with the file name, so it must
# already end with a path separator.
train = pd.read_csv(RAW_DIR_NAME + 'train.csv')
test = pd.read_csv(RAW_DIR_NAME + 'test.csv')

## datasets確認

In [None]:
def load_datasets_train(feats):
    """Load the train-side pickle of each feature and join them column-wise.

    Every feature name ``n`` in ``feats`` is read from
    ``FEATURE_DIR_NAME + '<n>_train.pkl'``.

    Args:
        feats: iterable of feature names.

    Returns:
        The loaded objects concatenated along ``axis=1``, in the order given.
    """
    frames = []
    for name in feats:
        frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_train.pkl'))
    return pd.concat(frames, axis=1)

def load_train_y(target):
    """Load the target column for the training set.

    Reads ``FEATURE_DIR_NAME + '<target>_train.pkl'`` and selects the
    column named ``target`` from it.

    Args:
        target: name of the target; used both as the file-name stem and as
            the column to select.

    Returns:
        A ``pd.Series`` built from that column.
    """
    target_frame = pd.read_pickle(FEATURE_DIR_NAME + f'{target}_train.pkl')
    target_column = target_frame[target]
    return pd.Series(target_column)

def load_datasets_both(feats):
    """Load the train and test pickles of each feature and join each side column-wise.

    For every feature name ``n`` in ``feats`` the files
    ``FEATURE_DIR_NAME + '<n>_train.pkl'`` and
    ``FEATURE_DIR_NAME + '<n>_test.pkl'`` are read.

    Args:
        feats: iterable of feature names.

    Returns:
        A ``(X_train, X_test)`` tuple, each the ``axis=1`` concatenation of
        the corresponding pickles in the order given.
    """
    # Train side first, mirroring the order in which files are read.
    train_parts = []
    for name in feats:
        train_parts.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_train.pkl'))
    train_frame = pd.concat(train_parts, axis=1)

    test_parts = []
    for name in feats:
        test_parts.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_test.pkl'))
    test_frame = pd.concat(test_parts, axis=1)

    return train_frame, test_frame

# Check for missing values
def missing_values_table(data):
    """Summarise missing values for every column of ``data``.

    Args:
        data: DataFrame to inspect.

    Returns:
        A transposed summary with one column per column of ``data`` and
        three rows: 'Total' (number of nulls), 'Percent' (nulls as a
        percentage of the row count) and 'Types' (dtype name as a string).
    """
    null_mask = data.isnull()
    null_count = null_mask.sum()
    null_percent = null_mask.sum() / null_mask.count() * 100
    summary = pd.concat([null_count, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[col].dtype) for col in data.columns]
    return summary.T

In [None]:
# Identifier of the model run to analyse; it selects the sub-directory
# under ../models and the file-name prefix.
run_name = 'lgb_0325_0159'

# Load the parameters saved for this run. The context manager guarantees
# the file handle is closed once the JSON has been parsed.
with open(f'../models/{run_name}/{run_name}_param.json', 'r') as json_open:
    params = json.load(json_open)

# Pickled frame saved for this run; later cells read its 'pred' column —
# presumably the predictions on the training set (TODO confirm).
# NOTE(review): the file name starts with '.', confirm that is intentional.
pred = pd.read_pickle(f'../models/{run_name}/.{run_name}-train.pkl')
used_features = params['load_features']
target = params['dataset']['target']

# Printed labels: validation method, features, target variable.
print('バリデーション:', params['cv']['method'])
print('特徴量:', used_features)
print('目的変数:', target)

In [None]:
# Extra columns loaded on top of the model's features for the analysis.
# NOTE(review): if used_features already contains any of these names, the
# concatenated frames will carry duplicate columns.
add_features = ['row_id', 'date_obj', 'direction']
features = used_features + add_features

train_x, test_x = load_datasets_both(features)
train_y = load_train_y(target)
# Features and target side by side, built from the frames exactly as loaded.
train_x_y = pd.concat([train_x, train_y], axis=1)

In [None]:
# Keep only the rows where every feature and the target are present.
keep_index = ~train_x.isna().any(axis=1) & ~train_y.isna()
# NOTE(review): reset_index() is called without drop=True, so the previous
# index is kept as an extra column and train_y (a Series from load_train_y)
# becomes a DataFrame; the following cell relies on that via train_y[target].
# train_x / train_y are rebound to their filtered versions, so re-running
# this cell without reloading them operates on already-filtered data.
train_x, train_y = train_x.loc[keep_index, :].reset_index(), train_y[keep_index].reset_index()

In [None]:
# Pair each row_id with its true target value.
true_data = pd.concat([train_x['row_id'], train_y[target]], axis=1)
# Attach the truth to the predictions (merge joins on the shared columns)
# and store it under the 'trueth' column name used by the later cells.
pred_true = pred.merge(true_data).rename(columns={target: 'trueth'})
# Mean absolute error between prediction and truth.
print('mae:', (pred_true['pred'] - pred_true['trueth']).abs().mean())

In [None]:
# Join the features needed for the analysis onto the prediction/truth frame.
# merge() is called without `on`, so it joins on the columns the two frames
# have in common.
pred_true = pred_true.merge(train_x[['row_id', 'x', 'y', 'direction', 'date_obj', 'accum_minutes']])
pred_true.head()

In [None]:
# Put pred and trueth into a single column: both are stored under the
# `target` column name, and `flag` records which of the two a row holds.
df_pred = (
    pred_true
    .drop(columns=['trueth'])
    .rename(columns={'pred': target})
    .assign(flag='pred')
)
df_true = (
    pred_true
    .drop(columns=['pred'])
    .rename(columns={'trueth': target})
    .assign(flag='trueth')
)
# Stack the two frames into one long frame with a fresh 0..n-1 index.
df_fig = pd.concat([df_pred, df_true], ignore_index=True)
df_fig.head()

In [None]:
# Investigate which (x, y, direction) groups have the largest error.
pred_true['ae'] = abs(pred_true['pred'] - pred_true['trueth'])

# Build the grouping once and reuse it for both aggregates.
error_groups = pred_true.groupby(['x', 'y', 'direction'])

# numeric_only=True keeps non-numeric columns (e.g. date_obj) out of the
# mean; without it pandas 2.x raises TypeError on such columns. The standard
# deviation is computed on the 'ae' column alone for the same reason.
(
    error_groups.mean(numeric_only=True)
    .join(error_groups['ae'].std().rename('std_ae'))
    .sort_values('ae', ascending=False)
    .head(30)
)

In [None]:
# Make the project's plotting helpers importable from the working directory.
sys.path.append('./src')
sys.path.append('./src/figures')
from src.figures.line_plots import PlotSeries5axis

# Settings passed to PlotSeries5axis through its `setting` keyword.
setting = {
    'run_name': '',  # run name
    'feature_dir_name': FEATURE_DIR_NAME,  # directory the features are loaded from
}

# NOTE: this rebinds `params`, which until here held the run's JSON parameters.
# Column roles handed to PlotSeries5axis (project class — the exact meaning
# of each key should be confirmed against its implementation).
params = {
    'col': 'x',
    'row': 'y',
    'x': 'accum_minutes',
    'y': target,
    'z': 'flag',
    'is_xlim': True,
    'is_ylim': True
}

# Disabled alternative: pick the direction by position among the unique values.
# i = 2
# unique_directions = pred_true['direction'].unique()
# direction = unique_directions[i]
direction = 'NE'

print(direction)
# Rows of the long pred/truth frame for the selected direction only.
data = df_fig.query('direction == @direction')

# The data is assigned to the plotter after construction, then the figure is drawn.
ps_5axis = PlotSeries5axis(params, features, setting=setting)
ps_5axis.data = data
ps_5axis.create_figure()

In [None]:
# Location to inspect; `direction` is the value chosen in the plotting cell.
x = 0
y = 0
# Only rows that have a prediction are kept.
row_ids = pred_true['row_id'].unique()
# Columns to display — assumes all of them exist in train_x_y (i.e. were
# among the loaded features); verify against used_features.
cols = ['date_obj', 'weekday', 'accum_minutes', 'shift1_congestion', 'shift2_congestion', 'shift3_congestion', 'date_direction_x_y_shift1_mean', 'rolling30_mean', 'congestion']
train_x_y.query('direction==@direction & x==@x & y==@y & row_id in @row_ids').sort_values('accum_minutes')[cols]

In [None]:
cols = ['date_obj', 'accum_minutes', 'x', 'y', 'direction', 'congestion']

# Convert date_obj to datetime so it can be compared with a date string in
# the query below. This overwrites the column of train_x_y in place.
train_x_y['date_obj'] = pd.to_datetime(train_x_y['date_obj'])
# First 30 rows of a single day for the selected direction and location,
# ordered by accum_minutes.
train_x_y.query('direction==@direction & x==@x & y==@y & date_obj=="1991-09-23"').sort_values('accum_minutes')[cols].head(30)

In [None]:
cols = ['date_obj', 'x', 'y', 'direction', 'weekday']
# Mean congestion for every combination of the grouping columns, shown as
# a one-column frame.
train_x_y.groupby(cols)['congestion'].mean().to_frame()

## 画像保存
