In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change the working directory to the competition's notebooks folder.
import os

comp_name = "tabular-playground-series-mar-2022"
root_dir_name = '/content/drive/Othercomputers/macbook-air/'
comp_dir_name = "TPS-Mar2022"

# root_dir_name already ends with '/', so plain concatenation yields the full path.
dir_path = root_dir_name + comp_dir_name + '/notebooks'
os.chdir(dir_path)

In [None]:
# Path definitions: directory names are read from the 'SETTING' section of the YAML config.
import yaml

CONFIG_FILE = '../configs/config.yaml'

with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)

# Unpack the five directory names in one statement; each is looked up as yml['SETTING'][<name>].
(
    MODEL_DIR_NAME,
    FEATURE_DIR_NAME,
    RAW_DIR_NAME,
    SUB_DIR_NAME,
    EDA_DIR_NAME,
) = (
    yml['SETTING'][key]
    for key in (
        'MODEL_DIR_NAME',
        'FEATURE_DIR_NAME',
        'RAW_DIR_NAME',
        'SUB_DIR_NAME',
        'EDA_DIR_NAME',
    )
)

# Point KAGGLE_CONFIG_DIR at the competition directory.
os.environ['KAGGLE_CONFIG_DIR'] = root_dir_name + comp_dir_name

In [None]:
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm as tqdm

## CSVファイルの読み込み

In [None]:
# Load the raw train/test tables from RAW_DIR_NAME.
# The file name is appended by plain string concatenation, so RAW_DIR_NAME
# presumably ends with a path separator — TODO confirm against config.yaml.
train = pd.read_csv(RAW_DIR_NAME + 'train.csv')
test = pd.read_csv(RAW_DIR_NAME + 'test.csv')

In [None]:
# (n_rows, n_columns) of the training table.
# A DataFrame's shape is a 2-tuple, so shape[-2:] and shape are the same value;
# show it once as the cell's output instead of printing it twice.
train.shape

In [None]:
class FigurePlot():
    """Base class that loads feature tables and saves one figure per run.

    Subclasses implement ``create_figure(params)``, which draws the figure with
    matplotlib; ``run_and_save`` then writes the current figure to
    ``<out_dir_name><run_name>.png`` and closes it.
    """

    def __init__(self, model_params, features, setting):
        """Store the run configuration and load the train/test feature tables.

        :param model_params: plot parameters, passed unchanged to ``create_figure``
        :param features: feature names; ``<name>_train.pkl`` and ``<name>_test.pkl``
            are read from ``feature_dir_name`` for each of them
        :param setting: dict with 'run_name', 'feature_dir_name' and the output
            directory under 'out_dir_name' (falls back to 'eda_dir_name')
        """
        self.run_name = setting.get('run_name')
        self.params = model_params
        self.features = features
        self.feature_dir_name = setting.get('feature_dir_name')
        # Accept 'eda_dir_name' as well: without the fallback a setting dict that
        # only carries that key leaves out_dir_name as None and saving fails.
        self.out_dir_name = setting.get('out_dir_name') or setting.get('eda_dir_name')
        self.train = self.load_train()
        self.test = self.load_test()

    def create_figure(self, params):
        """Draw the figure for ``params``; must be overridden by subclasses.

        :param params: the ``model_params`` given to the constructor
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('subclasses must implement create_figure(params)')

    def run(self):
        """Draw and save the figure (alias of ``run_and_save``)."""
        self.run_and_save()

    def run_and_save(self):
        """Draw the figure via ``create_figure`` and save it as a PNG.

        :raises ValueError: if no output directory was configured
        """
        if self.out_dir_name is None:
            raise ValueError(
                "setting must provide 'out_dir_name' (or 'eda_dir_name') to save the figure"
            )
        self.create_figure(self.params)
        # The path is built after drawing because create_figure may rewrite self.run_name.
        plt.savefig(self.out_dir_name + f'{self.run_name}.png', dpi=300, bbox_inches="tight")
        plt.close()

    def load_train(self) -> pd.DataFrame:
        """Load the training features.

        One pickle per feature name is read and the frames are concatenated
        column-wise. Anything beyond selecting features by name requires
        editing this method.

        :return: training features
        """
        # NOTE: read_pickle can execute arbitrary code; only load pickles you produced yourself.
        dfs = [pd.read_pickle(self.feature_dir_name + f'{f}_train.pkl') for f in self.features]
        df = pd.concat(dfs, axis=1)

        # To train with specific rows excluded ----------
        # self.remove_train_index = df[(df['age']==64) | (df['age']==66) | (df['age']==67)].index
        # df = df.drop(index = self.remove_train_index)
        # -----------------------------------------
        return df

    def load_test(self) -> pd.DataFrame:
        """Load the test features.

        One pickle per feature name is read and the frames are concatenated column-wise.

        :return: test features
        """
        # NOTE: read_pickle can execute arbitrary code; only load pickles you produced yourself.
        dfs = [pd.read_pickle(self.feature_dir_name + f'{f}_test.pkl') for f in self.features]
        df = pd.concat(dfs, axis=1)
        return df

In [None]:
class PlotSeries5axis(FigurePlot):
    """Grid of line plots over the training data: one subplot per (row value, col value)."""

    def create_figure(self, params):
        """Draw ``y`` against ``x`` (optionally split by hue ``z``) on a grid of subplots.

        Grid columns are the unique values of ``params['col']`` and grid rows the
        unique values of ``params['row']`` in the training data. Also prefixes
        ``self.run_name`` with the '-'-joined column names (no separator is
        inserted before the original run name).

        :param params: dict with the column names 'col', 'row', 'x', 'y' (required),
            'z' (optional hue column), 'estimator' (default 'mean'), 'ci'
            (default 95), and 'is_xlim' / 'is_ylim' (when True, every subplot
            uses the full data range on that axis)
        :raises KeyError: if a required column name is missing from ``params``
        """
        col = params.get('col')
        row = params.get('row')
        x = params.get('x')
        y = params.get('y')
        z = params.get('z')
        estimator = params.get('estimator')
        ci = params.get('ci')
        is_xlim = params.get('is_xlim')
        is_ylim = params.get('is_ylim')

        missing = [key for key in ('col', 'row', 'x', 'y') if params.get(key) is None]
        if missing:
            raise KeyError(f'params is missing required key(s): {missing}')

        estimator = 'mean' if estimator is None else estimator
        ci = 95 if ci is None else ci

        self.run_name = '-'.join(filter(None, [col, row, x, y, z])) + self.run_name
        self.data = self.train

        col_categorical_uniques = self.data[col].unique()
        row_categorical_uniques = self.data[row].unique()
        n_col = len(col_categorical_uniques)
        n_row = len(row_categorical_uniques)
        # squeeze=False keeps `axes` 2-D even when the grid has a single row or column,
        # so axes[j, i] is always valid.
        fig, axes = plt.subplots(n_row, n_col, figsize=(n_col*5, n_row*5), squeeze=False)

        # The limits are the same for every subplot, so compute them once.
        x_limits = (self.data[x].min(), self.data[x].max()) if is_xlim is True else None
        y_limits = (self.data[y].min(), self.data[y].max()) if is_ylim is True else None

        # Wrap the values (not the enumerate object) so tqdm knows the total.
        for i, col_v in enumerate(tqdm(col_categorical_uniques)):
            col_data = self.data.loc[self.data[col] == col_v, :]
            # Iterate the global row values so a given row value always lands on the
            # same grid row, even when some combinations are absent from the data.
            for j, row_v in enumerate(row_categorical_uniques):
                ax = axes[j, i]
                ax.set_title(f'{col}: {col_v}, {row}: {row_v}')
                cell_data = col_data.loc[col_data[row] == row_v, :]
                if not cell_data.empty:
                    # NOTE(review): seaborn 0.12+ deprecates `ci` in favour of `errorbar`.
                    sns.lineplot(data=cell_data, x=x, y=y, hue=z, ax=ax, estimator=estimator, ci=ci)
                    if z is not None:
                        # Without a hue column there are no labelled artists to list.
                        ax.legend(loc='upper left')
                if x_limits is not None:
                    ax.set_xlim(*x_limits)
                if y_limits is not None:
                    ax.set_ylim(*y_limits)

In [None]:
  setting = {
      'run_name': 'test',  # run name; becomes part of the saved figure's file name
      'feature_dir_name': FEATURE_DIR_NAME,  # directory the feature pickles are read from
      'out_dir_name': EDA_DIR_NAME,  # output directory; this is the key FigurePlot.__init__ reads
      'eda_dir_name': EDA_DIR_NAME  # same directory under the earlier key name
  }

  # Column roles for the plot grid: one subplot per (row, col) value pair,
  # x/y are the plotted columns and z is the hue column.
  model_params = {
      'col': 'y',
      'row': 'x',
      'x': 'accum_minutes',
      'y': 'congestion',
      'z': 'direction',
      'is_xlim': True,
      'is_ylim': True
  }

  # Feature names; '<name>_train.pkl' / '<name>_test.pkl' are loaded for each.
  features = ['rawdata', 'datetime_info', 'accum_minutes']


In [None]:
run_name = 'test'

# Build the plotter before using it; with the construction commented out the
# attribute access below raises NameError on a fresh kernel.
plot_series_5axis = PlotSeries5axis(model_params, features, setting)
plot_series_5axis.train.head()
# plot_series_5axis.run_and_save()  # uncomment to draw the grid and save it as a PNG

In [None]:
  def plot_series_4axis(self, col, x, y, z=None, estimator='mean', ci=95, is_xlim=None, is_ylim=None):
    """Draw one line plot of ``y`` against ``x`` per unique value of ``col``, side by side.

    :param col: column of ``self.train`` whose unique values define the subplots
    :param x: column plotted on the x axis
    :param y: column plotted on the y axis
    :param z: optional hue column
    :param estimator: aggregation passed to ``sns.lineplot``
    :param ci: confidence-interval size passed to ``sns.lineplot``
    :param is_xlim: when True, every subplot uses the full x range of ``self.train``
    :param is_ylim: when True, every subplot uses the full y range of ``self.train``
    """
    col_categorical_uniques = self.train[col].unique()
    n_col = len(col_categorical_uniques)
    # squeeze=False always returns a 2-D array, so indexing also works for a single subplot.
    fig, axes = plt.subplots(1, n_col, figsize=(n_col*5, 5), squeeze=False)
    axes = axes[0]

    # Wrap the values (not the enumerate object) so tqdm knows the total.
    for i, col_v in enumerate(tqdm(col_categorical_uniques)):
      ax = axes[i]
      data = self.train.loc[self.train[col]==col_v, :]
      ax.set_title(f'{col}: {col_v}')
      sns.lineplot(data=data, x=x, y=y, hue=z, ax=ax, estimator=estimator, ci=ci)
      if z is not None:
        # Without a hue column there are no labelled artists to list.
        ax.legend(loc='upper left')
      # Limits come from the frame that is actually plotted (self.train).
      if is_xlim is True:
        ax.set_xlim(self.train[x].min(), self.train[x].max())
      if is_ylim is True:
        ax.set_ylim(self.train[y].min(), self.train[y].max())
    plt.show()

  # def plot_series_3axis(self, x, y, z=None):
  #   fig, ax = plt.subplots(1, 1, figsize=(10, 10))
  #   sns.lineplot(data=self.train, x=x, y=y, hue=z, ax=ax, estimator=estimator, ci=ci)
  #   ax.legend(loc='upper left')
  #   if is_xlim is True:
  #     ax.set_xlim(train[x].min(), train[x].max())
  #   if is_ylim is True:
  #     ax.set_xyim(train[y].min(), train[y].max())
  #   plt.show()

  # def plot_series_3axis_multiple_figs(self, x, y, z):
  #   z_uniques = self.train[z].unique()
  #   n_z = len(z_uniques)
  #   fig, axes = plt.subplots(1, n_z, figsize=(5*n_z, 5))

  #   for i, v in enumerate(z_uniques):
  #     sns.lineplot(data=self.train.loc[self.train[z]==v, :], x=x, y=y, ax=axes[i], estimator=estimator, ci=ci)
  #     axes[i].legend(loc='upper left')
  #     if is_xlim is True:
  #       axes[i].set_xlim(train[x].min(), train[x].max())
  #     if is_ylim is True:
  #       axes[i].set_xyim(train[y].min(), train[y].max())
  #   plt.show()

In [None]:
# NOTE(review): no PlotSeries class is defined in the visible cells of this
# notebook — confirm where it comes from before relying on this cell.
plot_series = PlotSeries(train, test)
# plot_series.train = train.query('accum_minutes <= 40')

In [None]:
# NOTE(review): in the visible cells, col, x, y and z are only assigned further
# down, so this call depends on those cells having been run first — TODO confirm order.
plot_series.plot_series_4axis(col, x, y, z)

In [None]:
# Plots along the time axis: col/row are set to the month and day columns.
col = 'month'
row = 'day'
x = 'accum_minutes'
y = 'congestion'
z = 'direction'

# plot_series.plot_series_5axis(col, row, x, y, z)
# z is assigned above but not passed in the active call below.
plot_series.plot_series_5axis(col, row, x, y)

In [None]:
# Plots along the coordinate axes: col/row are set to the x and y columns.
col = 'x'
row = 'y'
x = 'accum_minutes'
y = 'congestion'
z = 'direction'

plot_series.plot_series_5axis(col, row, x, y, z)

In [None]:
# 時系列グラフの変換
# カテゴリごとに前回のデータ取得
# 月、週、日→1~3回平均、1階~3階差分

In [None]:
from datetime import timedelta

i = 1  # lag, in days

# train was read with pd.read_csv without parse_dates, so convert 'time' before
# doing date arithmetic on it. Re-running this on an already-converted column is harmless.
train['time'] = pd.to_datetime(train['time'])
train['diff_time'] = train['time'] + timedelta(days=i)

# Join keys of the form "<time>_<x>_<y>_<direction>": one built from the row's own
# time, one built from the time shifted i day(s) forward.
train['time_categorical'] = train['time'].map(lambda x: str(x)+ '_') + train['x'].map(lambda x: str(x) + '_') + train['y'].map(lambda x: str(x) + '_') + train['direction']
train['difftime_categorical'] = train['diff_time'].map(lambda x: str(x)+ '_') + train['x'].map(lambda x: str(x) + '_') + train['y'].map(lambda x: str(x) + '_') + train['direction']

# A row's own key matches the shifted key of the row recorded i day(s) earlier at the
# same x/y/direction, so 'congestion_y' is the congestion from i day(s) before.
# how='left' keeps every train row in its original order (NaN where no earlier row
# exists); validate='m:1' makes pandas raise if a shifted key is duplicated, which
# would otherwise add rows and break the alignment below.
df = pd.merge(train[['time_categorical', 'congestion']], train[['difftime_categorical', 'congestion']], left_on='time_categorical', right_on='difftime_categorical', how='left', validate='m:1')

# The merged frame has a fresh index, so assign by position rather than by index label.
train[f'diff_{i}days'] = df['congestion_y'].to_numpy()

In [None]:
i = 1
train['time'] = pd.to_datetime(train['time'])
train['diff'] = train['time'] + timedelta(days=i)

# Key for each row: "<time>_<x>_<y>_<direction>".
train['time_categorical'] = (
    train['time'].map(lambda ts: str(ts) + '_')
    + train['x'].map(lambda v: str(v) + '_')
    + train['y'].map(lambda v: str(v) + '_')
    + train['direction']
)
# Same key, built from the time shifted i day(s) forward.
train['diff_categorical'] = (
    train['diff'].map(lambda ts: str(ts) + '_')
    + train['x'].map(lambda v: str(v) + '_')
    + train['y'].map(lambda v: str(v) + '_')
    + train['direction']
)

# Left-join each row's own key against the shifted keys.
df = train[['time_categorical', 'congestion']].merge(
    train[['diff_categorical', 'congestion']],
    left_on='time_categorical',
    right_on='diff_categorical',
    how='left',
)

# test['time'] = pd.to_datetime(test['time'])
# test['time_categorical'] = test['time'].map(lambda x: str(x)+ '_') + test['x'].map(lambda x: str(x) + '_') + test['y'].map(lambda x: str(x) + '_') + test['direction']
# df = pd.merge(test[['time_categorical']], train[['diff_categorical', 'congestion']], left_on='time_categorical', right_on='diff_categorical', how='left')

In [None]:
# len(train['time_categorical'].unique())
# Row count of the merged frame; compare with len(train) to check that the
# left merge did not duplicate rows.
len(df)