## データの読込

In [1]:
import pandas as pd

# Path of the sales CSV, relative to the notebook's working directory.
CSV_PATH = "vgsales-12-4-2019.csv"

# Read the whole file into `df`; every later cell derives its data from it.
df = pd.read_csv(CSV_PATH)

# Show the first three rows as a quick sanity check of the parsed columns.
df.head(n=3)

Unnamed: 0,Rank,Name,basename,Genre,ESRB_Rating,Platform,Publisher,Developer,VGChartz_Score,Critic_Score,...,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Year,Last_Update,url,status,Vgchartzscore,img_url
0,1,Wii Sports,wii-sports,Sports,E,Wii,Nintendo,Nintendo EAD,,7.7,...,,,,,2006.0,,http://www.vgchartz.com/game/2667/wii-sports/?...,1,,/games/boxart/full_2258645AmericaFrontccc.jpg
1,2,Super Mario Bros.,super-mario-bros,Platform,,NES,Nintendo,Nintendo EAD,,10.0,...,,,,,1985.0,,http://www.vgchartz.com/game/6455/super-mario-...,1,,/games/boxart/8972270ccc.jpg
2,3,Mario Kart Wii,mario-kart-wii,Racing,E,Wii,Nintendo,Nintendo EAD,,8.2,...,,,,,2008.0,11th Apr 18,http://www.vgchartz.com/game/6968/mario-kart-w...,1,8.7,/games/boxart/full_8932480AmericaFrontccc.jpg


## データ拡張

## 準備とベースライン(データ拡張なし)

In [4]:
import os
import time
import random
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
from natsort import natsorted 

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import tensorflow
from tgan.data import load_demo_data
from tgan.model import TGANModel

import warnings
warnings.simplefilter('ignore')

ModuleNotFoundError: No module named 'tgan'

In [None]:
# Fix the random seeds so that repeated runs use the same random streams.
SEED = 42

# Exported for hash randomization; set here for any process that reads it.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed Python's and NumPy's global generators with the same value.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [None]:
# Only rows with a known target can be used for supervised training.
df_labeled = df.dropna(subset=["Global_Sales"])

# The target plus the regional sales columns are removed from the features.
sales_columns = ["Global_Sales", "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales"]

# 70/30 shuffled hold-out split, reproducible through SEED.
X_train, X_test, y_train, y_test = train_test_split(
    df_labeled.drop(columns=sales_columns),
    df_labeled["Global_Sales"],
    test_size=0.3,
    shuffle=True,
    random_state=SEED,
)

In [None]:
### Preprocessing and model pipeline

# Names of the numeric feature columns.
number_columns = list(X_train.select_dtypes(include="number").columns)

# Names of the categorical (object dtype) feature columns.
category_columns = list(X_train.select_dtypes(include="object").columns)

# Number of distinct categories in each categorical column.
category_unique_num = X_train.select_dtypes(include="object").nunique()

# Numeric columns: median imputation followed by standardization.
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Explicit category order for the ordinal encoder: one naturally sorted list
# per column, built in exactly the order of `category_columns` so it lines up
# with the columns the encoder receives. NaN is left out because the imputer
# runs before the encoder, so missing values never reach it.
ordinal_all_cols_mapping = [
    natsorted(X_train[column].dropna().unique())
    for column in category_columns
]

# Categorical columns: most-frequent imputation followed by ordinal encoding.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(
                handle_unknown='use_encoded_value',  # map unseen categories to unknown_value
                unknown_value=-1,
                categories=ordinal_all_cols_mapping
            )
        )
    ]
)


def _make_columns_transformer():
    """Return a new, unfitted ColumnTransformer for the numeric and categorical columns."""
    return ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, number_columns),
            ('category', categorical_transformer, category_columns),
        ]
    )


# ColumnTransformer used inside the model pipeline.
columns_transformers = _make_columns_transformer()

# Stand-alone preprocessing pipeline. It gets its own ColumnTransformer
# instance: a Pipeline fits the step objects it is given, so sharing one
# instance with `pipe` would let a fit of `transformer` overwrite the
# preprocessing state used by `pipe` (and vice versa).
transformer = Pipeline(
    [
        ("columns_transformers", _make_columns_transformer()),
    ]
)

# Full pipeline: preprocessing followed by the LightGBM regressor.
pipe = Pipeline(
    [
        ("columns_transformers", columns_transformers),
        ('model', lgb.LGBMRegressor(random_state=SEED))
    ]
)

In [None]:
# Baseline training and inference: grouped 5-fold CV on the training split.
gkf = GroupKFold(n_splits=5)

# Rows sharing a Genre are kept together in one fold.
groups = X_train["Genre"]

# Not used by the code visible in this notebook section; kept for later cells.
best_params, history = {}, []

# Validation RMSE of each fold.
cv_result = []

# One column of hold-out (X_test) predictions per fold.
pred_df = pd.DataFrame()

for i, (train_index, test_index) in enumerate(gkf.split(X_train, y_train, groups)):
    X_train_gkf, X_test_gkf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_gkf, y_test_gkf = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit on the fold's training part, predict its validation part.
    pipe.fit(X_train_gkf, y_train_gkf)

    y_pred = pipe.predict(X_test_gkf)

    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and no
    # longer accepted by recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test_gkf, y_pred)))
    cv_result.append(rmse)

    # Predictions of this fold's model on the hold-out split.
    pred = pipe.predict(X_test)

    pred_df[i] = pred

print("RMSE:", cv_result)
print("RMSE:", np.mean(cv_result))

In [None]:
# Preview the per-fold hold-out predictions (one column per CV fold).
pred_df.head()

In [None]:
# Hold-out RMSE of the fold ensemble: average the per-fold predictions row-wise,
# then take sqrt(MSE). The `squared=False` keyword is avoided because it is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse_test = float(np.sqrt(mean_squared_error(y_test, pred_df.mean(axis=1))))
print(rmse_test)

## TGAN

In [None]:
# Column labels for the output of `transformer`.
# The ColumnTransformer emits the numeric block first and the categorical
# block second, so labels must follow that order rather than the raw frame's
# column order; otherwise the transformed values get the wrong names.
transformed_order = list(number_columns) + list(category_columns)
print(len(transformed_order))

### Drop columns that contain only nulls in the training data: the imputers
### discard such columns, so they do not appear in the transformed output.
all_columns = [
    column for column in transformed_order
    if X_train[column].notna().any()
]

print(len(all_columns))

In [None]:
# Fit the preprocessing on the training split only and apply it there.
X_train_tf = pd.DataFrame(
    transformer.fit_transform(X_train),
    columns=all_columns,
)

# Apply the already fitted preprocessing to the hold-out split. Calling
# fit_transform here would re-estimate the imputation/scaling statistics from
# the test data and make train and test features inconsistent.
X_test_tf = pd.DataFrame(
    transformer.transform(X_test),
    columns=all_columns,
)

In [None]:
# Attach the target to the transformed features row by row.
# concat(axis=1) aligns on the index: X_train_tf carries a fresh 0..n-1 index
# while y_train still has the shuffled original index, so both are reset to
# positional indices first to keep each target next to its own feature row.
train_df = pd.concat(
    [X_train_tf.reset_index(drop=True), y_train.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Columns to be modelled as continuous by TGAN (all others are categorical).
continuous_value_columns = ["Rank", "Critic_Score", "User_Score", "Total_Shipped", "Global_Sales", "Year"]

# Per the TGAN documentation, TGANModel expects 0-based column positions
# rather than names, so the names are resolved against train_df. Names that
# are no longer present in train_df (e.g. columns dropped as all-null during
# preprocessing) are skipped.
continuous_column_indices = [
    train_df.columns.get_loc(name)
    for name in continuous_value_columns
    if name in train_df.columns
]

tgan = TGANModel(
    continuous_column_indices,
    max_epoch=5,
    steps_per_epoch=5000,
    batch_size=100,
    )

tgan.fit(train_df)

In [None]:
# Number of synthetic rows to draw from the fitted TGAN model.
num_samples = 50_000

train_sample = tgan.sample(num_samples)

# Report the size of the synthetic frame, then preview its first rows.
print(train_sample.shape)

train_sample.head(n=3)

In [None]:
# Clean the synthetic rows: cast everything to float, forward-fill gaps, drop
# rows that are still incomplete (leading rows with nothing to fill from),
# and renumber the index.
# `.ffill()` replaces `fillna(method='ffill')`, whose `method` argument is
# deprecated in current pandas.
train_sample_fillna = (
    train_sample
    .astype(float)
    .ffill()
    .dropna(axis=0)
    .reset_index(drop=True)
)

# Split the synthetic frame into features and target.
X_train_sample_fillna = train_sample_fillna.drop(["Global_Sales"], axis=1)
y_train_sample_fillna = train_sample_fillna["Global_Sales"]

In [None]:
# Compare the sizes of the real training data and the synthetic sample
# (features shape followed by target shape, one line per data set).
for features, target in (
    (X_train[all_columns], y_train),
    (X_train_sample_fillna, y_train_sample_fillna),
):
    print(features.shape, target.shape)

In [None]:
# Stack the real (transformed) training rows on top of the synthetic rows.
# ignore_index=True renumbers the result 0..n-1, so features and targets stay
# positionally aligned.
X_train_comb = pd.concat(
    [X_train_tf[all_columns], X_train_sample_fillna],
    ignore_index=True,
)

y_train_comb = pd.concat(
    [y_train, y_train_sample_fillna],
    ignore_index=True,
)

In [None]:
# Training and inference on the TGAN-augmented data: grouped 5-fold CV.
gkf = GroupKFold(n_splits=5)

# Rows sharing an (encoded) Genre value are kept together in one fold.
groups = X_train_comb["Genre"]

# Validation RMSE of each fold.
cv_result_tgan = []

# One column of hold-out predictions per fold.
pred_df = pd.DataFrame()

# The features are already preprocessed, so the bare regressor is used.
model = lgb.LGBMRegressor(random_state=SEED)

for i, (train_index, test_index) in enumerate(gkf.split(X_train_comb, y_train_comb, groups)):
    X_train_gkf, X_test_gkf = X_train_comb.iloc[train_index], X_train_comb.iloc[test_index]
    y_train_gkf, y_test_gkf = y_train_comb.iloc[train_index], y_train_comb.iloc[test_index]

    # Fit on the fold's training part, predict its validation part.
    model.fit(X_train_gkf, y_train_gkf)

    y_pred = model.predict(X_test_gkf)

    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and no
    # longer accepted by recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test_gkf, y_pred)))
    cv_result_tgan.append(rmse)

    # Predict the hold-out split with the model trained in THIS fold, using
    # the preprocessed test features in the training column order. Predicting
    # with `pipe` here would score the baseline pipeline from the earlier
    # section instead of the augmented model.
    pred = model.predict(X_test_tf[X_train_comb.columns])

    pred_df[i] = pred

print("RMSE:", cv_result_tgan)
print("RMSE:", np.mean(cv_result_tgan))

In [None]:
# Hold-out RMSE for the TGAN-augmented run: average the per-fold predictions
# row-wise, then take sqrt(MSE). The `squared=False` keyword is avoided
# because it is deprecated and no longer accepted by recent scikit-learn.
rmse_test_tgan = float(np.sqrt(mean_squared_error(y_test, pred_df.mean(axis=1))))
print(rmse_test_tgan)

In [None]:
# Side-by-side comparison of the hold-out RMSE without and with TGAN
# augmentation, rounded to three decimals.
for label, score in (
    ("RMSE:", rmse_test),
    ("T-GAN によるデータ拡張 RMSE:", rmse_test_tgan),
):
    print(label, round(score, 3))