## Google Driveのマウント

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## データセットの準備

In [None]:
# kaggle ライブラリのインストール
!pip install kaggle

# 一時フォルダに .kaggleフォルダを作成
!mkdir ~/.kaggle

# MyDrive の kaggle.json を一時フォルダ内の .kaggleフォルダにコピー
!cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/

# アクセス権限の設定
!chmod 600 ~/.kaggle/kaggle.json

!mkdir ~/.kaggle

# zipファイルのダウンロード
!kaggle datasets download -d ashaheedq/video-games-sales-2019 -p /content/drive/MyDrive/kaggle

# 解凍
!unzip /content/drive/MyDrive/kaggle/video-games-sales-2019.zip -d /content/drive/MyDrive/kaggle

!rm /content/drive/MyDrive/kaggle/video-games-sales-2019.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading video-games-sales-2019.zip to /content/drive/MyDrive/kaggle
  0% 0.00/3.98M [00:00<?, ?B/s]
100% 3.98M/3.98M [00:00<00:00, 36.4MB/s]
Archive:  /content/drive/MyDrive/kaggle/video-games-sales-2019.zip
replace /content/drive/MyDrive/kaggle/vgsales-12-4-2019-short.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: None


## データの読込

In [None]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/kaggle/vgsales-12-4-2019.csv")
df.head(3)

Unnamed: 0,Rank,Name,basename,Genre,ESRB_Rating,Platform,Publisher,Developer,VGChartz_Score,Critic_Score,User_Score,Total_Shipped,Global_Sales,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Year,Last_Update,url,status,Vgchartzscore,img_url
0,1,Wii Sports,wii-sports,Sports,E,Wii,Nintendo,Nintendo EAD,,7.7,,82.86,,,,,,2006.0,,http://www.vgchartz.com/game/2667/wii-sports/?...,1,,/games/boxart/full_2258645AmericaFrontccc.jpg
1,2,Super Mario Bros.,super-mario-bros,Platform,,NES,Nintendo,Nintendo EAD,,10.0,,40.24,,,,,,1985.0,,http://www.vgchartz.com/game/6455/super-mario-...,1,,/games/boxart/8972270ccc.jpg
2,3,Mario Kart Wii,mario-kart-wii,Racing,E,Wii,Nintendo,Nintendo EAD,,8.2,9.1,37.14,,,,,,2008.0,11th Apr 18,http://www.vgchartz.com/game/6968/mario-kart-w...,1,8.7,/games/boxart/full_8932480AmericaFrontccc.jpg


## 特徴量選択

In [None]:
!pip install japanize-matplotlib
!pip install xfeat

Collecting xfeat
  Downloading xfeat-0.1.1-py3-none-any.whl (39 kB)
Collecting ml-metrics
  Downloading ml_metrics-0.1.4.tar.gz (5.0 kB)
Collecting optuna>=1.3.0
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 40.9 MB/s 
Collecting cliff
  Downloading cliff-3.10.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 9.1 MB/s 
[?25hCollecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting alembic
  Downloading alembic-1.7.5-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 59.1 MB/s 
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 4.3 MB/s 
[?25hCollecting autopage>=0.4.0
  Downloading autopage-0.4.0-py3-none-any.whl (20 kB)
Collecting cmd2>=1.0.0
  Downloading cmd2-2.3.3-py3-none-any.whl (149 kB)
[K   

In [None]:
import os
import time
import random
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
from natsort import natsorted 

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectFromModel
from xfeat import GBDTFeatureSelector

import warnings
warnings.simplefilter('ignore')

In [None]:
# シード値の固定
SEED = 42

random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
                                                    df.dropna(subset=["Global_Sales"]).drop(["Global_Sales",  "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales"], axis=1), 
                                                    df.dropna(subset=["Global_Sales"])["Global_Sales"],  
                                                    test_size=0.3,
                                                    shuffle=True, 
                                                    random_state=SEED
                                                    ) 

In [None]:
### パイプライン

# 数値データカラム名を取得
number_columns = list(X_train.select_dtypes(include="number").columns)

# カテゴリデータカラム名を取得
category_columns = list(X_train.select_dtypes(include="object").columns)

# カテゴリデータカラムの各カラムのカテゴリーの数を取得
category_unique_num = X_train.select_dtypes(include="object").nunique()

# 数値データ用の変換
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())]
)

# カテゴリーの数にしきい値を設けて、カテゴリー数の多いカラムと少ないカラムに分ける
thread = 10
many_kinds_category_columns = list(category_unique_num[category_unique_num >= thread].index)
few_kinds_category_columns = list(category_unique_num[category_unique_num < thread].index)

# カテゴリーのエンコーディング法則を指定する
ordinal_all_cols_mapping = []

for column in many_kinds_category_columns:
    ordinal_one_cols_mapping = []
    for category in natsorted(X_train[column].unique()):
        ordinal_one_cols_mapping.append(category)

    ordinal_all_cols_mapping.append(ordinal_one_cols_mapping)

# カテゴリー数が多いカテゴリーデータ用の変換
many_kinds_categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(
                handle_unknown = 'use_encoded_value', # 未知数をunknown valueに置き換える設定
                unknown_value = -1,
                categories = ordinal_all_cols_mapping
            )
        )
])

# カテゴリー数が少ないカテゴリーデータ用の変換
few_kinds_categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# ColumnTransformerの作成
columns_transformers = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, number_columns),
        ('many_kinds', many_kinds_categorical_transformer, many_kinds_category_columns),
        ('few_kinds', few_kinds_categorical_transformer, few_kinds_category_columns)
    ]
)

# カラム変換用のパイプラインの作成
transformer = Pipeline(
    [
        ("columns_transformers", columns_transformers),
    ]
)

### LighgGBM

In [None]:
# パイプラインの作成
pipe = Pipeline(
    [
        ("columns_transformers", columns_transformers),
        ('model', lgb.LGBMRegressor(random_state=42))
     ]
)

In [None]:
# 学習・推論
gkf = GroupKFold(n_splits=5)

groups = X_train["Genre"]

cv_result_lgbm = []

for i, (train_index, test_index) in enumerate(gkf.split(X_train, y_train, groups)):
    X_train_gkf, X_test_gkf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_gkf, y_test_gkf = y_train.iloc[train_index], y_train.iloc[test_index]

    # 学習、推論
    pipe.fit(X_train_gkf, y_train_gkf)

    y_pred = pipe.predict(X_test_gkf)

    rmse = mean_squared_error(y_test_gkf, y_pred, squared=False)
    cv_result_lgbm.append(rmse)

print("RMSE:", cv_result_lgbm)
print("RMSE:", np.mean(cv_result_lgbm))

RMSE: [0.3381069648542431, 0.10717670046871884, 0.15152424264212594, 0.02716385550389459, 0.3365058004617997]
RMSE: 0.1920955127861564


### 特徴量選択

In [None]:
print("現状の特徴量の数", len(X_train.columns))

現状の特徴量の数 18


In [None]:
X_train_tf = transformer.fit_transform(X_train)
X_test_tf = transformer.fit_transform(X_test)

### SequentialFeatureSelector

In [None]:
# 特徴量の選択
n_features = 15

model = lgb.LGBMRegressor(random_state=42)

sfs = SequentialFeatureSelector(
    model, 
    n_features_to_select=n_features, 
    direction='forward'
)

sfs.fit(X_train_tf, y_train)
 
 # 選択された特徴量
print("元の特徴量の数", X_train_tf.shape[1], "選択された特徴量の数", X_train_tf[:, sfs.get_support()].shape[1])

# 特徴量の選択
X_train_select = X_train_tf[:, sfs.get_support()]
X_test_select = X_test_tf[:, sfs.get_support()]

元の特徴量の数 21 選択された特徴量の数 15


In [None]:
# 学習・推論
gkf = GroupKFold(n_splits=5)

groups = X_train_select[:, 0]

cv_result_sfs = []

for i, (train_index, test_index) in enumerate(gkf.split(X_train_select, y_train, groups)):
    X_train_gkf, X_test_gkf = X_train_select[train_index], X_train_select[test_index]
    y_train_gkf, y_test_gkf = y_train.iloc[train_index], y_train.iloc[test_index]

    # 学習、推論
    model.fit(X_train_gkf, y_train_gkf)

    y_pred = model.predict(X_test_gkf)

    rmse = mean_squared_error(y_test_gkf, y_pred, squared=False)
    cv_result_sfs.append(rmse)

print("RMSE:", cv_result_sfs)
print("RMSE:", np.mean(cv_result_sfs))

RMSE: [0.21309849791068078, 0.1834027756883897, 0.20056790904927804, 0.17119539164902375, 0.23562960854119155]
RMSE: 0.20077883656771278


### SelectFromModel

In [None]:
threshold = 0.15

model = lgb.LGBMRegressor(random_state=42)

sfm = SelectFromModel(
    model, 
    threshold=threshold
)

sfm.fit(X_train_tf, y_train)
 
 # 選択された特徴量
print("元の特徴量の数", X_train_tf.shape[1], "選択された特徴量の数", X_train_tf[:, sfm.get_support()].shape[1])

# 特徴量の選択
X_train_select = X_train_tf[:, sfm.get_support()]
X_test_select = X_test_tf[:, sfm.get_support()]

元の特徴量の数 21 選択された特徴量の数 18


In [None]:
# 学習・推論
gkf = GroupKFold(n_splits=5)

groups = X_train_select[:, 0]

cv_result_sfm = []

for i, (train_index, test_index) in enumerate(gkf.split(X_train_select, y_train, groups)):
    X_train_gkf, X_test_gkf = X_train_select[train_index], X_train_select[test_index]
    y_train_gkf, y_test_gkf = y_train.iloc[train_index], y_train.iloc[test_index]

    # 学習、推論
    model.fit(X_train_gkf, y_train_gkf)

    y_pred = model.predict(X_test_gkf)

    rmse = mean_squared_error(y_test_gkf, y_pred, squared=False)
    cv_result_sfm.append(rmse)

print("RMSE:", cv_result_sfm)
print("RMSE:", np.mean(cv_result_sfm))

RMSE: [0.22343504315865, 0.17239495669006594, 0.20376657887529404, 0.18454528436624612, 0.24466960968180426]
RMSE: 0.20576229455441206


## Xfeat Feature Selection with GBDT feature importance

In [None]:
print(len(X_train))

pd.DataFrame(X_train.isnull().sum()).T

13590


Unnamed: 0,Rank,Name,basename,Genre,ESRB_Rating,Platform,Publisher,Developer,VGChartz_Score,Critic_Score,User_Score,Total_Shipped,Year,Last_Update,url,status,Vgchartzscore,img_url
0,0,0,0,0,3911,0,0,3,13590,10599,13480,13590,29,10929,0,0,13270,0


In [None]:
### Nullのみで削除されるカラムを削除
print(len(number_columns))

for column in ["VGChartz_Score", "Total_Shipped"]:
    number_columns.remove(column)

print(len(number_columns))

8
6


In [None]:
# OneHotを考慮したの全カラム作成
all_columns = number_columns + \
                        many_kinds_category_columns + \
                        pipe["columns_transformers"].transformers_[2][1]["onehot"].get_feature_names(few_kinds_category_columns).tolist()

print(len(number_columns), len(many_kinds_category_columns), len(pipe["columns_transformers"].transformers_[2][1]["onehot"].get_feature_names(few_kinds_category_columns).tolist()))
print(len(all_columns))

6 9 6
21


In [None]:
X_train_tf_pd = pd.DataFrame(
                X_train_tf,
                columns=all_columns
            )

y_train_pd = pd.DataFrame(y_train).reset_index(drop=True)

df_tf_pd = pd.concat([X_train_tf_pd, y_train_pd], axis=1)

In [None]:
params = {
    "objective": "regression",
    "seed": SEED,
}
fit_kwargs = {
    "num_boost_round": 10,
}

selector = GBDTFeatureSelector(
    input_cols=df_tf_pd.columns.tolist(),
    target_col="Global_Sales",
    threshold=0.5,
    lgbm_params=params,
    lgbm_fit_kwargs=fit_kwargs,
)

# 選択した特徴量
print("Selected columns:", selector._selected_cols)

# 特徴量選択
df_selected = selector.fit_transform(df_tf_pd)

Selected columns: ['Global_Sales', 'Rank', 'User_Score', 'Year', 'ESRB_Rating_T', 'ESRB_Rating_RP', 'ESRB_Rating_M', 'ESRB_Rating_EC', 'ESRB_Rating_E10', 'ESRB_Rating_E', 'img_url']


In [None]:
X = df_selected.drop("Global_Sales", axis=1)
y = df_selected["Global_Sales"]

# 学習・推論
gkf = GroupKFold(n_splits=5)

groups = X.iloc[:, 0]

cv_result_xfeat = []

for i, (train_index, test_index) in enumerate(gkf.split(X, y, groups)):
    X_train_gkf, X_test_gkf = X.iloc[train_index], X.iloc[test_index]
    y_train_gkf, y_test_gkf = y.iloc[train_index], y.iloc[test_index]

    # 学習、推論
    model.fit(X_train_gkf, y_train_gkf)

    y_pred = model.predict(X_test_gkf)

    rmse = mean_squared_error(y_test_gkf, y_pred, squared=False)
    cv_result_xfeat.append(rmse)

print("RMSE:", cv_result_xfeat)
print("RMSE:", np.mean(cv_result_xfeat))

RMSE: [0.19143688774135434, 0.1522579565915211, 0.20600288376790432, 0.19758492885001533, 0.25000689179730856]
RMSE: 0.19945790974962074


In [None]:
print("RMSE:", round(np.mean(cv_result_lgbm),3))
print("SequentialFeatureSelector RMSE:", round(np.mean(cv_result_sfs),3))
print("SelectFromModel RMSE:", round(np.mean(cv_result_sfm),3))
print("Xfeat GBDTFeatureSelector RMSE:", round(np.mean(cv_result_xfeat),3))

RMSE: 0.192
SequentialFeatureSelector RMSE: 0.201
SelectFromModel RMSE: 0.206
Xfeat GBDTFeatureSelector RMSE: 0.199
