In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [4]:
# Data preparation: load the video-game sales table.
df = pd.read_csv('document/vgsales.csv')
# Missing publishers are kept as their own category, labelled with the literal string 'NaN'.
df['Publisher'] = df['Publisher'].fillna('NaN')
# Name is dropped before any features are built.
df = df.drop(columns="Name")
# The missing values of the Year column are the ones to be imputed with KNN.
target = "Year"
X = df.drop(columns=target)
y = df[target]
# Numeric feature columns (the ones to standardize). The list is captured before
# dummy encoding, so the indicator columns created below are not part of it.
num_cols = X.select_dtypes(include=np.number).columns.to_list()
# One-hot encode the categorical columns.
X = pd.get_dummies(X, drop_first=True)
# Standardize the numeric columns.
X[num_cols] = StandardScaler().fit_transform(X[num_cols])
# Rows whose Year is NaN form the test set; every other row is training data.
year_missing = y.isna()
test_indexes = y[year_missing].index
train_indexes = y[~year_missing].index
# Select rows with a boolean mask through .loc. Feeding index *labels* to the
# positional .iloc indexer is only correct while the index is the default
# 0..n-1 range and silently picks the wrong rows otherwise.
X_train, X_test = X.loc[~year_missing], X.loc[year_missing]
y_train, y_test = y.loc[~year_missing], y.loc[year_missing]

In [5]:
# Use KNNImputer with 3 neighbours; set_output returns the estimator itself,
# so construction and the pandas-output setting are chained in one statement.
imputer = KNNImputer(n_neighbors=3).set_output(transform='pandas')
# One-hot encode the categorical columns of the full frame.
df = pd.get_dummies(data=df, drop_first=True)
# Standardize the numeric columns listed in num_cols.
df[num_cols] = StandardScaler().fit_transform(df[num_cols])
# Fit the imputer on the encoded frame and fill its missing values.
df_imputed = imputer.fit_transform(df)

In [6]:
# Display the imputed frame (the notebook repr elides the middle rows and columns).
df_imputed

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Platform_3DO,Platform_3DS,Platform_DC,...,Publisher_Zushi Games,Publisher_bitComposer Games,Publisher_dramatic create,Publisher_fonfun,Publisher_iWin,Publisher_id Software,Publisher_imageepoch Inc.,Publisher_inXile Entertainment,"Publisher_mixi, Inc",Publisher_responDESIGN
0,-1.732076,2006.0,50.480508,57.136930,11.938058,44.606085,52.864025,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.731867,1985.0,35.284437,6.794188,21.767296,3.828224,25.532503,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.731659,2008.0,19.084273,25.197785,12.002724,17.297115,22.690025,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.731450,2009.0,18.961823,21.497277,10.353740,15.441165,20.876498,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.731241,1996.0,13.476053,17.302048,32.792857,5.047848,19.828254,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16593,1.731197,2002.0,-0.311841,-0.290207,-0.251492,-0.254864,-0.339194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16594,1.731406,2003.0,-0.311841,-0.290207,-0.251492,-0.254864,-0.339194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16595,1.731615,2008.0,-0.324086,-0.290207,-0.251492,-0.254864,-0.339194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16596,1.731823,2010.0,-0.324086,-0.270418,-0.251492,-0.254864,-0.339194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
