In [106]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

In [107]:
# 関数の定義

def process_data(X):
    """
    is_holiday : 祝日かを表すフラグ
    """
    X['is_holiday'] = ~X['description'].isna()
    X = X.drop(columns=['holiday_date', 'description'])
    
    
    """
    'match_date' と 'kick_off_time' から、'year' 'month' 'day' 'kick_off_time' を作成
    元の 'match_date' と 'kick_off_time' 列は削除。
    """
    X['match_date'] = pd.to_datetime(X['match_date'])
    X['year'] = X['match_date'].dt.year
    X['month'] = X['match_date'].dt.month
    X['day'] = X['match_date'].dt.day
    X['weekday'] = X['match_date'].dt.day_name()
    X['kick_off_hour'] = X['kick_off_time'].str.split(':').str[0].astype(int)
    X = X.drop(['match_date', 'kick_off_time'], axis=1)
    
    
    """
    天気
    晴・曇・雨の他に、晴れのち雨とかが多すぎる
    晴のち雨は雨
    """
    
    X['weather'] = X['weather'].apply(lambda x: '悪天候' if any(y in x for y in ['雨', '雪', '霧']) else ('曇' if '曇' in x else x))
    
    
    
    return X


In [108]:
# データセットの読み込み
train_data = pd.read_csv('datasets/train.csv')
test_data = pd.read_csv('datasets/test.csv')
venue_information = pd.read_csv('datasets/venue_information.csv')
match_reports = pd.read_csv('datasets/match_reports.csv')
holidays_in_japan = pd.read_csv('datasets/holidays_in_japan.csv')

# データの前処理
train_data = pd.merge(train_data, venue_information, on='venue', how="inner", validate="many_to_many")
train_data = pd.merge(train_data, holidays_in_japan, left_on='match_date', right_on='holiday_date', how="left", validate="many_to_many")
train_data = process_data(train_data)
train_data['attendance_rate'] = train_data['attendance'] / train_data['capacity']


test_data = pd.merge(test_data, venue_information, on='venue', how="inner", validate="many_to_many")
test_data = pd.merge(test_data, holidays_in_japan, left_on='match_date', right_on='holiday_date', how="left", validate="many_to_many")
test_data = process_data(test_data)


In [109]:
# 使用しない特徴量
ignore_features = ['id', 'year', 'round',  'attendance', 'attendance_rate']

# 特徴量を分類
categorical_features = []
numeric_features = []

for f in list(train_data.columns):
    if f in ignore_features:
        continue
    if train_data[f].dtype in ['object', 'bool']:
        categorical_features.append(f)
    else:
        numeric_features.append(f)
        
features = numeric_features + categorical_features


In [110]:
# Xとyに分ける
X = train_data[features]
y = train_data['attendance_rate']

X.head()

Unnamed: 0,temperature,humidity,capacity,month,day,kick_off_hour,section,home_team,away_team,venue,weather,broadcasters,address,is_holiday,weekday
0,8.3,40,21000,3,4,16,第1節,G大阪,浦和,万博記念競技場,晴,NHK総合/J SPORTS(録),大阪府吹田市千里万博公園5-2,False,Saturday
1,12.9,28,15859,3,5,13,第1節,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,山梨放送/テレビ静岡(録)/J SPORTS(録),山梨県甲府市小瀬町840,False,Sunday
2,12.1,35,47851,3,5,13,第1節,FC東京,大分,味の素スタジアム,晴,BS-i/MXテレビ(録)/J SPORTS(録),東京都調布市西町376−3,False,Sunday
3,11.6,42,51697,3,5,14,第1節,磐田,福岡,静岡スタジアムエコパ,晴,J SPORTS,静岡県袋井市愛野2300−1,False,Sunday
4,13.1,32,20223,3,5,14,第1節,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),愛知県名古屋市瑞穂区山下通5-1,False,Sunday


In [111]:
# 入力特徴量の一覧
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   temperature    3672 non-null   float64
 1   humidity       3672 non-null   int64  
 2   capacity       3672 non-null   int64  
 3   month          3672 non-null   int32  
 4   day            3672 non-null   int32  
 5   kick_off_hour  3672 non-null   int64  
 6   section        3672 non-null   object 
 7   home_team      3672 non-null   object 
 8   away_team      3672 non-null   object 
 9   venue          3672 non-null   object 
 10  weather        3672 non-null   object 
 11  broadcasters   3672 non-null   object 
 12  address        3672 non-null   object 
 13  is_holiday     3672 non-null   bool   
 14  weekday        3672 non-null   object 
dtypes: bool(1), float64(1), int32(2), int64(3), object(8)
memory usage: 376.7+ KB


In [112]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [113]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [114]:
# パイプラインの作成
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', XGBRegressor(random_state=42))], memory=None)

In [115]:
# モデルの訓練
model.fit(X_train, y_train)

# モデルの予測
y_pred = model.predict(X_val)


In [116]:
# 動員率を動員数に戻す
result_val = pd.concat([X_val, y_val, pd.Series(y_pred, index=y_val.index, name='y_pred')], axis=1)
result_val['y_val_num'] = result_val['capacity'] * result_val['attendance_rate']
result_val['y_pred_num'] = result_val['capacity'] * result_val['y_pred']
y_val = result_val['y_val_num']
y_pred = result_val['y_pred_num']


In [117]:

# モデルの評価
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print('validate RMSE', rmse)

validate RMSE 4036.36122856008


In [118]:
error = y_pred - y_val
abs_error = abs(error)

abs_error = abs_error.sort_values()

In [119]:
X_test = test_data[features]
# match_dateをdatetime型に変換
X_test.head()


Unnamed: 0,temperature,humidity,capacity,month,day,kick_off_hour,section,home_team,away_team,venue,weather,broadcasters,address,is_holiday,weekday
0,6.6,57,24130,2,23,20,第1節,鳥栖,神戸,ベストアメニティスタジアム,晴,DAZN,佐賀県鳥栖市京町812,False,Friday
1,14.1,40,47851,2,24,14,第1節,FC東京,浦和,味の素スタジアム,晴,DAZN/NHK BS1/TOKYO MX(録),東京都調布市西町376−3,False,Saturday
2,16.3,51,36894,2,24,14,第1節,広島,札幌,エディオンスタジアム広島,晴,DAZN/NHK広島/NHK札幌,広島県広島市安佐南区大塚西5丁目1−1,False,Saturday
3,12.9,42,39694,2,24,14,第1節,Ｇ大阪,名古屋,パナソニック スタジアム 吹田,曇,DAZN,大阪府吹田市千里万博公園3−3,False,Saturday
4,13.8,50,15380,2,24,16,第1節,湘南,長崎,Shonan BMW スタジアム平塚,晴,DAZN,神奈川県平塚市大原1−1,False,Saturday


In [120]:
# モデルの予測
y_test = model.predict(X_test)

In [121]:
# 動員率を動員数に戻す
result_test = pd.concat([X_test, pd.Series(y_test, index=X_test.index, name='prediction_rate')], axis=1)
result_test['prediction_num'] = result_test['capacity'] * result_test['prediction_rate']
y_test = result_test['prediction_num']


In [122]:
# 予測結果をデータフレームにする
results = pd.DataFrame({'id': test_data['id'], 'prediction': y_test})
results.to_csv('outputs/submission.csv', index=False, header=False)