In [259]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
import lightgbm as lgb

In [260]:
# 関数の定義

def process_data(X):
    """
    is_holiday : 祝日かを表すフラグ
    """
    X['is_holiday'] = ~X['description'].isna()
    X = X.drop(columns=['holiday_date', 'description'])
    
    
    """
    'match_date' と 'kick_off_time' から、'year' 'month' 'day' 'kick_off_time' を作成
    元の 'match_date' と 'kick_off_time' 列は削除。
    """
    X['match_date'] = pd.to_datetime(X['match_date'])
    X['year'] = X['match_date'].dt.year
    X['month'] = X['match_date'].dt.month
    X['day'] = X['match_date'].dt.day
    X['weekday'] = X['match_date'].dt.day_name()
    X['kick_off_hour'] = X['kick_off_time'].str.split(':').str[0].astype(int)
    X = X.drop(['match_date', 'kick_off_time'], axis=1)
    
    
    """
    天気
    晴・曇・雨の他に、晴れのち雨とかが多すぎる
    晴のち雨は雨
    """
    
    X['weather'] = X['weather'].apply(lambda x: '悪天候' if any(y in x for y in ['雨', '雪', '霧']) else ('曇' if '曇' in x else x))
    
    
    """
    
    """
    X['broadcasters_count'] = X['broadcasters'].apply(lambda x: len(x.split('/')))
    # X['main_broadcaster'] = X['broadcasters'].apply(lambda x: x.split('/')[0])
    
    
    return X


In [261]:
# データセットの読み込み
train_data = pd.read_csv('datasets/train.csv')
test_data = pd.read_csv('datasets/test.csv')
venue_information = pd.read_csv('datasets/venue_information.csv')
match_reports = pd.read_csv('datasets/match_reports.csv')
holidays_in_japan = pd.read_csv('datasets/holidays_in_japan.csv')

# データの前処理
train_data = pd.merge(train_data, venue_information, on='venue', how="inner", validate="many_to_many")
train_data = pd.merge(train_data, holidays_in_japan, left_on='match_date', right_on='holiday_date', how="left", validate="many_to_many")
train_data = process_data(train_data)
train_data['attendance_rate'] = train_data['attendance'] / train_data['capacity']
# train_data = train_data[train_data['year'] >= 2012]


test_data = pd.merge(test_data, venue_information, on='venue', how="inner", validate="many_to_many")
test_data = pd.merge(test_data, holidays_in_japan, left_on='match_date', right_on='holiday_date', how="left", validate="many_to_many")
test_data = process_data(test_data)

print(train_data.columns)


Index(['id', 'section', 'round', 'home_team', 'away_team', 'venue', 'weather',
       'temperature', 'humidity', 'broadcasters', 'attendance', 'capacity',
       'address', 'is_holiday', 'year', 'month', 'day', 'weekday',
       'kick_off_hour', 'broadcasters_count', 'attendance_rate'],
      dtype='object')


In [262]:
# 使用しない特徴量
ignore_features = ['id', 'year', 'round', 'temperature', 'humidity', 'broadcasters', 'address', 'attendance', 'attendance_rate', 'day']

# 特徴量を分類
categorical_features = []
numeric_features = []

for f in list(train_data.columns):
    if f in ignore_features:
        continue
    if train_data[f].dtype in ['object', 'bool']:
        categorical_features.append(f)
    else:
        numeric_features.append(f)
        
features = numeric_features + categorical_features


In [263]:
# Xとyに分ける
X = train_data[features]
y = train_data['attendance_rate']

X.head()

Unnamed: 0,capacity,month,kick_off_hour,broadcasters_count,section,home_team,away_team,venue,weather,is_holiday,weekday
0,21000,3,16,2,第1節,G大阪,浦和,万博記念競技場,晴,False,Saturday
1,15859,3,13,3,第1節,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,False,Sunday
2,47851,3,13,3,第1節,FC東京,大分,味の素スタジアム,晴,False,Sunday
3,51697,3,14,1,第1節,磐田,福岡,静岡スタジアムエコパ,晴,False,Sunday
4,20223,3,14,4,第1節,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,False,Sunday


In [264]:
# 入力特徴量の一覧
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   capacity            3672 non-null   int64 
 1   month               3672 non-null   int32 
 2   kick_off_hour       3672 non-null   int64 
 3   broadcasters_count  3672 non-null   int64 
 4   section             3672 non-null   object
 5   home_team           3672 non-null   object
 6   away_team           3672 non-null   object
 7   venue               3672 non-null   object
 8   weather             3672 non-null   object
 9   is_holiday          3672 non-null   bool  
 10  weekday             3672 non-null   object
dtypes: bool(1), int32(1), int64(3), object(6)
memory usage: 276.2+ KB


In [265]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [266]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [267]:
# パイプラインの作成
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('regressor', XGBRegressor(random_state=42))], memory=None)

# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('regressor', RandomForestRegressor(random_state=42))], memory=None)

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', lgb.LGBMRegressor(random_state=42))], memory=None)


In [268]:
# ハイパーパラメータのグリッドを設定
param_grid = {
    'regressor__num_leaves': [31, 50, 70],
    'regressor__learning_rate': [0.05, 0.1, 0.2],
    'regressor__n_estimators': [100, 200, 500]
}

# GridSearchCVを使用してハイパーパラメータの調整
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

# トレーニング
grid_search.fit(X_train, y_train)

# 最適なハイパーパラメータ
print(f'Best parameters found: {grid_search.best_params_}')


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 336
[LightGBM] [Info] Number of data points in the train set: 1958, number of used features: 133
[LightGBM] [Info] Start training from score 0.608057
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 332
[LightGBM] [Info] Number of data points in the train set: 1958, number of used features: 133
[LightGBM] [Info] Start training from score 0.599839
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000358 seconds.
You can set `force_col_wi

In [269]:
# モデルの訓練
# model.fit(X_train, y_train)
# y_pred = model.predict(X_val)


# 最適なモデルで予測
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)



In [270]:
# 動員率を動員数に戻す
result_val = pd.concat([X_val, y_val, pd.Series(y_pred, index=y_val.index, name='y_pred')], axis=1)
result_val['y_val_num'] = result_val['capacity'] * result_val['attendance_rate']
result_val['y_pred_num'] = result_val['capacity'] * result_val['y_pred']
y_val = result_val['y_val_num']
y_pred = result_val['y_pred_num']


In [271]:

# モデルの評価
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print('validate RMSE', rmse)

validate RMSE 4082.062933588128


In [272]:
error = y_pred - y_val
abs_error = abs(error)

abs_error = abs_error.sort_values()

In [273]:
X_test = test_data[features]
# match_dateをdatetime型に変換
X_test.head()


Unnamed: 0,capacity,month,kick_off_hour,broadcasters_count,section,home_team,away_team,venue,weather,is_holiday,weekday
0,24130,2,20,1,第1節,鳥栖,神戸,ベストアメニティスタジアム,晴,False,Friday
1,47851,2,14,3,第1節,FC東京,浦和,味の素スタジアム,晴,False,Saturday
2,36894,2,14,3,第1節,広島,札幌,エディオンスタジアム広島,晴,False,Saturday
3,39694,2,14,1,第1節,Ｇ大阪,名古屋,パナソニック スタジアム 吹田,曇,False,Saturday
4,15380,2,16,1,第1節,湘南,長崎,Shonan BMW スタジアム平塚,晴,False,Saturday


In [275]:
# モデルの予測
y_test = best_model.predict(X_test)

In [276]:
# 動員率を動員数に戻す
result_test = pd.concat([X_test, pd.Series(y_test, index=X_test.index, name='prediction_rate')], axis=1)
result_test['prediction_num'] = result_test['capacity'] * result_test['prediction_rate']
y_test = result_test['prediction_num']


In [277]:
# 予測結果をデータフレームにする
results = pd.DataFrame({'id': test_data['id'], 'prediction': y_test})
results.to_csv('outputs/submission.csv', index=False, header=False)