In [48]:
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface, not the package root
import matplotlib.pyplot as plt
%matplotlib inline

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [50]:
# Load the datasets (one CSV per frame, read in the order the names are listed)
train_data, test_data, venue_information, match_reports = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/train.csv',
        'datasets/test.csv',
        'datasets/venue_information.csv',
        'datasets/match_reports.csv',
    )
)

In [51]:


def process_match_data(X):
    """
    Extract date and time features from a match DataFrame.

    The input frame is NOT modified; a new DataFrame is returned in which the
    'match_date' and 'kick_off_time' columns are replaced by derived columns.

    Parameters:
    X (pd.DataFrame): Frame with a 'match_date' column (datetime dtype, or
        anything `pd.to_datetime` can parse) and a string 'kick_off_time'
        column formatted like 'HH:MM'.

    Returns:
    pd.DataFrame: Copy of X without 'match_date' / 'kick_off_time' and with
        the columns 'year', 'month', 'day' and 'kick_off_hour' appended.
    """
    # Parsing here is a no-op for columns that are already datetime, and lets
    # callers pass raw string dates as well.
    match_date = pd.to_datetime(X['match_date'])
    # The hour is the part before the first ':' of the kick-off time string.
    kick_off_hour = X['kick_off_time'].str.split(':').str[0].astype(int)

    # drop() returns a new frame and assign() builds on it, so the caller's
    # DataFrame is never mutated.
    return X.drop(columns=['match_date', 'kick_off_time']).assign(
        year=match_date.dt.year,
        month=match_date.dt.month,
        day=match_date.dt.day,
        kick_off_hour=kick_off_hour,
    )


In [52]:
# Columns fed to the model, in the order they are selected from the data
features = [
    'section',
    'round',
    'home_team',
    'away_team',
    'venue',
    'weather',
    'temperature',
    'humidity',
    'broadcasters',
    'month',
    'day',
    'kick_off_hour',
]

In [53]:
# Column groups for preprocessing: categorical columns are encoded,
# numeric columns are handled separately
categorical_features = [
    'section',
    'round',
    'home_team',
    'away_team',
    'venue',
    'weather',
    'broadcasters',
]
numeric_features = [
    'temperature',
    'humidity',
    'month',
    'day',
    'kick_off_hour',
]

In [54]:

# Convert match_date to datetime, then expand it (and kick_off_time) into
# the derived date/time feature columns
train_data = process_match_data(
    train_data.assign(match_date=pd.to_datetime(train_data['match_date']))
)

# Model inputs and the regression target
X, y = train_data[features], train_data['attendance']

X.head()



Unnamed: 0,section,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,month,day,kick_off_hour
0,第1節,第1日,G大阪,浦和,万博記念競技場,晴,8.3,40,NHK総合/J SPORTS(録),3,4,16
1,第1節,第2日,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,山梨放送/テレビ静岡(録)/J SPORTS(録),3,5,13
2,第1節,第2日,FC東京,大分,味の素スタジアム,晴,12.1,35,BS-i/MXテレビ(録)/J SPORTS(録),3,5,13
3,第1節,第2日,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,J SPORTS,3,5,14
4,第1節,第2日,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),3,5,14


In [55]:
# Summarize the feature frame: column dtypes and non-null counts
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   section        3672 non-null   object 
 1   round          3672 non-null   object 
 2   home_team      3672 non-null   object 
 3   away_team      3672 non-null   object 
 4   venue          3672 non-null   object 
 5   weather        3672 non-null   object 
 6   temperature    3672 non-null   float64
 7   humidity       3672 non-null   int64  
 8   broadcasters   3672 non-null   object 
 9   month          3672 non-null   int32  
 10  day            3672 non-null   int32  
 11  kick_off_hour  3672 non-null   int64  
dtypes: float64(1), int32(2), int64(2), object(7)
memory usage: 315.7+ KB


In [56]:
# Numeric columns: fill missing values with the column mean.
# Categorical columns: one-hot encode; categories unseen during fit are ignored.
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='mean'), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [57]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Build the pipeline: preprocessing followed by a random-forest regressor
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ],
    memory=None,
)

In [59]:
# Train the pipeline on the training split, then predict the validation split
y_pred = model.fit(X_train, y_train).predict(X_val)

# Evaluate: RMSE is the square root of the mean squared error
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

In [60]:
# Report the RMSE measured on the validation split
print('validate RMSE', rmse)

validate RMSE 4546.009604240493


In [61]:
# Signed validation error (prediction minus actual), then its absolute
# value sorted in ascending order
error = y_pred - y_val
abs_error = error.abs().sort_values()

In [62]:
# Convert match_date to datetime, then derive the same date/time feature
# columns that were built for the training data
test_data['match_date'] = pd.to_datetime(test_data['match_date'])
test_data = process_match_data(test_data)

# Select exactly the columns the model was trained on. Deriving the list from
# `features` (instead of keeping a second hard-coded copy) keeps the train and
# test inputs in sync.
test_features = list(features)
X_test = test_data[test_features]

# NOTE(review): the test preview shows a full-width 'Ｇ大阪' while the training
# preview shows 'G大阪'. With OneHotEncoder(handle_unknown='ignore') an unseen
# spelling is encoded as all zeros — TODO confirm, and consider normalizing
# team names (e.g. NFKC) for both train and test.
X_test.head()


Unnamed: 0,section,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,month,day,kick_off_hour
0,第1節,第1日,鳥栖,神戸,ベストアメニティスタジアム,晴,6.6,57,DAZN,2,23,20
1,第1節,第2日,FC東京,浦和,味の素スタジアム,晴,14.1,40,DAZN/NHK BS1/TOKYO MX(録),2,24,14
2,第1節,第2日,広島,札幌,エディオンスタジアム広島,晴,16.3,51,DAZN/NHK広島/NHK札幌,2,24,14
3,第1節,第2日,Ｇ大阪,名古屋,パナソニック スタジアム 吹田,曇,12.9,42,DAZN,2,24,14
4,第1節,第2日,湘南,長崎,Shonan BMW スタジアム平塚,晴,13.8,50,DAZN,2,24,16


In [63]:
# Predict on the test features with the fitted pipeline
# (note: `y_test` holds model predictions, not ground-truth labels)
y_test = model.predict(X_test)

In [64]:
# Pair each test id with its prediction and write the submission CSV
# (no index column and no header row)
results = test_data[['id']].assign(prediction=y_test)
results.to_csv('outputs/submission.csv', header=False, index=False)