In [173]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import os
import glob

from sklearn.model_selection import train_test_split,StratifiedKFold, KFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score


# pandasの行列を省略しない
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [174]:
train_df = pd.read_csv("V:/kobayashi_kaggle/jLeague/train.csv")
test_df = pd.read_csv("V:/kobayashi_kaggle/jLeague/test.csv")
venue_df = pd.read_csv("V:/kobayashi_kaggle/jLeague/venue_information.csv")
match_df = pd.read_csv("V:/kobayashi_kaggle/jLeague/match_reports.csv")
holidays_df = pd.read_csv("V:/kobayashi_kaggle/jLeague/holidays_in_japan.csv")
sampel_df = pd.read_csv("V:/kobayashi_kaggle/jLeague/sample_submit.csv", header=None)

In [175]:
train_df.head()

Unnamed: 0,id,match_date,kick_off_time,section,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,attendance
0,9190,2006-03-04,16:04,第1節,第1日,G大阪,浦和,万博記念競技場,晴,8.3,40,NHK総合/J SPORTS(録),20916
1,9191,2006-03-05,13:00,第1節,第2日,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,山梨放送/テレビ静岡(録)/J SPORTS(録),14277
2,9192,2006-03-05,13:35,第1節,第2日,FC東京,大分,味の素スタジアム,晴,12.1,35,BS-i/MXテレビ(録)/J SPORTS(録),22531
3,9193,2006-03-05,14:04,第1節,第2日,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,J SPORTS,28564
4,9194,2006-03-05,14:04,第1節,第2日,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),17199


In [176]:
def clean(df):
    # 祝日の付与
    holidays_df.rename(columns={'holiday_date': 'match_date'}, inplace=True)
    holidays_df['holiday_flag'] = 1
    df = pd.merge(df, holidays_df, on='match_date', how='left')
    df['holiday_flag'] = df['holiday_flag'].fillna(0)

    # match_dateの日付型に変換
    df['match_date'] = pd.to_datetime(df['match_date'])
    df['year'] = df['match_date'].dt.year
    df['month'] = df['match_date'].dt.month
    df['day'] = df['match_date'].dt.day
    df['day_name'] = df['match_date'].dt.day_name()

    # kickoff_timeの時間型に変換
    df['kick_off_time'] = pd.to_datetime(df['kick_off_time'], format='%H:%M')
    df['kick_off_time'] = df['kick_off_time'].dt.hour + df['kick_off_time'].dt.minute / 60

    # sectionの数値化
    df['section'] = df['section'].str.replace('第', '').str.replace('節', '').astype(int)

    # roundの数値化
    df['round'] = df['round'].str.replace('第', '').str.replace('日', '').astype(int)

    # 雨の付与
    df['rain'] = 0
    df.loc[df['weather'].str.contains('雨'), 'rain'] = 1

    # 晴れの付与
    df['sunny'] = 0
    df.loc[df['weather'].str.contains('晴'), 'sunny'] = 1

    # NHKの付与
    df['nhk'] = 0
    df.loc[df['broadcasters'].str.contains('NHK'), 'nhk'] = 1

    # broadcastersの数値化
    df['broadcasters'] = df['broadcasters'].str.count('/') + 1

    
    
    # venueの結合
    df = pd.merge(df, venue_df, on='venue', how='left')

    return df

train_clean = clean(train_df)
test_clean = clean(test_df)
train_clean.head()


Unnamed: 0,id,match_date,kick_off_time,section,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,attendance,description,holiday_flag,year,month,day,day_name,rain,sunny,nhk,capacity,address
0,9190,2006-03-04,16.066667,1,1,G大阪,浦和,万博記念競技場,晴,8.3,40,2,20916,,0.0,2006,3,4,Saturday,0,1,1,21000,大阪府吹田市千里万博公園5-2
1,9191,2006-03-05,13.0,1,2,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,3,14277,,0.0,2006,3,5,Sunday,0,1,0,15859,山梨県甲府市小瀬町840
2,9192,2006-03-05,13.583333,1,2,FC東京,大分,味の素スタジアム,晴,12.1,35,3,22531,,0.0,2006,3,5,Sunday,0,1,0,47851,東京都調布市西町376−3
3,9193,2006-03-05,14.066667,1,2,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,1,28564,,0.0,2006,3,5,Sunday,0,1,0,51697,静岡県袋井市愛野2300−1
4,9194,2006-03-05,14.066667,1,2,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,4,17199,,0.0,2006,3,5,Sunday,0,1,1,20223,愛知県名古屋市瑞穂区山下通5-1


In [177]:
drop_cols = ['match_date', 'day', 'address', 'id', 'description']
category_cols = ['home_team', 'away_team', 'venue', 'weather', 'day_name', 'holiday_flag']

In [178]:
# カウントエンコーディング
for col in category_cols:
    encoder = train_clean[col].value_counts()
    train_clean[col] = train_clean[col].map(encoder)
    test_clean[col] = test_clean[col].map(encoder)


# 不要なカラムの削除
train_clean = train_clean.drop(drop_cols, axis=1)
test_clean = test_clean.drop(drop_cols, axis=1)
train_clean.head()

Unnamed: 0,kick_off_time,section,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,attendance,holiday_flag,year,month,day_name,rain,sunny,nhk,capacity
0,16.066667,1,1,170,204,149,1828,8.3,40,2,20916,3387,2006,3,2342,0,1,1,21000
1,13.0,1,2,136,187,32,1828,12.9,28,3,14277,3387,2006,3,783,0,1,0,15859
2,13.583333,1,2,187,85,186,1828,12.1,35,3,22531,3387,2006,3,783,0,1,0,47851
3,14.066667,1,2,170,51,7,1828,11.6,42,1,28564,3387,2006,3,783,0,1,0,51697
4,14.066667,1,2,187,102,89,1828,13.1,32,4,17199,3387,2006,3,783,0,1,1,20223


In [179]:
# モデルの学習
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=False,)
test_preds = []
valid_scores = []

# yであるattendanceを取り出し、train_cleanから削除
y = train_clean['attendance']
train_clean.drop('attendance', axis=1, inplace=True)

# kfokdで学習
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean, y)):
    X_train, y_train = train_clean.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train_clean.iloc[valid_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, y_train, weight=compute_sample_weight(class_weight='balanced', y=y_train).astype('float32'))
    lgb_eval = lgb.Dataset(X_valid, y_valid, weight=np.ones(len(X_valid)).astype('float32'))

    verbose_eval = -1 
    params = {'objective': 'regression',
            'metric': 'rmse',
            'seed': 0,
            'verbose': -1,
            'num_leaves': 31,
            'min_data_in_leaf':20,
            'learning_rate': 0.01,
            'feature_fraction': 0.30,
            'max_depth': 6,}    
    model = lgb.train(params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_eval],
                    num_boost_round=3000,
                    callbacks=[lgb.early_stopping(stopping_rounds=100, 
                                # verbose=False,
                                ), 
                        lgb.log_evaluation(verbose_eval)],
                    )

    # testでのスコアを算出し、test_predsに格納
    test_pred = model.predict(test_clean, num_iteration=model.best_iteration)
    test_preds.append(test_pred)
    valid_scores.append(model.best_score['valid_1']['rmse'])

# 最終予測結果はkfoldでの平均値を採用
sampel_df[1] = np.mean(test_preds, axis=0)
print(np.mean(valid_scores))
sampel_df.to_csv("V:/kobayashi_kaggle/jLeague/subs/sub_1.csv", index=False, header=False)

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2031]	training's rmse: 2779.09	valid_1's rmse: 6298.65
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1562]	training's rmse: 3108.71	valid_1's rmse: 4362.72
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[927]	training's rmse: 3568.48	valid_1's rmse: 4594.49
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1159]	training's rmse: 3227.99	valid_1's rmse: 5003.88
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1424]	training's rmse: 3162.9	valid_1's rmse: 5400.3
5132.006646236058


In [180]:
sampel_df.head()

Unnamed: 0,0,1
0,20745,9090.022201
1,20746,30517.442994
2,20747,19430.568021
3,20748,22179.496924
4,20749,12950.233726


In [181]:
importance = model.feature_importance()
feature_names = model.feature_name()

df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
# importtane順に並び替え
df_importance = df_importance.sort_values('Importance', ascending=False)
df_importance

Unnamed: 0,Feature,Importance
17,capacity,5003
8,humidity,3828
7,temperature,3392
5,venue,3130
1,section,3048
0,kick_off_time,3020
11,year,2771
12,month,1944
4,away_team,1868
6,weather,1772
