In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
FILEDIR = Path('../input')

In [5]:
# 提出用ファイルを取得
sub = pd.read_csv(FILEDIR / 'MSampleSubmissionStage1.csv', usecols=['ID'])
id_splited = sub['ID'].str.split('_', expand=True).astype(int).rename(columns={0: 'Season', 1: 'Team1', 2: 'Team2'})
sub = pd.concat([sub, id_splited], axis=1).set_index(['Season', 'Team1', 'Team2']).sort_index()

In [6]:
# シーズン毎の出場チームを抽出
tourney_teams = {}
tourney_teams_all = set()
for season in sub.index.get_level_values('Season').drop_duplicates():
    tourney_teams[season] = set()
    tourney_teams[season].update(sub.loc[season].index.get_level_values('Team1'))
    tourney_teams[season].update(sub.loc[season].index.get_level_values('Team2'))
    tourney_teams_all.update(tourney_teams[season])
{k: len(v) for k, v in tourney_teams.items()}

{2015: 68, 2016: 68, 2017: 68, 2018: 68, 2019: 68}

In [9]:
# 所属カンファレンス情報を取得
conferences = pd.read_csv(FILEDIR / 'MTeamConferences.csv')
conferences = pd.concat(
    [conferences.query('Season == @season and TeamID in @teams') for season, teams in tourney_teams.items()])
conferences = conferences.set_index(['Season', 'TeamID']).sort_index()

In [10]:
# コーチ名を取得
coaches = pd.read_csv(FILEDIR / 'MTeamCoaches.csv')
coaches = pd.concat(
    [coaches.query('Season == @season and TeamID in @team') for season, team in tourney_teams.items()])
coaches = coaches[coaches['LastDayNum'] == 154].set_index(['Season', 'TeamID']).sort_index()[['CoachName']]

In [11]:
# NCAAの初回出場年を取得し、初回出場年から現在までの年数を計算
teams = pd.read_csv(FILEDIR / 'MTeams.csv', usecols=['TeamID', 'FirstD1Season'])
teams['FirstD1Season'] = 2020 - teams['FirstD1Season']
teams = pd.concat(
    [teams.query('TeamID in @team').assign(Season=season) for season, team in tourney_teams.items()])
teams = teams.set_index(['Season', 'TeamID']).sort_index()

In [12]:
# 各シーズンのシードを取得
seeds = pd.read_csv(FILEDIR / 'MNCAATourneySeeds.csv')
seeds = pd.concat(
    [seeds.query('Season == @season and TeamID in @teams') for season, teams in tourney_teams.items()])
seeds = seeds.set_index(['Season', 'TeamID']).sort_index()
seeds['Region'] = seeds['Seed'].str[0]
seeds['Number'] = seeds['Seed'].str[1:3].astype(int)
del seeds['Seed']

In [13]:
# レギュラーシーズンの累計得点と累計失点を取得
regular = pd.read_csv(FILEDIR / 'MRegularSeasonDetailedResults.csv')
regular = regular.drop(columns=['DayNum', 'LTeamID'])
regular = pd.concat(
    [regular.query('Season == @season and WTeamID in @teams') for season, teams in tourney_teams.items()])
regular = regular.groupby(['Season', 'WTeamID']).sum()
regular = regular.rename_axis(index=['Season', 'TeamID'])

In [16]:
# 上記取得データをindexで結合
ctcsr = pd.concat([coaches, teams, conferences, seeds, regular], axis=1)

In [17]:
# NCAAMトーナメントの勝敗結果を取得
result = pd.read_csv(FILEDIR / 'MNCAATourneyCompactResults.csv')
result = result[result['Season'] >= 2015].set_index(['Season', 'WTeamID', 'LTeamID'])

In [18]:
# 各種データと勝敗結果を結合
merged_teams = pd.concat(
    [ctcsr.loc[[(season, wteam), (season, lteam)], :] for season, wteam, lteam, in result.index])

team1 = merged_teams.iloc[::2, :].reset_index('TeamID') # teams winned
team2 = merged_teams.iloc[1::2, :].reset_index('TeamID') # teams losed

merged_teams = pd.concat([
    pd.concat([team1.add_suffix('1'), team2.add_suffix('2')], axis=1).assign(Res=1),
    pd.concat([team2.add_suffix('1'), team1.add_suffix('2')], axis=1).assign(Res=0),
]).reset_index().set_index(['Season', 'TeamID1', 'TeamID2']).sort_index()

In [19]:
# 結合データから目的変数（Res）の列を除き、説明変数Xを抽出
x_columns = merged_teams.columns[merged_teams.columns != 'Res']
X = merged_teams[x_columns]

# 数値の列を正規化し、文字の列をダミー変数に変換
columns_number = X.select_dtypes(include='number').columns
X.loc[:, columns_number] = MinMaxScaler().fit_transform(X[columns_number])
X = pd.get_dummies(X, columns=x_columns[X.dtypes == 'object'])
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,FirstD1Season1,Number1,WScore1,LScore1,NumOT1,WFGM1,WFGA1,WFGM31,WFGA31,WFTM1,...,ConfAbbrev2_southland,ConfAbbrev2_summit,ConfAbbrev2_sun_belt,ConfAbbrev2_swac,ConfAbbrev2_wac,ConfAbbrev2_wcc,Region2_W,Region2_X,Region2_Y,Region2_Z
Season,TeamID1,TeamID2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2015,1107,1328,0.482759,0.866667,0.368512,0.405728,0.000000,0.305936,0.338816,0.268817,0.266983,0.500000,...,0,0,0,0,0,0,1,0,0,0
2015,1112,1326,1.000000,0.066667,0.813149,0.723150,0.166667,0.761035,0.761513,0.297491,0.319115,0.929688,...,0,0,0,0,0,0,0,0,0,1
2015,1112,1411,1.000000,0.066667,0.813149,0.723150,0.166667,0.761035,0.761513,0.297491,0.319115,0.929688,...,0,0,0,1,0,0,0,0,0,1
2015,1112,1458,1.000000,0.066667,0.813149,0.723150,0.166667,0.761035,0.761513,0.297491,0.319115,0.929688,...,0,0,0,0,0,0,0,0,0,1
2015,1112,1462,1.000000,0.066667,0.813149,0.723150,0.166667,0.761035,0.761513,0.297491,0.319115,0.929688,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,1449,1429,1.000000,0.533333,0.509227,0.581543,0.000000,0.485540,0.544408,0.440860,0.503949,0.395833,...,0,0,0,0,0,0,0,0,1,0
2019,1458,1332,1.000000,0.266667,0.385813,0.370724,0.000000,0.406393,0.444079,0.376344,0.342812,0.156250,...,0,0,0,0,0,0,0,0,0,1
2019,1459,1246,0.620690,0.400000,0.682238,0.659507,0.000000,0.680365,0.653783,0.770609,0.696682,0.270833,...,0,0,0,0,0,0,0,0,1,0
2019,1459,1371,0.620690,0.400000,0.682238,0.659507,0.000000,0.680365,0.653783,0.770609,0.696682,0.270833,...,0,0,0,0,0,0,0,0,1,0


In [20]:
# 目的変数yを設定
y = merged_teams['Res']
y

Season  TeamID1  TeamID2
2015    1107     1328       0
        1112     1326       1
                 1411       1
                 1458       0
                 1462       1
                           ..
2019    1449     1429       1
        1458     1332       0
        1459     1246       0
                 1371       1
        1463     1261       0
Name: Res, Length: 670, dtype: int64

In [21]:
# 学習アルゴリズムとパrメータ候補を定義
clfs = {}

# SVC
clfs['SVC'] = {
    'instance': SVC(probability=True),
    'params': [
        {'kernel': ['linear'], 'C': [0.01, 0.05, 0.1, 0.5, 1]},
        {'kernel': ['rbf'], 'C': [1, 10, 50, 100, 250], 'gamma': [0.1, 0.2, 0.3]}
    ]    
}

# RandomForest
clfs['RandomForestClassifier'] = {
    'instance': RandomForestClassifier(n_jobs=-1),
    'params': {        
        'n_estimators': [25, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [10, 25, 50, None]
    }
}

# LogisticRegression
clfs['LogisticRegression'] = {
    'instance': LogisticRegression(max_iter=200, n_jobs=-1),
    'params': [
            {'penalty': ['l2'], 'C': [0.1, 0.5, 1, 5, 10]},
            {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': [0.1, 0.5, 1, 5, 10]},
            {'penalty': ['elasticnet'], 'C': [0.1, 0.5, 1, 5, 10], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
        ]
}

In [22]:
# 学習し、最適なパラメータを探索
for clf_name, clf in clfs.items():
    print('<{}>'.format(clf_name))
    print('  training ...')
    
    # 学習
    gs = GridSearchCV(clf['instance'], param_grid=clf['params'], cv=5, n_jobs=-1)
    gs.fit(X, y)
    
    print('  best_score: {:.3f}'.format(gs.best_score_))
    print('  best_params: {}'.format(gs.best_params_))
    
    # 最適なパラメータを記録
    clfs[clf_name]['best_params'] = gs.best_params_

<SVC>
  training ...
  best_score: 0.704
  best_params: {'C': 0.05, 'kernel': 'linear'}
<RandomForestClassifier>
  training ...
  best_score: 0.669
  best_params: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 25}
<LogisticRegression>
  training ...
  best_score: 0.716
  best_params: {'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear'}


 0.70597015 0.71641791 0.71641791 0.70746269 0.70746269 0.66865672
 0.67910448 0.6641791  0.68059701        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan]


In [23]:
# 最適なパラメータ値で、全データで学習してみる
for clf_name, clf in clfs.items():
    clf['best_estimator'] = clf['instance'].set_params(**clf['best_params']).fit(X, y)



In [24]:
[clf['best_estimator'] for clf in clfs.values()]

[SVC(C=0.05, kernel='linear', probability=True),
 RandomForestClassifier(max_depth=10, n_estimators=25, n_jobs=-1),
 LogisticRegression(C=0.5, max_iter=200, n_jobs=-1, penalty='l1',
                    solver='liblinear')]

In [25]:
# 各モデルの予測結果から多数決を採用するソフト分類器を作成
vote = VotingClassifier(
    estimators=[(clf_name, clf['best_estimator']) for clf_name, clf in clfs.items()], 
    voting='soft',
    n_jobs=-1
)
vote.fit(X, y)

# 作成したソフト分類器を、clfsに追加
clfs['Vote'] = {}
clfs['Vote']['best_estimator'] = vote

In [26]:
# 各モデルとソフト分類器で、予測結果を比較する
# randomforestだけ精度が異常に高いのは、多分過学習してる。。。
for clf_name, clf in clfs.items():
    score = accuracy_score(y, clf['best_estimator'].predict(X))
    print(clf_name, score)

SVC 0.7686567164179104
RandomForestClassifier 0.9880597014925373
LogisticRegression 0.7582089552238805
Vote 0.8552238805970149


In [27]:
# 各モデルの予測結果をDataFrameに整形（pp: predict_proba, p: predict）
predict = pd.DataFrame(
    {
        **{'pp_' + clf_name: clf['best_estimator'].predict_proba(X)[:, 1] for clf_name, clf in clfs.items()},
        **{'p_' + clf_name: clf['best_estimator'].predict(X) for clf_name, clf in clfs.items()}
    },
    index=X.index)
predict

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pp_SVC,pp_RandomForestClassifier,pp_LogisticRegression,pp_Vote,p_SVC,p_RandomForestClassifier,p_LogisticRegression,p_Vote
Season,TeamID1,TeamID2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015,1107,1328,0.187545,0.186701,0.175813,0.175917,0,0,0,0
2015,1112,1326,0.616473,0.743509,0.663507,0.725911,1,1,1,1
2015,1112,1411,0.849774,0.856389,0.898812,0.913982,1,1,1,1
2015,1112,1458,0.373823,0.213289,0.424925,0.320114,0,0,0,0
2015,1112,1462,0.710161,0.920600,0.734261,0.772000,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
2019,1449,1429,0.609537,0.622383,0.547713,0.605055,1,1,1,1
2019,1458,1332,0.621363,0.236541,0.713540,0.551583,1,0,1,1
2019,1459,1246,0.295407,0.234595,0.303738,0.273586,0,0,0,0
2019,1459,1371,0.656698,0.722587,0.641834,0.676628,1,1,1,1


In [28]:
# 提出ファイルにカラムを追加し、予測結果の値を上書き
for column in predict.columns:
    sub[column] = 0.5
    
mask = [idx for idx in sub.index if idx in X.index]
sub.loc[mask, predict.columns] = predict.loc[mask, predict.columns]
sub

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ID,pp_SVC,pp_RandomForestClassifier,pp_LogisticRegression,pp_Vote,p_SVC,p_RandomForestClassifier,p_LogisticRegression,p_Vote
Season,Team1,Team2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015,1107,1112,2015_1107_1112,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1116,2015_1107_1116,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1124,2015_1107_1124,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1125,2015_1107_1125,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1129,2015_1107_1129,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...
2019,1449,1459,2019_1449_1459,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2019,1449,1463,2019_1449_1463,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2019,1458,1459,2019_1458_1459,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2019,1458,1463,2019_1458_1463,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [29]:
# 予測結果をファイル出力
for column in predict.columns:
    sub[['ID', column]].rename(columns={column: 'pred'}).to_csv('{}.csv'.format(column), index=False)

In [31]:
sub

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ID,pp_SVC,pp_RandomForestClassifier,pp_LogisticRegression,pp_Vote,p_SVC,p_RandomForestClassifier,p_LogisticRegression,p_Vote
Season,Team1,Team2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015,1107,1112,2015_1107_1112,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1116,2015_1107_1116,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1124,2015_1107_1124,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1125,2015_1107_1125,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2015,1107,1129,2015_1107_1129,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...
2019,1449,1459,2019_1449_1459,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2019,1449,1463,2019_1449_1463,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2019,1458,1459,2019_1458_1459,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2019,1458,1463,2019_1458_1463,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
