In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import cross_val_predict

In [2]:
# Read the labelled training matches and the unlabelled test matches.
train_path = "./input/train.csv"
test_path = "./input/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# Stack train on top of test (row-wise) so features can be built on both
# frames in one pass. Each frame keeps its own index labels here; both were
# read without an index column, so labels presumably repeat in the result.
frames = [train, test]
data = pd.concat(frames, axis=0)

In [4]:
# Inspect the column names of the combined train + test frame.
data.columns

Index([u'champ', u'target', u'team_away', u'team_away_red_cards',
       u'team_away_reserved', u'team_away_score', u'team_away_squad',
       u'team_away_substitution', u'team_away_yellow_cards', u'team_home',
       u'team_home_red_cards', u'team_home_reserved', u'team_home_score',
       u'team_home_squad', u'team_home_substitution',
       u'team_home_yellow_cards', u'time', u'tour'],
      dtype='object')

In [5]:
# Sanity check: row counts of train and test, and the shape of the combined frame.
train.shape[0], test.shape[0], data.shape

(16456, 7211, (23667, 18))

В качестве базового решения предскажем для каждого матча средние по обучающей выборке доли классов и посмотрим на качество (log loss) на обучающей выборке:

In [6]:
# Baseline: give every training row the same probability vector, namely the
# relative frequency of each target class in train (ordered by class label).
class_counts = train.groupby("target").size()
class_freqs = list(class_counts / len(train))
preds = [class_freqs] * len(train)
print(log_loss(train.target, preds))

1.0658489559682767


Построим признаки:
* среднее значение `target` по обучающей выборке для каждой команды — отдельно для домашних (`team_home`) и выездных (`team_away`) матчей

После чего обучим RandomForest

In [7]:
# Mean target per team, computed from the training rows only: one statistic
# for the team playing at home and one for the team playing away.
home_mean_target = train.groupby("team_home").target.mean()
away_mean_target = train.groupby("team_away").target.mean()

# Map the per-team statistics onto every row of the combined frame; a team
# that never appears in train in that role gets NaN.
# NOTE(review): the statistics include each training row's own target, so the
# features leak the label for train rows and the cross-validated score is
# likely optimistic — consider out-of-fold encoding.
data["team_home_mean_target"] = data.team_home.map(home_mean_target)
data["team_away_mean_target"] = data.team_away.map(away_mean_target)

In [8]:
# Names of the feature columns that are passed to the model.
cols = [
    "team_home_mean_target",
    "team_away_mean_target",
]

In [9]:
# Rows with a missing target are treated as the test part of the combined
# frame; rows with a target are the training part.
target_missing = data.target.isnull()

X_train = data[~target_missing]
X_test = data[target_missing]
y_train = X_train.target
# y_test holds only missing values by construction (it is the target column
# of exactly those rows where the target is null).
y_test = X_test.target

In [10]:
# Pin the forest's seed: without it every run grows different trees, so the
# cross-validated score below (and the later refit of this same estimator)
# would change from run to run.
RANDOM_STATE = 42
model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=RANDOM_STATE)

# Out-of-fold class probabilities for every training row, using the default
# cross-validation splitter, scored with multi-class log loss.
preds = cross_val_predict(model, X_train[cols], y_train, n_jobs=-1, method="predict_proba")
print("Score is {}".format(log_loss(y_train, preds)))

Score is 1.01104452053


In [11]:
# Refit the forest on the full training part and display the relative
# importance of each feature, in the same order as `cols`.
# fit() returns the estimator itself, so the attribute can be read off the call.
model.fit(X_train[cols], y_train).feature_importances_

array([0.34006513, 0.65993487])

In [12]:
# Teams that never occur in train have no mean-target statistic, so their
# feature is NaN in the test rows. Impute those gaps with the training column
# means (a neutral value) instead of a constant 0, which is an arbitrary
# value for a mean-target feature and biases predictions for unseen teams.
train_feature_means = X_train[cols].mean()
X_test_filled = X_test[cols].fillna(train_feature_means)

# One row of class probabilities per test match.
# NOTE(review): the column labels assume the model's class order is
# (draw, win, lose) — confirm against the target encoding / model.classes_.
submit = pd.DataFrame(model.predict_proba(X_test_filled), columns=["draw", "win", "lose"])

# NOTE(review): this writes into ./input/ under the name sample_submission.csv
# and includes the index as the first column — confirm that overwriting that
# file and this layout are intended.
submit.to_csv("./input/sample_submission.csv")

In [13]:
# Re-inspect the columns of the combined frame; the two *_mean_target feature
# columns added earlier in the notebook should now be listed.
data.columns

Index([u'champ', u'target', u'team_away', u'team_away_red_cards',
       u'team_away_reserved', u'team_away_score', u'team_away_squad',
       u'team_away_substitution', u'team_away_yellow_cards', u'team_home',
       u'team_home_red_cards', u'team_home_reserved', u'team_home_score',
       u'team_home_squad', u'team_home_substitution',
       u'team_home_yellow_cards', u'time', u'tour', u'team_home_mean_target',
       u'team_away_mean_target'],
      dtype='object')