In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training matches, indexed by match id.
features = pd.read_csv(
    '/home/legonaftik/PycharmProjects/Введение-в-машинное-обучение-(ВШЭ)/week7/features.csv',
    index_col='match_id',
)

# Target variable: we are trying to predict the winner.
target = features['radiant_win']

# Remove the columns that describe how the game ended (together with the target itself).
features = features.drop(
    [
        "duration",
        "radiant_win",
        "tower_status_radiant",
        "tower_status_dire",
        "barracks_status_radiant",
        "barracks_status_dire",
    ],
    axis=1,
)

In [3]:
def fill_na(data):
    """Replace every missing value in ``data`` with 0, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in, now without missing values.
    """
    # Fill on the frame itself instead of on ``data[col]``: the per-column form
    # is chained assignment, which pandas warns about and which leaves ``data``
    # untouched once Copy-on-Write is active. Columns without gaps are
    # unaffected by the fill, so no pre-selection of columns is needed.
    data.fillna(value=0, inplace=True)
    return data

In [4]:
# Number of columns in the hero bag-of-words matrix (passed to calculate_bag_matrix).
N = 112
# Value passed as the ``C`` argument of LogisticRegression.
C = 0.01

In [5]:
# Build the hero "bag of words"
def calculate_bag_matrix(data, N):
    """Encode each match's hero picks as a signed bag-of-words matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per match, with integer hero ids in the columns
        ``r1_hero``..``r5_hero`` and ``d1_hero``..``d5_hero``. Ids are used
        as 1-based column numbers of the result.
    N : int
        Number of columns in the result; hero ids must not exceed it.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(data.shape[0], N)``. In each row, column
        ``hero_id - 1`` holds 1 for an id found in an ``r*_hero`` column,
        -1 for an id found in a ``d*_hero`` column, and 0 elsewhere.
    """
    X_pick = np.zeros((data.shape[0], N))
    rows = np.arange(data.shape[0])
    for p in range(1, 6):
        # Whole-column positional assignment replaces the per-cell ``data.ix``
        # lookups: ``.ix`` was removed from pandas, and going by position also
        # keeps rows separate when the index contains repeated match ids.
        # The r-then-d write order per player slot matches the original loop.
        X_pick[rows, data['r%d_hero' % p].to_numpy() - 1] = 1
        X_pick[rows, data['d%d_hero' % p].to_numpy() - 1] = -1
    return X_pick

In [6]:
def improve_quality(data):
    """Turn the raw match frame into a numeric feature matrix.

    Missing values are filled through ``fill_na``; ``lobby_type`` and the ten
    hero-id columns are removed; and for every per-player statistic (xp,
    kills, level, gold, lh, deaths, items) the ten player columns are replaced
    by one column holding "sum over r1..r5 minus sum over d1..d5".

    Parameters
    ----------
    data : pandas.DataFrame
        Match features containing ``lobby_type`` and the ``r1..r5`` /
        ``d1..d5`` hero and statistic columns.

    Returns
    -------
    numpy.ndarray
        The remaining columns of ``data`` in their original order, followed by
        the seven difference columns in the order xp, kills, level, gold, lh,
        deaths, items.
    """
    teams = ("r", "d")
    player_slots = (1, 2, 3, 4, 5)
    stat_names = ("xp", "kills", "level", "gold", "lh", "deaths", "items")

    cleaned = fill_na(data)

    # The hero ids and the lobby type are not used as plain numeric columns.
    hero_columns = ["%s%d_hero" % (team, slot) for team in teams for slot in player_slots]
    cleaned = cleaned.drop(["lobby_type"] + hero_columns, axis=1)

    def team_total(team, stat):
        # Add the five player columns left to right (p1 + p2 + ... + p5).
        total = cleaned["%s1_%s" % (team, stat)]
        for slot in player_slots[1:]:
            total = total + cleaned["%s%d_%s" % (team, slot, stat)]
        return total

    # One "r total minus d total" column per statistic, in stat_names order.
    differences = [team_total("r", stat) - team_total("d", stat) for stat in stat_names]

    # The per-player columns are now summarised by the differences above.
    per_player_columns = [
        "%s%d_%s" % (team, slot, stat)
        for stat in stat_names
        for team in teams
        for slot in player_slots
    ]
    cleaned = cleaned.drop(per_player_columns, axis=1)

    return np.column_stack([cleaned] + differences)

In [7]:
# Aggregated numeric features of the training matches.
fun = improve_quality(features)

# Append the hero bag-of-words columns to the right of the numeric features.
bag = calculate_bag_matrix(features, N)
fun = np.concatenate((fun, bag), axis=1)

# Standardise every column; the fitted scaler is kept in `scaler` for later reuse.
scaler = StandardScaler()
scaler.fit(fun)
fun = scaler.transform(fun)

In [8]:
# Shuffled 5-fold splitter with a fixed seed (legacy sklearn.cross_validation.KFold,
# whose first argument is the number of samples).
kf = KFold(fun.shape[0], n_folds=5, shuffle=True, random_state=241)

# L2-penalised logistic regression, fitted on the whole training matrix.
clf = LogisticRegression(C=C, penalty="l2")
clf.fit(fun, target)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# ROC AUC of the classifier on each fold of `kf`, then the average over folds.
scores = cross_val_score(estimator=clf, X=fun, y=target, scoring="roc_auc", cv=kf)
mean_score = np.mean(scores)
mean_score  # 0.75240683217527005

0.75240683217527005

In [10]:
# Bring the test data to the same layout as the training matrix.
features_test = pd.read_csv(
    "/home/legonaftik/PycharmProjects/Введение-в-машинное-обучение-(ВШЭ)/week7/features_test.csv",
    index_col="match_id",
)

# Same feature engineering as for the training matches.
fun_test = improve_quality(features_test)

# Hero bag-of-words appended on the right, then scaled with the already-fitted scaler.
bag_test = calculate_bag_matrix(features_test, N)
fun_test = np.concatenate((fun_test, bag_test), axis=1)
fun_test = scaler.transform(fun_test)

In [11]:
# Write the predictions to a CSV file for submission to Kaggle.
# The second column of predict_proba is what goes under the radiant_win header.
pred = clf.predict_proba(fun_test)[:, 1]
indexes = list(features_test.index)
with open("/home/legonaftik/PycharmProjects/Введение-в-машинное-обучение-(ВШЭ)/week7/kaggle_answer.csv", "w") as output:
    output.write("match_id,radiant_win\n")
    # One "match_id,probability" line per test match, in index order.
    for match_id, probability in zip(indexes, pred):
        output.write("%s,%s\n" % (match_id, probability))