In [2]:
from IPython.display import display
import pandas as pd
import sklearn
import catboost
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

x = pd.read_csv('data/train_v2.csv', index_col=0)
y = pd.read_csv('data/target.csv', index_col=0)['radiant_won']

columns_with_single_value = [col for col in x.columns if x[col].unique().shape[0] == 1]
x = x.drop(columns_with_single_value, axis=1)

x['radiant_gold'] = x['r1_gold'] + x['r2_gold'] + x['r3_gold'] + x['r4_gold'] + x['r5_gold']
x['dire_gold'] = x['d1_gold'] + x['d2_gold'] + x['d3_gold'] + x['d4_gold'] + x['d5_gold']
x['kda_diff'] = (x['r1_avg_kda'] + x['r2_avg_kda'] + x['r3_avg_kda'] + x['r4_avg_kda'] +x['r5_avg_kda']) - (x['d1_avg_kda'] + x['d2_avg_kda'] + x['d3_avg_kda'] + x['d4_avg_kda'] +x['d5_avg_kda'])
x['avg_tower_damage_diff'] = (x['r1_avg_tower_damage_per_min'] + x['r2_avg_tower_damage_per_min'] + x['r3_avg_tower_damage_per_min'] + x['r4_avg_tower_damage_per_min'] +x['r5_avg_tower_damage_per_min']) - (x['d1_avg_tower_damage_per_min'] + x['d2_avg_tower_damage_per_min'] + x['d3_avg_tower_damage_per_min'] + x['d4_avg_tower_damage_per_min'] +x['d5_avg_tower_damage_per_min'])
x['deaths_diff'] = (x['r1_deaths'] + x['r2_deaths'] + x['r3_deaths'] + x['r4_deaths'] + x['r5_deaths']) - (x['d1_deaths'] + x['d2_deaths'] + x['d3_deaths'] + x['d4_deaths'] + x['d5_deaths'])
x['kills_diff'] = (x['r1_kills'] + x['r2_kills'] + x['r3_kills'] + x['r4_kills'] + x['r5_kills']) - (x['d1_kills'] + x['d2_kills'] + x['d3_kills'] + x['d4_kills'] + x['d5_kills'])

x = x.fillna(0)

x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=.33, random_state=1)

x_train = x_train.fillna(0)
x_validation = x_validation.fillna(0)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_validation = scaler.fit_transform(x_validation)

features = list(x.columns)

clf = catboost.CatBoostClassifier(
    custom_loss=['Accuracy'], 
    use_best_model=True,
    random_seed=42
)
clf.fit(
    x_train, y_train,
    eval_set=(x_validation, y_validation),
    plot=True
)



KeyboardInterrupt: 

In [18]:
print('Train Accuracy:', accuracy_score(y_train, clf.predict(x_train)))
print('Validation Accuracy:', accuracy_score(y_validation, clf.predict(x_validation)))

# depth=6
# Train Accuracy: 0.699069569447823
# Validation Accuracy: 0.6420370370370371

# depth=6 all data 
# Train Accuracy: 0.6987655071758696
# Validation Accuracy: 0.6455555555555555

# depth=6 all data kda, avg, deaths
# Train Accuracy: 0.6679944052541961
# Validation Accuracy: 0.6471604938271605

# depth=6 data kda, avg, deaths, kills
# Train Accuracy: 0.6970931647
# Validation Accuracy: 0.647962963

Train Accuracy: 0.6792143030892727
Validation Accuracy: 0.6463580246913581


In [19]:
df_importances = pd.DataFrame.from_records([
    {'feature': feature_name, 'importance':importance}
    for feature_name, importance in zip(features, clf.feature_importances_)
]) 
df_importances = df_importances.set_index('feature')
df_importances = df_importances.sort_values('importance', ascending=False)
df_importances


Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
kda_diff,1.493549e+01
avg_tower_damage_diff,1.043761e+01
kills_diff,4.547890e+00
deaths_diff,4.335940e+00
r5_avg_kda,2.236008e+00
d4_avg_kda,2.160552e+00
r1_avg_kda,2.029332e+00
r2_avg_kda,1.902531e+00
r3_avg_kda,1.852326e+00
d2_avg_kda,1.763936e+00


In [4]:
#             #######    #######    #####   ########
#                #       #         #           #
#                #       #######   #           #
#                #       #         #           #
#                #       #######    #####      #

x_test = pd.read_csv('data/test_v2.csv', index_col=0)
y_submission = pd.read_csv('data/submission.csv', index_col=0)

x_test['radiant_gold'] = x_test['r1_gold'] + x_test['r2_gold'] + x_test['r3_gold'] + x_test['r4_gold'] + x_test['r5_gold']
x_test['dire_gold'] = x_test['d1_gold'] + x_test['d2_gold'] + x_test['d3_gold'] + x_test['d4_gold'] + x_test['d5_gold']
x_test['gold_diff'] = x_test['radiant_gold'] - x_test['dire_gold']
x_test['kda_diff'] = (x_test['r1_avg_kda'] + x_test['r2_avg_kda'] + x_test['r3_avg_kda'] + x_test['r4_avg_kda'] +x_test['r5_avg_kda']) - (x_test['d1_avg_kda'] + x_test['d2_avg_kda'] + x_test['d3_avg_kda'] + x_test['d4_avg_kda'] +x_test['d5_avg_kda'])
x_test['avg_tower_damage_diff'] = (x_test['r1_avg_tower_damage_per_min'] + x_test['r2_avg_tower_damage_per_min'] + x_test['r3_avg_tower_damage_per_min'] + x_test['r4_avg_tower_damage_per_min'] +x_test['r5_avg_tower_damage_per_min']) - (x_test['d1_avg_tower_damage_per_min'] + x_test['d2_avg_tower_damage_per_min'] + x_test['d3_avg_tower_damage_per_min'] + x_test['d4_avg_tower_damage_per_min'] +x_test['d5_avg_tower_damage_per_min'])
x_test['deaths_diff'] = (x_test['r1_deaths'] + x_test['r2_deaths'] + x_test['r3_deaths'] + x_test['r4_deaths'] + x_test['r5_deaths']) - (x_test['d1_deaths'] + x_test['d2_deaths'] + x_test['d3_deaths'] + x_test['d4_deaths'] + x_test['d5_deaths'])
x_test['xp_diff'] = (x_test['r1_xp'] + x_test['r2_xp'] + x_test['r3_xp'] + x_test['r4_xp'] + x_test['r5_xp']) - (x_test['d1_xp'] + x_test['d2_xp'] + x_test['d3_xp'] + x_test['d4_xp'] + x_test['d5_xp'])
x_test.drop(['radiant_gold', 'dire_gold'], axis=1, inplace=True)

x_test = x_test.fillna(0)

x_test = x_test[features]

scaler = MinMaxScaler()
x_test = scaler.fit_transform(x_test)

y_submission['radiant_won'] = clf.predict(x_test)

import time
import os

current_timestamp = int(time.time())
submission_path = 'submissions/{}.csv'.format(current_timestamp)

if not os.path.exists('submissions'):
    os.makedirs('submissions')

print(submission_path)
y_submission.to_csv(submission_path, index=True)

KeyError: "['radiant_gold' 'dire_gold'] not in index"

In [1]:
x = pd.read_csv('data/train_v2.csv')
x

NameError: name 'pd' is not defined