# Import train dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Use a large default canvas for every figure drawn in this notebook.
mpl.rcParams['figure.figsize'] = (13, 13)

# Read the training file without a header row, so pandas assigns integer
# column labels.
dataset = pd.read_csv(
    'dota2Train.csv',
    header=None,
)

# Correlation Matrix

In [None]:
import seaborn as sns

def table_corr(df):
    """Show an annotated heatmap of the pairwise column correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``corr()`` matrix is plotted.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    correlations = df.corr()
    sns.heatmap(data=correlations, cmap='summer', annot=True)
    plt.show()


table_corr(dataset)

# Get target and data features

Checking the correlation matrix, we can see that columns 1, 2, and 3 do not affect our model in a good way, so they are left out of the feature set below.

In [2]:
# Column 0 is the target; features start at column 4 (columns 1-3 are left out).
y_train = dataset.iloc[:, 0]
X_train = dataset.iloc[:, 4:]

# Import test dataset

In [3]:
# Load the test file (no header row) and split it with the same column layout
# as the training data: column 0 is the target, columns 4+ are the features.
test_dataset = pd.read_csv(
    'dota2Test.csv',
    header=None,
)
y_test = test_dataset.iloc[:, 0]
X_test = test_dataset.iloc[:, 4:]

# KNN

## Calculate best number of neighbors

In [None]:
# Candidate neighbour counts to evaluate: k = 1, 2, 3, 4.
n = np.arange(1, 5)
train_accuracy = np.empty(n.shape)
test_accuracy = np.empty(n.shape)

for i, k in enumerate(n):
    # Fit one classifier per candidate k on the training split.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Record the accuracy on the training set, then on the testing set.
    train_accuracy[i], test_accuracy[i] = (
        knn.score(X_train, y_train),
        knn.score(X_test, y_test),
    )

In [None]:
# Accuracy-versus-k curves for both splits, to spot over- and underfitting.
plt.plot(n, test_accuracy, label='Testing Accuracy')
plt.plot(n, train_accuracy, label='Training Accuracy')
plt.title('k-NN accuracy: Varying Number of Neighbors')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Prediction

In [4]:
# NOTE(review): k=6 lies outside the 1-4 range scanned in the search cell —
# confirm how this value was chosen.
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
pred = knn.predict(X_test)

In [5]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose k-NN prediction matches the true label.
acc_knn = accuracy_score(y_true=y_test, y_pred=pred)
print(acc_knn)

0.520400233146


# Naive Bayes

In [43]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training features.
gnb = GaussianNB().fit(X_train, y_train)
pred = gnb.predict(X_test)

In [44]:
# Test-set accuracy of the predictions currently held in `pred`.
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(acc)

0.56401787449


# Decision tree

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Fix the seed: the tree learner randomly permutes the candidate features at
# each split, so without random_state the fitted tree (and therefore the
# accuracy and confusion matrix reported below) can change between runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
pred_dt = dt.predict(X_test)

In [46]:
# Test-set accuracy of the decision-tree predictions.
acc_dt = accuracy_score(y_true=y_test, y_pred=pred_dt)
print(acc_dt)

0.519428793472


In [77]:
# Confusion matrix of the decision-tree predictions: rows are true labels,
# columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=pred_dt)
print(cm)

[[2394 2398]
 [2549 2953]]


# Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap samples and per-split feature subsets so the
# reported accuracy is reproducible on a fresh kernel; n_jobs=-1 builds the
# 1000 trees on all available CPU cores instead of one.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [80]:
# Test-set accuracy of the random-forest predictions.
acc_rfc = accuracy_score(y_true=y_test, y_pred=pred_rfc)
print(acc_rfc)

0.590829609481
