In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy import sparse

In [2]:
from setup import *
from data_transform import *

### Hypothesis 1

We need a feature matrix in which every column represents a unique characteristic (or feature) and every row represents a pass (or instance).

Let's assume this hypothesis-
- Let the first column represent the distance of the player from the sender's team who is closest to the sender, the second column represents the distance of the second closest player from the sender's team and so on. Thus, the 10th column represents the player from the sender's team farthest from the sender.
- In a similar way, let the 11th column represent the distance of the player from the opponent team who is closest to the sender, and likewise the 21st column will represent the distance of the player from the opponent team who is farthest from the sender.
- If all players, except the sender, are arranged in the order in which their distances from the sender appear in the hypothesis, the position of receiver in this ordered list is our prediction.

Thus, for every formation of the feature matrix as represented by our hypothesis, we have a prediction.

Let's see how a logistic regression classifier performs in this scenario.

In [3]:
# Load the precomputed feature matrix and label matrix from plain-text files.
# NOTE(review): per the hypothesis above, X presumably holds the sorted
# sender-to-player distances and Y the one-hot receiver position -- confirm
# against whatever script wrote these two files.
X = np.loadtxt('unscaled_featmat.txt')
Y = np.loadtxt('unscaled_labels.txt')

In [4]:
# Sanity check: X and Y must have the same number of rows to be usable as features/labels.
X.shape,Y.shape

((11678, 21), (11678, 21))

In [5]:
# Confirm the container type returned by np.loadtxt for the features.
type(X)

numpy.ndarray

In [6]:
# Confirm the container type returned by np.loadtxt for the labels.
type(Y)

numpy.ndarray

In [7]:
# Turn every row of the label matrix Y into a single 1-based class label:
# the position of the first entry equal to 1.0.
Y_list = []
for row in Y:
    for i, element in enumerate(row):
        if element == 1.0:
            Y_list.append(i + 1)
            # Stop at the first hit so a row can never contribute more than one
            # label (the previous trailing `continue` was a no-op and kept scanning).
            break

# Every row must yield exactly one label, otherwise Y_list no longer lines up
# row-for-row with X and the train/test split would pair features with the
# wrong targets (or fail with a less helpful length error).
if len(Y_list) != len(Y):
    raise ValueError(
        "Expected one label per row of Y, got %d labels for %d rows"
        % (len(Y_list), len(Y))
    )

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y_list,
    test_size=0.2,
    random_state=10,
)

# The estimators in the following cells are fitted on CSR copies of both splits.
sXtr, sXte = (sparse.csr_matrix(split) for split in (X_train, X_test))

In [9]:
# Logistic Regression

# L2-penalised logistic regression fitted on the sparse training split
# (`fit` returns the estimator itself, so it can be chained).
log = LogisticRegression(penalty='l2').fit(sXtr, Y_train)

# `score` reports mean accuracy on the given split.
training_accuracy = log.score(sXtr, Y_train)
test_accuracy = log.score(sXte, Y_test)

print("Accuracy on training data: %0.2f" % training_accuracy)
print("Accuracy on test data: %0.2f" % test_accuracy)

Accuracy on training data: 0.28
Accuracy on test data: 0.28


In [10]:
# Share of test rows that logistic regression assigns to class 1 -- a quick
# check of how strongly the model collapses onto a single label.
predicted = log.predict(sXte)
sum(1 for label in predicted if label == 1) / len(predicted)

0.7577054794520548

In [11]:
#Naive Bayes
from sklearn import naive_bayes

# NOTE(review): MultinomialNB is designed for count-like features; if X really
# holds distances (as the hypothesis text says) another NB variant may fit
# better -- confirm before relying on these numbers.
cnb = naive_bayes.MultinomialNB().fit(sXtr, Y_train)

# Mean accuracy on the training and the held-out split.
training_accuracy_nb = cnb.score(sXtr, Y_train)
test_accuracy_nb = cnb.score(sXte, Y_test)

print("Accuracy on training data: %0.2f" % training_accuracy_nb)
print("Accuracy on test data: %0.2f" % test_accuracy_nb)

Accuracy on training data: 0.07
Accuracy on test data: 0.07


In [12]:
# Share of test rows that Naive Bayes assigns to class 10.
predicted_nb = cnb.predict(sXte)
sum(1 for label in predicted_nb if label == 10) / len(predicted_nb)

0.20248287671232876

In [13]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier as rf

# Seed the forest so that bootstrap sampling and feature sub-sampling are
# repeatable; without it every run of this cell reports different accuracies.
# The seed matches the one already used for the train/test split.
# NOTE(review): n_estimators=1 is a single tree, which explains the large gap
# between training and test accuracy -- left unchanged here.
crf = rf(n_estimators=1, random_state=10)
crf.fit(sXtr, Y_train)

# Mean accuracy on the training and the held-out split.
training_accuracy_rf = crf.score(sXtr, Y_train)
test_accuracy_rf = crf.score(sXte, Y_test)

print("Accuracy on training data: %0.2f" %(training_accuracy_rf))
print("Accuracy on test data: %0.2f" %(test_accuracy_rf))

# Hard predictions and per-class probabilities for the test split; the
# predictions are reused by the comparison table further down.
predictedrf = crf.predict(sXte)
probrf = crf.predict_proba(sXte)

Accuracy on training data: 0.70
Accuracy on test data: 0.16


In [14]:
# Accuracy summary (in percent) for the three classifiers.
# `set_index` returns a new frame, so its result has to be kept; previously it
# was discarded and 'Metric' stayed an ordinary column next to a 0/1 index.
metrics = pd.DataFrame({
    'Metric': ['Test Accuracy', 'Train Accuracy'],
    'RF': [test_accuracy_rf*100, training_accuracy_rf*100],
    'NB': [test_accuracy_nb*100, training_accuracy_nb*100],
    'LR': [test_accuracy*100, training_accuracy*100],
}).set_index('Metric')
metrics

Unnamed: 0,Metric,RF,NB,LR
0,Test Accuracy,16.395548,6.763699,27.782534
1,Train Accuracy,69.685292,7.43952,27.820595


In [15]:
# Side-by-side view of the true test labels and each model's prediction.
a = pd.DataFrame({
    'Original Test Labels': Y_test,
    'RF predicted': predictedrf,
    'NB predicted': predicted_nb,
    'LR predictd': predicted,
})
a.head()

Unnamed: 0,Original Test Labels,RF predicted,NB predicted,LR predictd
0,12,7,18,3
1,4,4,12,1
2,9,21,12,1
3,2,3,12,1
4,1,2,5,1


In [16]:
# Same comparison on the training split: true labels next to each model's prediction.
b = pd.DataFrame({
    'Original Train Labels': Y_train,
    'RF predicted': crf.predict(sXtr),
    'NB predicted': cnb.predict(sXtr),
    'LR predictd': log.predict(sXtr),
})
b.head()

Unnamed: 0,Original Train Labels,RF predicted,NB predicted,LR predictd
0,2,2,12,1
1,2,1,11,1
2,2,2,12,1
3,1,1,21,1
4,9,9,17,2


The accuracy indicates that the assumed hypothesis is flawed. We do not need to look at other evaluation metrics because the accuracy is very poor in the first place.
A better hypothesis could be chosen.

There are a number of instances where a player unintentionally passes the ball to a player from the other team (called an interception).
An obvious way of improving the hypothesis would be to delete all such interceptions.
Let's do that and see where it takes us. An additional advantage of doing so is a drastic reduction in the number of labels: since we no longer consider passes to players from the opposite team, we are left with only 10 classes, each representing a player from the sender's team.

In [17]:
# Delete unintentional passes
file = pd.read_csv('passes.csv')
data = transform(file)

# Keep only passes whose sender and receiver ids fall on the same side of the
# 14/15 boundary.
# NOTE(review): ids up to 14 presumably belong to one team and ids above 14 to
# the other -- confirm against the data description.
high_id_passes = data[(data.sender > 14) & (data.receiver > 14)]
low_id_passes = data[(data.sender < 15) & (data.receiver < 15)]
frames = [high_id_passes, low_id_passes]

# Concatenate once (the earlier duplicate concat whose result was thrown away
# is gone) and renumber the rows 0..n-1.
newdata = pd.concat(frames).reset_index(drop=True)

In [18]:
# Summary statistics of the numeric columns after the same-team filter.
newdata.describe()

Unnamed: 0,sender,receiver,time_start,time_end
count,10047.0,10047.0,10047.0,10047.0
mean,13.283269,13.463322,1343315.0,1344877.0
std,8.227732,8.256901,815460.2,815451.2
min,1.0,1.0,100.0,100.0
25%,6.0,6.0,636200.0,637550.0
50%,12.0,13.0,1323900.0,1325200.0
75%,20.0,21.0,2046300.0,2047850.0
max,28.0,28.0,2940200.0,2941000.0


In [19]:
def _position_missing(idx, role):
    """Tell whether the `role` player of pass `idx` has a non-finite first position value.

    `role` is the column holding the player id ('sender' or 'receiver').
    Rows whose id itself is not finite are not flagged.
    """
    player_id = newdata.at[idx, role]
    if not np.isfinite(player_id):
        return False
    # The player's data lives in column 'P<id>'; only its first element is checked.
    return not np.isfinite(newdata.at[idx, 'P' + str(int(player_id))][0])


# Row labels of passes that must be skipped when building the features.
remove = []

# Same sender and receiver
for index_ in newdata.index:
    if newdata.at[index_, 'sender'] == newdata.at[index_, 'receiver']:
        remove.append(index_)
        print (index_)

# Receiver is nan
for index_ in newdata.index:
    if _position_missing(index_, 'receiver'):
        remove.append(index_)
        print (index_)

# Sender is nan
for index_ in newdata.index:
    if _position_missing(index_, 'sender'):
        remove.append(index_)
        print (index_)

262
873
3467
3731
4289
5852
357
462


In [20]:
# Try both helpers on the first pass (row 0) with all_players=False before
# running them over the whole frame.
# NOTE(review): map_player and make_feature come from the star imports at the
# top of the notebook; map_player's return value is not used here, so it is
# presumably exploratory or called for a side effect -- confirm.
map_player(newdata,0,all_players=False)
x,y=make_feature(newdata,0,all_players=False)

In [21]:
import time

start = time.time()

# Collect per-pass rows in Python lists and concatenate once at the end;
# calling np.append inside the loop re-copies the whole array on every
# iteration, which makes the loop quadratic in the number of passes.
# Each list starts with the all-zero placeholder row that this cell has always
# produced, because the cell that follows deletes row 0 of X and Y.
skip = set(remove)
x_rows = [np.zeros((1, 10))]
y_rows = [np.zeros((1, 10))]

for i in newdata.index:
    if i not in skip:
        x, y = make_feature(newdata, i, all_players=False)
        # Only keep passes for which all 10 features are available.
        if x.shape[0] == 10:
            x_rows.append(np.array([x]))
            y_rows.append(y)

    if i%500 == 0:
        print ('#####  Reached ', i,'/',len(newdata))

X = np.concatenate(x_rows, axis=0)
Y = np.concatenate(y_rows, axis=0)

end = time.time()

print ('Time taken to run this block= ',(end-start)/60,' min.')

#####  Reached  0 / 10047
#####  Reached  500 / 10047
#####  Reached  1000 / 10047
#####  Reached  1500 / 10047
#####  Reached  2000 / 10047
#####  Reached  2500 / 10047
#####  Reached  3000 / 10047
#####  Reached  3500 / 10047
#####  Reached  4000 / 10047
#####  Reached  4500 / 10047
#####  Reached  5000 / 10047
#####  Reached  5500 / 10047
#####  Reached  6000 / 10047
#####  Reached  6500 / 10047
#####  Reached  7000 / 10047
#####  Reached  7500 / 10047
#####  Reached  8000 / 10047
#####  Reached  8500 / 10047
#####  Reached  9000 / 10047
#####  Reached  9500 / 10047
#####  Reached  10000 / 10047
Time taken to run this block=  1.9935332655906677  min.


In [22]:
# Shapes right after the feature loop; both still include the all-zero placeholder row.
X.shape,Y.shape

((9949, 10), (9949, 10))

In [23]:
# Drop the all-zero placeholder in row 0 that was used to seed both arrays.
# Caution: this cell is not idempotent -- running it again removes a real row.
X, Y = (np.delete(matrix, 0, axis=0) for matrix in (X, Y))

In [24]:
# Shapes after removing the placeholder row: one row fewer than before.
X.shape, Y.shape

((9948, 10), (9948, 10))

In [25]:
# Turn every row of the label matrix Y into a single 1-based class label:
# the position of the first entry equal to 1.
Y_list = []
for row in Y:
    for i, element in enumerate(row):
        if element == 1:
            Y_list.append(i + 1)
            # Stop at the first hit so a row can never contribute more than one
            # label (the previous trailing `continue` was a no-op and kept scanning).
            break

# Every row must yield exactly one label, otherwise Y_list no longer lines up
# row-for-row with X and the train/test split would pair features with the
# wrong targets (or fail with a less helpful length error).
if len(Y_list) != len(Y):
    raise ValueError(
        "Expected one label per row of Y, got %d labels for %d rows"
        % (len(Y_list), len(Y))
    )

In [26]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y_list,
    test_size=0.2,
    random_state=10,
)

# The estimators in the following cells are fitted on CSR copies of both splits.
sXtr, sXte = (sparse.csr_matrix(split) for split in (X_train, X_test))

In [27]:
# Logistic Regression

# L2-penalised logistic regression with class weights balanced by label
# frequency (`fit` returns the estimator itself, so it can be chained).
log = LogisticRegression(penalty='l2', class_weight='balanced').fit(sXtr, Y_train)

# `score` reports mean accuracy on the given split.
training_accuracy = log.score(sXtr, Y_train)
test_accuracy = log.score(sXte, Y_test)

print("Accuracy on training data: %0.2f" % training_accuracy)
print("Accuracy on test data: %0.2f" % test_accuracy)

Accuracy on training data: 0.27
Accuracy on test data: 0.26


In [28]:
# Share of test rows that logistic regression assigns to class 1.
predicted = log.predict(sXte)
sum(1 for label in predicted if label == 1) / len(predicted)

0.37236180904522614

In [29]:
#Naive Bayes
from sklearn import naive_bayes

# NOTE(review): MultinomialNB is designed for count-like features; if X really
# holds distances (as the hypothesis text says) another NB variant may fit
# better -- confirm before relying on these numbers.
cnb = naive_bayes.MultinomialNB().fit(sXtr, Y_train)

# Mean accuracy on the training and the held-out split.
training_accuracy_nb = cnb.score(sXtr, Y_train)
test_accuracy_nb = cnb.score(sXte, Y_test)

print("Accuracy on training data: %0.2f" % training_accuracy_nb)
print("Accuracy on test data: %0.2f" % test_accuracy_nb)

# Share of test rows that Naive Bayes assigns to class 10.
predicted_nb = cnb.predict(sXte)
sum(1 for label in predicted_nb if label == 10) / len(predicted_nb)

Accuracy on training data: 0.17
Accuracy on test data: 0.17


0.2743718592964824

In [30]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier as rf

# Seed the forest so that bootstrap sampling and feature sub-sampling are
# repeatable; without it every run of this cell reports different accuracies.
# The seed matches the one already used for the train/test split.
# NOTE(review): n_estimators=1 is a single tree, which explains the large gap
# between training and test accuracy -- left unchanged here.
crf = rf(n_estimators=1, random_state=10)
crf.fit(sXtr, Y_train)

# Mean accuracy on the training and the held-out split.
training_accuracy_rf = crf.score(sXtr, Y_train)
test_accuracy_rf = crf.score(sXte, Y_test)

print("Accuracy on training data: %0.2f" %(training_accuracy_rf))
print("Accuracy on test data: %0.2f" %(test_accuracy_rf))

# Hard predictions and per-class probabilities for the test split; the
# predictions are reused by the comparison table further down.
predictedrf = crf.predict(sXte)
probrf = crf.predict_proba(sXte)

Accuracy on training data: 0.71
Accuracy on test data: 0.22


In [31]:
# Accuracy summary (in percent) for the three classifiers.
# `set_index` returns a new frame, so its result has to be kept; previously it
# was discarded and 'Metric' stayed an ordinary column next to a 0/1 index.
metrics = pd.DataFrame({
    'Metric': ['Test Accuracy', 'Train Accuracy'],
    'RF': [test_accuracy_rf*100, training_accuracy_rf*100],
    'NB': [test_accuracy_nb*100, training_accuracy_nb*100],
    'LR': [test_accuracy*100, training_accuracy*100],
}).set_index('Metric')
metrics

Unnamed: 0,Metric,RF,NB,LR
0,Test Accuracy,21.809045,17.286432,26.030151
1,Train Accuracy,71.010304,17.265645,27.330988


In [32]:
# Side-by-side view of the true test labels and each model's prediction.
a = pd.DataFrame({
    'Original Test Labels': Y_test,
    'RF predicted': predictedrf,
    'NB predicted': predicted_nb,
    'LR predictd': predicted,
})
# Show a preview only instead of dumping the whole frame, in line with the
# equivalent cell for the first hypothesis, which uses head().
a.head(10)

Unnamed: 0,Original Test Labels,RF predicted,NB predicted,LR predictd
0,4,8,10,2
1,1,4,9,4
2,2,4,9,3
3,3,6,9,2
4,1,1,5,10
5,4,7,9,9
6,1,7,1,1
7,3,4,7,5
8,4,1,10,1
9,1,5,10,4


In [33]:
# Same comparison on the training split: true labels next to each model's prediction.
b = pd.DataFrame({
    'Original Train Labels': Y_train,
    'RF predicted': crf.predict(sXtr),
    'NB predicted': cnb.predict(sXtr),
    'LR predictd': log.predict(sXtr),
})
# Show a preview only instead of dumping the whole frame, in line with the
# equivalent cell for the first hypothesis, which uses head().
b.head(10)

Unnamed: 0,Original Train Labels,RF predicted,NB predicted,LR predictd
0,3,3,9,9
1,4,2,3,4
2,6,6,9,9
3,1,6,10,1
4,1,1,9,4
5,5,5,4,4
6,3,3,10,3
7,1,4,10,2
8,3,2,10,1
9,2,2,2,2
