In [1]:
# import standard libraries
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import itertools

from sklearn.datasets import load_iris
from sklearn import tree, preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from shutil import copyfile

In [2]:
# load train data
train_data = pd.read_csv('../data/MoreManipulatedData_bucket_last_col.csv',usecols=lambda x: 'PLAYER_URL' not in x)

# different classes
train_data.BUCKET.unique()

# convert string values to numerical data
def convert(data):
    number = preprocessing.LabelEncoder()
    data['POS'] = number.fit_transform(data.POS)
    data['LEAGUE'] = number.fit_transform(data.LEAGUE)
    data['FIRST_JUNIOR_YEAR'] = number.fit_transform(data.FIRST_JUNIOR_YEAR)
    data['DOB'] = number.fit_transform(data.DOB)
    data['NATIONALITY'] = number.fit_transform(data.NATIONALITY)
    data['SHOOTS'] = number.fit_transform(data.SHOOTS)
    data=data.fillna(-999)
    return data

# convert string values in data to numerical classes
train_data = convert(train_data)

# Separating the data and the labels
X = np.asarray(train_data[train_data.columns[:-1]])
y = np.asarray(train_data.BUCKET)

# Splitting the data into the train and the test sets
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
sss.get_n_splits(X, y)

train_index, test_index = next(sss.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

print('Training data: \n',X)
print('\n')
print('Training labels: \n',y_train)

Training data: 
 [[103143      3     87 ...    180     84      2]
 [106210      1     73 ...    193     95      0]
 [106207      2     21 ...    183     86      0]
 ...
 [249307      1    130 ...    188     98      3]
 [485033      3     59 ...    175     68      2]
 [410552      2      1 ...    183     82      2]]


Training labels: 
 ['0' '0' '0' ... '0' '0' '0']


In [17]:
# K Nearest Neighbors Classifier (has to be able to deal with floats)
MLP = MLPClassifier()
MLP.n_layers_ = 6

# fit the classifier using the training data
MLP = MLP.fit(X_train, y_train)

# Predict the test class labels using the trained KNN classifier 
y_pred = MLP.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.18      0.72      0.29       408
    201-600        0.14      0.14      0.14       156
    601+           0.50      0.09      0.15       127
           0       0.98      0.80      0.88      6009

    accuracy                           0.76      6700
   macro avg       0.45      0.44      0.36      6700
weighted avg       0.90      0.76      0.81      6700



In [18]:
# K Nearest Neighbors Classifier (has to be able to deal with floats)
MLP = MLPClassifier()
MLP.n_layers_ = 4

# fit the classifier using the training data
MLP = MLP.fit(X_train, y_train)

# Predict the test class labels using the trained KNN classifier 
y_pred = MLP.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.26      0.06      0.10       408
    201-600        0.17      0.37      0.24       156
    601+           0.33      0.30      0.32       127
           0       0.94      0.96      0.95      6009

    accuracy                           0.88      6700
   macro avg       0.43      0.42      0.40      6700
weighted avg       0.87      0.88      0.87      6700



In [19]:
# K Nearest Neighbors Classifier (has to be able to deal with floats)
MLP = MLPClassifier()
MLP.n_layers_ = 6

# fit the classifier using the training data
MLP = MLP.fit(X_train, y_train)

# Predict the test class labels using the trained KNN classifier 
y_pred = MLP.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.60      0.01      0.01       408
    201-600        0.25      0.01      0.02       156
    601+           0.59      0.10      0.17       127
           0       0.90      1.00      0.95      6009

    accuracy                           0.90      6700
   macro avg       0.59      0.28      0.29      6700
weighted avg       0.86      0.90      0.85      6700



In [20]:
# K Nearest Neighbors Classifier (has to be able to deal with floats)
MLP = MLPClassifier(hidden_layer_sizes=(200, ))
MLP.n_layers_ = 3

# fit the classifier using the training data
MLP = MLP.fit(X_train, y_train)

# Predict the test class labels using the trained KNN classifier 
y_pred = MLP.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.24      0.41      0.30       408
    201-600        0.00      0.00      0.00       156
    601+           0.34      0.38      0.36       127
           0       0.95      0.92      0.93      6009

    accuracy                           0.86      6700
   macro avg       0.38      0.43      0.40      6700
weighted avg       0.87      0.86      0.86      6700



In [21]:
# K Nearest Neighbors Classifier (has to be able to deal with floats)
MLP = MLPClassifier(hidden_layer_sizes=(300, ))
MLP.n_layers_ = 3

# fit the classifier using the training data
MLP = MLP.fit(X_train, y_train)

# Predict the test class labels using the trained KNN classifier 
y_pred = MLP.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.36      0.09      0.15       408
    201-600        0.33      0.01      0.01       156
    601+           0.33      0.15      0.21       127
           0       0.91      0.99      0.95      6009

    accuracy                           0.90      6700
   macro avg       0.49      0.31      0.33      6700
weighted avg       0.85      0.90      0.87      6700



In [22]:
# K Nearest Neighbors Classifier (has to be able to deal with floats)
MLP = MLPClassifier(hidden_layer_sizes=(400, ))
MLP.n_layers_ = 3

# fit the classifier using the training data
MLP = MLP.fit(X_train, y_train)

# Predict the test class labels using the trained KNN classifier 
y_pred = MLP.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.36      0.08      0.13       408
    201-600        0.11      0.17      0.13       156
    601+           0.31      0.09      0.13       127
           0       0.92      0.97      0.95      6009

    accuracy                           0.88      6700
   macro avg       0.42      0.33      0.34      6700
weighted avg       0.86      0.88      0.86      6700



In [24]:
from joblib import dump, load
dump(MLP, '../finimlp_hid100_layer3.joblib') 

['mlp_hid100_layer3.joblib']

In [26]:
MLP2 = load('mlp_hid100_layer3.joblib') 
# Predict the test class labels using the trained KNN classifier 
y_pred = MLP2.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.36      0.08      0.13       408
    201-600        0.11      0.17      0.13       156
    601+           0.31      0.09      0.13       127
           0       0.92      0.97      0.95      6009

    accuracy                           0.88      6700
   macro avg       0.42      0.33      0.34      6700
weighted avg       0.86      0.88      0.86      6700

