In [1]:
# import standard libraries
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import itertools

from sklearn.datasets import load_iris
from sklearn import tree, preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from shutil import copyfile

In [2]:
# load train data
train_data = pd.read_csv('../data/MoreManipulatedData_bucket_last_col.csv',usecols=lambda x: 'PLAYER_URL' not in x)

# different classes
train_data.BUCKET.unique()

# convert string values to numerical data
def convert(data):
    number = preprocessing.LabelEncoder()
    data['POS'] = number.fit_transform(data.POS)
    data['LEAGUE'] = number.fit_transform(data.LEAGUE)
    data['FIRST_JUNIOR_YEAR'] = number.fit_transform(data.FIRST_JUNIOR_YEAR)
    data['DOB'] = number.fit_transform(data.DOB)
    data['NATIONALITY'] = number.fit_transform(data.NATIONALITY)
    data['SHOOTS'] = number.fit_transform(data.SHOOTS)
    data=data.fillna(-999)
    return data

# convert string values in data to numerical classes
train_data = convert(train_data)

# Separating the data and the labels
X = np.asarray(train_data[train_data.columns[:-1]])
y = np.asarray(train_data.BUCKET)

# Splitting the data into the train and the test sets
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
sss.get_n_splits(X, y)

train_index, test_index = next(sss.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

print('Training data: \n',X)
print('\n')
print('Training labels: \n',y_train)

Training data: 
 [[103143      3     87 ...    180     84      2]
 [106210      1     73 ...    193     95      0]
 [106207      2     21 ...    183     86      0]
 ...
 [249307      1    130 ...    188     98      3]
 [485033      3     59 ...    175     68      2]
 [410552      2      1 ...    183     82      2]]


Training labels: 
 ['0' '0' '0' ... '0' '0' '0']


In [30]:
# K Nearest Neighbors Classifier (has to be able to deal with floats)
MLP = MLPClassifier(hidden_layer_sizes=(100, ))
MLP.n_layers_ = 8

# fit the classifier using the training data
MLP = MLP.fit(X_train, y_train)

# Predict the test class labels using the trained KNN classifier 
y_pred = MLP.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.67      0.01      0.02       408
    201-600        0.21      0.20      0.20       156
    601+           1.00      0.04      0.08       127
           0       0.91      0.99      0.95      6009

    accuracy                           0.90      6700
   macro avg       0.70      0.31      0.31      6700
weighted avg       0.88      0.90      0.86      6700



In [31]:
from joblib import dump, load
dump(MLP, '../finished_models/mlp_hid100_layer8.joblib') 

['../finished_models/mlp_hid100_layer8.joblib']

In [34]:
MLP2 = load('../finished_models/mlp_hid100_layer8.joblib') 
# Predict the test class labels using the trained KNN classifier 
y_pred = MLP2.predict(X_test)

# print accuracy of the classifier
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    1-200          0.67      0.01      0.02       408
    201-600        0.21      0.20      0.20       156
    601+           1.00      0.04      0.08       127
           0       0.91      0.99      0.95      6009

    accuracy                           0.90      6700
   macro avg       0.70      0.31      0.31      6700
weighted avg       0.88      0.90      0.86      6700

