In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
df = pd.read_csv("fifer.csv")

In [3]:
df['Club Jersey Number'].head()

0    30
1     9
2     9
3    17
4     7
Name: Club Jersey Number, dtype: object

In [4]:
df.columns

Index(['Known As', 'Full Name', 'Overall', 'Potential', 'Value(in Euro)',
       'Positions Played', 'Best Position', 'Nationality', 'Image Link', 'Age',
       'Height(in cm)', 'Weight(in kg)', 'TotalStats', 'BaseStats',
       'Club Name', 'Wage(in Euro)', 'Release Clause', 'Club Position',
       'Contract Until', 'Club Jersey Number', 'Joined On', 'On Loan',
       'Preferred Foot', 'Weak Foot Rating', 'Skill Moves',
       'International Reputation', 'National Team Name',
       'National Team Image Link', 'National Team Position',
       'National Team Jersey Number', 'Attacking Work Rate',
       'Defensive Work Rate', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'Freekick Accuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Powe

In [5]:
#I only isolate the players with the associated numbers I want the AI to predict
t = 'Club Jersey Number'
df = df[df[t].isin(['1','2','3','4','5','6','7','8','9','10','11'])]  

In [6]:
#these are going to be the columns we use in the machine learning models, then we isolate our y output as the jersey number and remove it from the rest
things = ['Overall', 'Potential',
       'Best Position', 
       'Club Jersey Number', 'Skill Moves',
       'Attacking Work Rate',
       'Defensive Work Rate', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'Freekick Accuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'Standing Tackle', 'Sliding Tackle',
       'Goalkeeper Diving', 'Goalkeeper Handling', ' GoalkeeperKicking',
       'Goalkeeper Positioning', 'Goalkeeper Reflexes']
df = df[things]
X = df.drop("Club Jersey Number", axis="columns")
y = df["Club Jersey Number"]

In [7]:
X = pd.get_dummies(X)

In [8]:
X.head()

Unnamed: 0,Overall,Potential,Skill Moves,Pace Total,Shooting Total,Passing Total,Dribbling Total,Defending Total,Physicality Total,Crossing,...,Best Position_RM,Best Position_RW,Best Position_RWB,Best Position_ST,Attacking Work Rate_High,Attacking Work Rate_Low,Attacking Work Rate_Medium,Defensive Work Rate_High,Defensive Work Rate_Low,Defensive Work Rate_Medium
1,91,91,4,80,88,83,87,39,78,75,...,False,False,False,False,False,False,True,False,False,True
2,91,91,4,75,91,79,86,44,83,71,...,False,False,False,True,True,False,False,False,False,True
4,91,95,5,97,89,80,92,36,76,78,...,False,False,False,True,True,False,False,False,True,False
5,90,90,4,90,89,82,90,45,75,80,...,False,True,False,False,True,False,False,False,False,True
6,90,91,1,84,89,75,90,46,89,14,...,False,False,False,False,False,False,True,False,False,True


In [9]:
#Declare a scaler and fit it to the x values to ensure that the model is not skewed by any large numbers and whatnot
scaler = StandardScaler()

In [10]:
scaler.fit(X)
#X = scaler.transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10, shuffle=True)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
#model =ensemble.GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.1, max_depth=10,min_samples_split=4,min_samples_leaf=6,max_features=0.6,loss='log_loss')
#model = RandomForestClassifier(n_estimators = 1000)
#model = DecisionTreeClassifier()
model = KNeighborsClassifier(n_neighbors = 111)

In [13]:
model.fit(X_train, y_train)

In [14]:
p = model.predict(X_test)

In [15]:
print(confusion_matrix(y_test, p))
print(classification_report(y_test, p))

[[136   0   0   0   0   0   0   0   0   0   0]
 [  0  46  13   0   2   0   0   1  20  24  27]
 [  0  30  24   0   8   0   0   0  34   7  42]
 [  0   1   0  68  14  28  10   5   7   2   0]
 [  0   0   2  12  43  58  21   3   0   1   0]
 [  0   1   0  13   6  84  21  14   3  14   1]
 [  0   5   0   7  17  65  30   8   0  22   1]
 [  0   9   3   7   4  31  20  23   7  43   1]
 [  0  28  24   3   3   0   1   5  28  22  21]
 [  0  22   4   3   4   4   2  15  12  70   8]
 [  0   8   3   0   0   0   0   0   7   1 130]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       136
          10       0.31      0.35      0.33       133
          11       0.33      0.17      0.22       145
           2       0.60      0.50      0.55       135
           3       0.43      0.31      0.36       140
           4       0.31      0.54      0.39       157
           5       0.29      0.19      0.23       155
           6       0.31      0.16      0.21      

In [35]:
thing = pd.read_csv("fifer.csv")
pn = 18
thing.iloc[[pn]].head()

Unnamed: 0,Known As,Full Name,Overall,Potential,Value(in Euro),Positions Played,Best Position,Nationality,Image Link,Age,...,LM Rating,CM Rating,RM Rating,LWB Rating,CDM Rating,RWB Rating,LB Rating,CB Rating,RB Rating,GK Rating
18,Alisson,Alisson Ramses Becker,89,90,79000000,GK,GK,Brazil,https://cdn.sofifa.net/players/212/831/23_60.png,29,...,45,49,45,36,42,36,35,35,35,90


In [36]:
things = ['Overall', 'Potential',
       'Best Position', 
       'Club Jersey Number', 'Skill Moves',
       'Attacking Work Rate',
       'Defensive Work Rate', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'Freekick Accuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'Standing Tackle', 'Sliding Tackle',
       'Goalkeeper Diving', 'Goalkeeper Handling', ' GoalkeeperKicking',
       'Goalkeeper Positioning', 'Goalkeeper Reflexes']
thing = thing[things]
thing = thing.drop("Club Jersey Number", axis="columns")
thing = pd.get_dummies(thing)

In [37]:
player = thing.iloc[[pn]]
#player.head()

In [38]:
player = scaler.transform(player)

In [39]:
model.predict(player)

array(['1'], dtype=object)

In [40]:
import eli5
from eli5.sklearn import PermutationImportance

# Make a small change to the code below to use in this problem. 
perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)

# uncomment the following line to visualize your results
eli5.show_weights(perm, feature_names = X.columns.tolist(), top=None)

Weight,Feature
0.0224  ± 0.0047,Best Position_RB
0.0179  ± 0.0063,Best Position_LB
0.0094  ± 0.0044,Best Position_LWB
0.0089  ± 0.0014,Best Position_RWB
0.0068  ± 0.0053,Overall
0.0067  ± 0.0056,Vision
0.0060  ± 0.0013,Sprint Speed
0.0056  ± 0.0048,Reactions
0.0052  ± 0.0058,Positioning
0.0052  ± 0.0028,Volleys
