**Importing Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

**Implementing KNN From Scratch**

In [2]:
from collections import Counter

def euclidean_distance(pn1, pn2):
    return np.sqrt(np.sum((pn1 - pn2)**2))

def find_k_nearest_indices(distances, k_neighbors):
    nearest_indices = np.argsort(distances)[:k_neighbors]
    return nearest_indices

def majority_vote(nearest_labels):
    label_counts = Counter(nearest_labels)
    return label_counts.most_common()[0][0]

def predict_knn(x_train, y_train, x_test, k_neighbors=5):
    y_pred = []

    for x in x_test:
        distances = []

        for x_train_instance in x_train:
            distance = euclidean_distance(x, x_train_instance)
            distances.append(distance)

        nearest_indices = find_k_nearest_indices(distances, k_neighbors)
        nearest_labels = y_train[nearest_indices]
        y_pred.append(majority_vote(nearest_labels))

    return y_pred

In [3]:
df = pd.read_csv('players_fifa23.csv')

In [4]:
df

Unnamed: 0,ID,Name,FullName,Age,Height,Weight,PhotoUrl,Nationality,Overall,Potential,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
0,158023,L. Messi,Lionel Messi,35,169,67,https://cdn.sofifa.net/players/158/023/23_60.png,Argentina,91,91,...,91,88,91,67,66,67,62,53,62,22
1,165153,K. Benzema,Karim Benzema,34,185,81,https://cdn.sofifa.net/players/165/153/23_60.png,France,91,91,...,89,84,89,67,67,67,63,58,63,21
2,188545,R. Lewandowski,Robert Lewandowski,33,185,81,https://cdn.sofifa.net/players/188/545/23_60.png,Poland,91,91,...,86,83,86,67,69,67,64,63,64,22
3,192985,K. De Bruyne,Kevin De Bruyne,31,181,70,https://cdn.sofifa.net/players/192/985/23_60.png,Belgium,91,91,...,91,91,91,82,82,82,78,72,78,24
4,231747,K. Mbappé,Kylian Mbappé,23,182,73,https://cdn.sofifa.net/players/231/747/23_60.png,France,91,95,...,92,84,92,70,66,70,66,57,66,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,243725,D. Collins,Darren Collins,21,174,68,https://cdn.sofifa.net/players/243/725/23_60.png,Republic of Ireland,47,56,...,50,44,50,41,38,41,40,36,40,15
18535,261933,Yang Dejiang,Dejiang Yang,17,175,60,https://cdn.sofifa.net/players/261/933/23_60.png,China PR,47,57,...,45,45,45,47,48,47,49,49,49,15
18536,267823,L. Mullan,Liam Mullan,18,170,65,https://cdn.sofifa.net/players/267/823/23_60.png,Northern Ireland,47,67,...,52,49,52,46,44,46,46,42,46,17
18537,267824,D. McCallion,Daithí McCallion,17,178,65,https://cdn.sofifa.net/players/267/824/23_60.png,Republic of Ireland,47,61,...,33,33,33,44,42,44,47,49,47,15


**Data Preprocessing**

In [5]:
df.isna().sum()

ID           0
Name         0
FullName     0
Age          0
Height       0
            ..
RWBRating    0
LBRating     0
CBRating     0
RBRating     0
GKRating     0
Length: 90, dtype: int64

In [6]:
df.dropna() #Dropping Null Values

Unnamed: 0,ID,Name,FullName,Age,Height,Weight,PhotoUrl,Nationality,Overall,Potential,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
0,158023,L. Messi,Lionel Messi,35,169,67,https://cdn.sofifa.net/players/158/023/23_60.png,Argentina,91,91,...,91,88,91,67,66,67,62,53,62,22
1,165153,K. Benzema,Karim Benzema,34,185,81,https://cdn.sofifa.net/players/165/153/23_60.png,France,91,91,...,89,84,89,67,67,67,63,58,63,21
2,188545,R. Lewandowski,Robert Lewandowski,33,185,81,https://cdn.sofifa.net/players/188/545/23_60.png,Poland,91,91,...,86,83,86,67,69,67,64,63,64,22
3,192985,K. De Bruyne,Kevin De Bruyne,31,181,70,https://cdn.sofifa.net/players/192/985/23_60.png,Belgium,91,91,...,91,91,91,82,82,82,78,72,78,24
4,231747,K. Mbappé,Kylian Mbappé,23,182,73,https://cdn.sofifa.net/players/231/747/23_60.png,France,91,95,...,92,84,92,70,66,70,66,57,66,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13391,198776,J. Williams,Jonny Williams,28,168,60,https://cdn.sofifa.net/players/198/776/23_60.png,Wales,62,62,...,62,62,62,62,60,62,60,54,60,19
13591,173752,C. Gunter,Chris Gunter,32,180,71,https://cdn.sofifa.net/players/173/752/23_60.png,Wales,62,62,...,60,60,60,62,62,62,62,62,62,19
13678,233946,Tong Lei,Lei Tong,24,178,66,https://cdn.sofifa.net/players/233/946/23_60.png,China PR,62,66,...,62,60,62,63,60,63,62,59,62,16
13698,259680,M. Stamenic,Marko Stamenic,20,188,78,https://cdn.sofifa.net/players/259/680/23_60.png,New Zealand,62,75,...,64,64,64,63,64,63,63,63,63,17


In [25]:
# Print the number of unique positions
num_unique_positions = df['BestPosition'].nunique()
print("Number of positions: {}".format(num_unique_positions))

# Print the list of positions
positions_list = df['BestPosition'].unique().tolist()
print("Positions List: {}".format(positions_list))

Number of positions: 15
Positions List: ['CAM', 'CF', 'ST', 'CM', 'RW', 'GK', 'CB', 'LW', 'CDM', 'LM', 'LB', 'RM', 'RB', 'LWB', 'RWB']


**Grouping Positions to the Major Positions (i.e Forwards,MidFielders, Defenders and GoalKeepers)**

Grouping CF, ST, LW, RW -> FW (Forwards)

Grouping CM, CAM, CDM,LM, RM -> MF (MidFielders)

Grouping LB, RB, RWB, LWB, CB -> DF (Defenders)

Grouping GK -> GK (Goalkeepers)

In [8]:
# Create a mapping of sub positions to roles
position_mapping = {
    #Forwards
    'CF': 'FW',
    'ST': 'FW',
    'LW': 'FW',
    'RW': 'FW',
    
    #Midfielders
    'CM': 'MF',
    'CAM': 'MF',
    'CDM': 'MF',
    'LM': 'MF',
    'RM': 'MF',
    
    #Defenders
    'LB': 'DF',
    'RB': 'DF',
    'RWB': 'DF',
    'LWB': 'DF',
    'CB': 'DF',
    
    #Goalkeepers
    'GK': 'GK'
}

# Apply the mapping to the 'Position' column
df['Grouped_Position'] = df['BestPosition'].map(position_mapping)

print(df)

           ID            Name            FullName  Age  Height  Weight  \
0      158023        L. Messi        Lionel Messi   35     169      67   
1      165153      K. Benzema       Karim Benzema   34     185      81   
2      188545  R. Lewandowski  Robert Lewandowski   33     185      81   
3      192985    K. De Bruyne     Kevin De Bruyne   31     181      70   
4      231747       K. Mbappé       Kylian Mbappé   23     182      73   
...       ...             ...                 ...  ...     ...     ...   
18534  243725      D. Collins      Darren Collins   21     174      68   
18535  261933    Yang Dejiang        Dejiang Yang   17     175      60   
18536  267823       L. Mullan         Liam Mullan   18     170      65   
18537  267824    D. McCallion    Daithí McCallion   17     178      65   
18538  261424        N. Rabha         Nabin Rabha   25     176      66   

                                               PhotoUrl          Nationality  \
0      https://cdn.sofifa.net/p

In [9]:
df.columns

Index(['ID', 'Name', 'FullName', 'Age', 'Height', 'Weight', 'PhotoUrl',
       'Nationality', 'Overall', 'Potential', 'Growth', 'TotalStats',
       'BaseStats', 'Positions', 'BestPosition', 'Club', 'ValueEUR', 'WageEUR',
       'ReleaseClause', 'ClubPosition', 'ContractUntil', 'ClubNumber',
       'ClubJoined', 'OnLoad', 'NationalTeam', 'NationalPosition',
       'NationalNumber', 'PreferredFoot', 'IntReputation', 'WeakFoot',
       'SkillMoves', 'AttackingWorkRate', 'DefensiveWorkRate', 'PaceTotal',
       'ShootingTotal', 'PassingTotal', 'DribblingTotal', 'DefendingTotal',
       'PhysicalityTotal', 'Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
       'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Marking',

In [10]:
#Creating a new dataset only with the necessary attributes

attributes = ['FullName', 'AttackingWorkRate', 'DefensiveWorkRate', 'PaceTotal',
       'ShootingTotal', 'PassingTotal', 'DribblingTotal', 'DefendingTotal',
       'PhysicalityTotal', 'Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
       'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
       'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes', 'Grouped_Position']

fifa_df = df[attributes]
fifa_df

Unnamed: 0,FullName,AttackingWorkRate,DefensiveWorkRate,PaceTotal,ShootingTotal,PassingTotal,DribblingTotal,DefendingTotal,PhysicalityTotal,Crossing,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Grouped_Position
0,Lionel Messi,Low,Low,81,89,90,94,34,64,84,...,96,20,35,24,6,11,15,14,8,MF
1,Karim Benzema,Medium,Medium,80,88,83,87,39,78,75,...,90,43,24,18,13,11,5,5,7,FW
2,Robert Lewandowski,High,Medium,75,91,79,86,44,83,71,...,88,35,42,19,15,6,12,8,10,FW
3,Kevin De Bruyne,High,High,74,88,93,87,64,77,94,...,89,68,65,53,15,13,5,10,13,MF
4,Kylian Mbappé,High,Low,97,89,80,92,36,76,78,...,88,26,34,32,13,5,7,11,6,FW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,Darren Collins,Medium,Medium,68,48,43,51,31,33,40,...,47,39,29,27,6,9,5,13,8,MF
18535,Dejiang Yang,Medium,Medium,55,37,41,47,48,39,34,...,45,46,50,52,6,12,11,8,6,MF
18536,Liam Mullan,High,Medium,64,40,49,52,37,35,41,...,59,39,37,48,11,12,8,7,12,MF
18537,Daithí McCallion,Medium,Medium,52,24,25,32,52,41,21,...,41,50,54,54,8,14,13,7,8,DF


In [11]:
fifa_df['Grouped_Position'].unique() #Returning Unique Positions

array(['MF', 'FW', 'GK', 'DF'], dtype=object)

In [12]:
fifa_df['Grouped_Position'].nunique() #Number of Unique Positions

4

1. Encoding AttackingWorkRate and DefendingWorkRate
2. Scaling all attributes

In [13]:
fifa_df['AttackingWorkRate'].unique()

array(['Low', 'Medium', 'High'], dtype=object)

In [14]:
#Encoding the WorkRate columns

WR_map = {'Low':0, 'Medium': 1, 'High': 2}

fifa_df['AttackingWorkRate'] = fifa_df['AttackingWorkRate'].map(WR_map)

fifa_df['DefensiveWorkRate'] = fifa_df['DefensiveWorkRate'].map(WR_map)

print(fifa_df)

                 FullName  AttackingWorkRate  DefensiveWorkRate  PaceTotal  \
0            Lionel Messi                  0                  0         81   
1           Karim Benzema                  1                  1         80   
2      Robert Lewandowski                  2                  1         75   
3         Kevin De Bruyne                  2                  2         74   
4           Kylian Mbappé                  2                  0         97   
...                   ...                ...                ...        ...   
18534      Darren Collins                  1                  1         68   
18535        Dejiang Yang                  1                  1         55   
18536         Liam Mullan                  2                  1         64   
18537    Daithí McCallion                  1                  1         52   
18538         Nabin Rabha                  1                  1         61   

       ShootingTotal  PassingTotal  DribblingTotal  DefendingTo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fifa_df['AttackingWorkRate'] = fifa_df['AttackingWorkRate'].map(WR_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fifa_df['DefensiveWorkRate'] = fifa_df['DefensiveWorkRate'].map(WR_map)


In [15]:
scaler = MinMaxScaler()

# Select only the numerical columns to be scaled
numerical_columns = fifa_df.select_dtypes(include='number').columns

fifa_df[numerical_columns] = scaler.fit_transform(fifa_df[numerical_columns])
fifa_df = fifa_df.round(decimals=2) #Rounding float values to two decimal points

print(fifa_df)

                 FullName  AttackingWorkRate  DefensiveWorkRate  PaceTotal  \
0            Lionel Messi                0.0                0.0       0.77   
1           Karim Benzema                0.5                0.5       0.75   
2      Robert Lewandowski                1.0                0.5       0.68   
3         Kevin De Bruyne                1.0                1.0       0.67   
4           Kylian Mbappé                1.0                0.0       1.00   
...                   ...                ...                ...        ...   
18534      Darren Collins                0.5                0.5       0.58   
18535        Dejiang Yang                0.5                0.5       0.39   
18536         Liam Mullan                1.0                0.5       0.52   
18537    Daithí McCallion                0.5                0.5       0.35   
18538         Nabin Rabha                0.5                0.5       0.48   

       ShootingTotal  PassingTotal  DribblingTotal  DefendingTo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fifa_df[numerical_columns] = scaler.fit_transform(fifa_df[numerical_columns])


In [16]:
fifa_df

Unnamed: 0,FullName,AttackingWorkRate,DefensiveWorkRate,PaceTotal,ShootingTotal,PassingTotal,DribblingTotal,DefendingTotal,PhysicalityTotal,Crossing,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Grouped_Position
0,Lionel Messi,0.0,0.0,0.77,0.96,0.96,1.00,0.25,0.56,0.89,...,1.00,0.19,0.33,0.21,0.05,0.10,0.14,0.13,0.07,MF
1,Karim Benzema,0.5,0.5,0.75,0.95,0.85,0.89,0.32,0.79,0.78,...,0.93,0.45,0.21,0.14,0.12,0.10,0.03,0.03,0.06,FW
2,Robert Lewandowski,1.0,0.5,0.68,0.99,0.79,0.88,0.38,0.87,0.74,...,0.90,0.36,0.41,0.15,0.15,0.05,0.11,0.07,0.09,FW
3,Kevin De Bruyne,1.0,1.0,0.67,0.95,1.00,0.89,0.64,0.77,1.00,...,0.92,0.73,0.68,0.56,0.15,0.12,0.03,0.09,0.12,MF
4,Kylian Mbappé,1.0,0.0,1.00,0.96,0.81,0.97,0.28,0.75,0.82,...,0.90,0.26,0.32,0.31,0.12,0.03,0.05,0.10,0.05,FW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,Darren Collins,0.5,0.5,0.58,0.42,0.26,0.35,0.21,0.05,0.39,...,0.41,0.40,0.26,0.25,0.05,0.08,0.03,0.12,0.07,MF
18535,Dejiang Yang,0.5,0.5,0.39,0.28,0.24,0.29,0.43,0.15,0.32,...,0.39,0.48,0.51,0.55,0.05,0.11,0.10,0.07,0.05,MF
18536,Liam Mullan,1.0,0.5,0.52,0.32,0.35,0.36,0.29,0.08,0.40,...,0.55,0.40,0.36,0.50,0.10,0.11,0.07,0.06,0.11,MF
18537,Daithí McCallion,0.5,0.5,0.35,0.11,0.00,0.06,0.49,0.18,0.17,...,0.34,0.53,0.55,0.57,0.07,0.14,0.12,0.06,0.07,DF


**Fitting the Model**

In [17]:
# Splitting into x (input features) & y (target values) 
y = fifa_df['Grouped_Position']
x = fifa_df.drop('Grouped_Position', axis=1)
print(x)

                 FullName  AttackingWorkRate  DefensiveWorkRate  PaceTotal  \
0            Lionel Messi                0.0                0.0       0.77   
1           Karim Benzema                0.5                0.5       0.75   
2      Robert Lewandowski                1.0                0.5       0.68   
3         Kevin De Bruyne                1.0                1.0       0.67   
4           Kylian Mbappé                1.0                0.0       1.00   
...                   ...                ...                ...        ...   
18534      Darren Collins                0.5                0.5       0.58   
18535        Dejiang Yang                0.5                0.5       0.39   
18536         Liam Mullan                1.0                0.5       0.52   
18537    Daithí McCallion                0.5                0.5       0.35   
18538         Nabin Rabha                0.5                0.5       0.48   

       ShootingTotal  PassingTotal  DribblingTotal  DefendingTo

In [19]:
#Encoding Positions to O,1,2....

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [20]:
#Splitting into testing and training sets
x_train, x_test, y_train_enc, y_test_enc = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [22]:
k_neighbors = fifa_df['Grouped_Position'].nunique()

# Dropping Names as they have no correlation with Position
x_train_names_dropped = x_train.drop("FullName", axis=1)
x_test_names_dropped = x_test.drop("FullName", axis=1)

# Converting Values to Float in order for KNN to work
x_train_numeric = x_train_names_dropped.values.astype(float)
x_test_numeric = x_test_names_dropped.values.astype(float)

# Getting predicted encoded positions
y_pred_enc = predict_knn(x_train_numeric, y_train_enc, x_test_numeric, k_neighbors)

# Calculate accuracy using the accuracy_score method
acc = accuracy_score(y_test_enc, y_pred_enc)
print(f"Accuracy: {acc:.2f}")

Accuracy: 0.88


In [57]:
# Inverse Label Encode y_pred_enc
y_pred_decoded = label_encoder.inverse_transform(y_pred_enc)
y_test_decoded = label_encoder.inverse_transform(y_test_enc)

# Create a DataFrame with predicted positions and original positions
predicted_positions_df = pd.DataFrame({
    'Predicted_Position': y_pred_decoded,
    'Original_Position': y_test_decoded
})

# Reset the indices of both DataFrames to ensure alignment
x_test.reset_index(drop=True, inplace=True)
predicted_positions_df.reset_index(drop=True, inplace=True)

# Concatenate the DataFrames
x_test_with_predictions = pd.concat([x_test, predicted_positions_df], axis=1)

x_test_with_predictions

Unnamed: 0,FullName,AttackingWorkRate,DefensiveWorkRate,PaceTotal,ShootingTotal,PassingTotal,DribblingTotal,DefendingTotal,PhysicalityTotal,Crossing,...,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Predicted_Position,Original_Position
0,Rafael António Pereira,0.5,0.5,0.62,0.22,0.34,0.44,0.64,0.67,0.52,...,0.70,0.69,0.67,0.06,0.03,0.07,0.06,0.15,DF,DF
1,Jóhann Berg Guðmundsson,0.5,1.0,0.54,0.71,0.72,0.68,0.61,0.59,0.77,...,0.64,0.66,0.58,0.06,0.05,0.13,0.09,0.07,MF,MF
2,James Léa Siliki,1.0,0.5,0.57,0.61,0.66,0.67,0.71,0.66,0.70,...,0.72,0.75,0.79,0.09,0.11,0.11,0.10,0.11,DF,MF
3,Nicolas Cozza,0.5,0.5,0.68,0.33,0.54,0.62,0.79,0.67,0.60,...,0.82,0.80,0.80,0.05,0.09,0.07,0.13,0.12,DF,DF
4,Sandro Ramírez Castillo,1.0,0.5,0.72,0.76,0.62,0.71,0.21,0.49,0.67,...,0.25,0.22,0.21,0.11,0.16,0.09,0.04,0.07,FW,FW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3703,Santiago Martínez,0.0,0.5,0.52,0.45,0.53,0.50,0.63,0.69,0.50,...,0.67,0.67,0.65,0.05,0.10,0.11,0.09,0.15,MF,MF
3704,Dominik Piła,0.5,0.5,0.78,0.57,0.44,0.53,0.11,0.34,0.58,...,0.22,0.18,0.18,0.12,0.05,0.13,0.12,0.09,MF,MF
3705,Charles Vanhoutte,0.5,0.5,0.49,0.50,0.53,0.61,0.62,0.70,0.41,...,0.65,0.68,0.65,0.14,0.11,0.10,0.06,0.07,MF,MF
3706,Nélson Miguel Castro Oliveira,0.5,0.0,0.58,0.75,0.57,0.65,0.29,0.72,0.66,...,0.27,0.39,0.27,0.07,0.06,0.05,0.06,0.09,FW,FW


In [58]:
x_test_with_predictions[x_test_with_predictions["FullName"] == "Kevin De Bruyne"]

Unnamed: 0,FullName,AttackingWorkRate,DefensiveWorkRate,PaceTotal,ShootingTotal,PassingTotal,DribblingTotal,DefendingTotal,PhysicalityTotal,Crossing,...,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Predicted_Position,Original_Position
2040,Kevin De Bruyne,1.0,1.0,0.67,0.95,1.0,0.89,0.64,0.77,1.0,...,0.73,0.68,0.56,0.15,0.12,0.03,0.09,0.12,MF,MF
