# Basic Data PreProcessing

In [1]:
import pandas as pd

# Load the datasets
sc_data = pd.read_csv('matches-modifiedd.csv')
player_mvp_stats = pd.read_csv('Soccer_Players_MVP.csv')


In [2]:
team_mapping = {
    'ARS': 'ARS', 'AVL': 'AVL', 'BOU': 'BOU', 'BRE': 'BRE',
    'BHA': 'BHA', 'CHE': 'CHE', 'CRY': 'CRY', 'EVE': 'EVE',
    'FUL': 'FUL', 'LEE': 'LEE', 'LEI': 'LEI', 'LIV': 'LIV',
    'MCI': 'MCI', 'MUN': 'MUN', 'NEW': 'NEW', 'NFO': 'NFO',
    'SOU': 'SOU', 'TOT': 'TOT', 'WHU': 'WHU', 'WOL': 'WOL'
}



In [3]:
player_mvp_stats['Tm'] = player_mvp_stats['Tm'].map(team_mapping)


In [4]:
combined_data = pd.merge(sc_data, player_mvp_stats, left_on='team', right_on='Tm', how='inner')


In [5]:
combined_data.head()


Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,Player,Team,Tm,Rating,PAC,SHO,PAS,DRI,DEF,PHY
0,1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,...,Kevin De Bruyne,Manchester City,MCI,91,74,88,93,87,64,77
1,1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,...,Ederson,Manchester City,MCI,89,87,82,88,64,88,88
2,1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,...,Erling Haaland,Manchester City,MCI,88,89,80,49,87,87,87
3,1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,...,Joao Cancelo,Manchester City,MCI,88,85,73,85,85,81,73
4,1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,...,Ruben Dias,Manchester City,MCI,88,63,66,68,88,88,88


In [6]:
# Select and display only the 'Player' and 'Tm' columns
player_and_team_data = combined_data[['Player', 'Tm']]
print(player_and_team_data)


                Player   Tm
0      Kevin De Bruyne  MCI
1              Ederson  MCI
2       Erling Haaland  MCI
3         Joao Cancelo  MCI
4           Ruben Dias  MCI
...                ...  ...
15150             Neto  BOU
15151             Neto  BOU
15152             Neto  BOU
15153             Neto  BOU
15154             Neto  BOU

[15155 rows x 2 columns]


In [7]:
# Select and display only the 'team' and 'team_opp' columns
team_and_team_opp_data = combined_data[['team', 'team_opp']]
print(team_and_team_opp_data)


      team        team_opp
0      MCI        West Ham
1      MCI        West Ham
2      MCI        West Ham
3      MCI        West Ham
4      MCI        West Ham
...    ...             ...
15150  BOU             TOT
15151  BOU  Leicester City
15152  BOU             MCI
15153  BOU     Southampton
15154  BOU             EVE

[15155 rows x 2 columns]


# Adding Weights

In [8]:
# Get all player names

unique_players = combined_data['Player'].unique()

pd.set_option('display.max_rows', None)

print(unique_players)

pd.reset_option('display.max_rows')


['Kevin De Bruyne' 'Ederson' 'Erling Haaland' 'Joao Cancelo' 'Ruben Dias'
 'Bernardo Silva' 'Rodri' 'Aymeric Laporte' 'Riyad Mahrez'
 'İlkay Gundogan' 'Phil Foden' 'Kyle Walker' 'Jack Grealish' 'John Stones'
 'Kalvin Phillips' 'Manuel Akanji' 'Martin Odegaard' 'Thomas Partey'
 'Gabriel Jesus' 'Aaron Ramsdale' 'Bukayo Saka' 'Gabriel' 'Kieran Tierney'
 'Emile Smith Rowe' 'Cristiano Ronaldo' 'Casemiro' 'De Gea'
 'Bruno Fernandes' 'Raphael Varane' 'Antony' 'Christian Eriksen'
 'Harry Maguire' 'Lisandro Martinez' 'Marcus Rashford' 'Aaron Wan-Bissaka'
 'Anthony Martial' 'Fred' 'Kieran Trippier' 'ALLAN SAINT-MAXIMIN'
 'Bruno Guimaraes' 'Nick Pope' 'Alexander Isak' 'Heung Min Son'
 'Harry Kane' 'Hugo Lloris' 'Ivan Perisic' 'Cristian Romero'
 'Pierre-Emile Hojbjerg' 'Dejan Kulusevski' 'Richarlison' 'Yves Bissouma'
 'Eric Dier' 'Palhinha' 'Bernd Leno' 'Mohamed Salah' 'Virgil van Dijk'
 'Alisson' 'Fabinho' 'Andrew Robertson' 'Trent Alexander-Arnold' 'Thiago'
 'Diogo Jota' 'JOEL MATIP' 'Luis Diaz'

In [9]:
# Sample mapping of player names to their corresponding weights
player_weights = {
    'Kevin De Bruyne': 0.91,
    'Mohamed Salah': 0.90,
    'Virgil van Dijk': 0.90,
    'Cristiano Ronaldo': 0.90,
    'Heung Min Son': 0.89,
    'Casemiro': 0.89,
    'Alisson': 0.89,
    'Harry Kane': 0.89,
    'Ederson': 0.89,
    "N'Golo Kante": 0.89,
    'Erling Haaland': 0.88,
    'Joao Cancelo': 0.88,
    'Ruben Dias': 0.88,
    'Bernardo Silva': 0.88,
    'Fabinho': 0.87,
    'Rodri': 0.87,
    'Andrew Robertson': 0.87,
    'Kalidou Koulibaly': 0.87,
    'Trent Alexander-Arnold': 0.87,
    'Hugo Lloris': 0.87,
    'De Gea': 0.87,
    'Edouard Mendy': 0.86,
    'Aymeric Laporte': 0.86,
    'Raheem Sterling': 0.86,
    'Riyad Mahrez': 0.86,
    'Thiago': 0.86,
    'Thiago Silva': 0.86,
    'Bruno Fernandes': 0.86,
    'İlkay Gundogan': 0.85,
    'Phil Foden': 0.85,
    'Diogo Jota': 0.85,
    'Jorginho': 0.85,
    'Kyle Walker': 0.85,
    'Jamie Vardy': 0.85,
    'Pierre-Emerick Aubameyang': 0.85,
    'Martin Odegaard': 0.84,
    'Mason Mount': 0.84,
    'Declan Rice': 0.84,
    'Jack Grealish': 0.84,
    'Ivan Perisic': 0.84,
    'JOEL MATIP': 0.84,
    'Kai Havertz': 0.84,
    'Kieran Trippier': 0.84,
    'Luis Diaz': 0.84,
    'Mateo Kovacic': 0.84,
    'Raphael Varane': 0.84,
    'Reece James': 0.84,
    'Thomas Partey': 0.84,
    'Wilfred Ndidi': 0.84,
    'Youri Tielemans': 0.84,
    'Cristian Romero': 0.83,
    'Diego Carlos': 0.83,
    'Gabriel Jesus': 0.83,
    'Hakim Ziyech': 0.83,
    'John Stones': 0.83,
    'Jordan Henderson': 0.83,
    'Pierre-Emile Hojbjerg': 0.83,
    'Ricardo Pereira': 0.83,
    'Roberto Firmino': 0.83,
    'Ruben Neves': 0.83,
    'Aaron Ramsdale': 0.82,
    'Antony': 0.82,
    'Ben Chilwell': 0.82,
    'Bukayo Saka': 0.82,
    'Azpilicueta': 0.82,
    'Christian Eriksen': 0.82,
    'Christian Pulisic': 0.82,
    'Darwin Nunez': 0.82,
    'Goncalo Guedes': 0.82,
    'Idrissa Gueye': 0.82,
    'James Maddison': 0.82,
    'Jordan Pickford': 0.82,
    'Lucas Digne': 0.82,
    'Lucas Paqueta': 0.82,
    'Łukasz Fabianski': 0.82,
    'Coutinho': 0.82,
    'Raul Jimenez': 0.82,
    'Wilfried Zaha': 0.82,
    'ALLAN SAINT-MAXIMIN': 0.81,
    'Alphonse Areola': 0.81,
    'Bruno Guimaraes': 0.81,
    'Dejan Kulusevski': 81,
    'Denis Zakaria': 81,
    'Gabriel': 81,
    'Harry Maguire': 81,
    'Ibrahima Konate': 81,
    'James Ward-Prowse': 81,
    'Joao Moutinho': 81,
    'Palhinha': 81,
    'Joe Gomez': 81,
    'Jose Sa': 81,
    'Kalvin Phillips': 81,
    'Kieran Tierney': 81,
    'Lisandro Martinez': 81,
    'Manuel Akanji': 81,
    'Cucurella': 81,
    'Marcus Rashford': 81,
    'Naby Keita': 81,
    'Nick Pope': 81,
    'Neto': 81,
    'Richarlison': 81,
    'Tomas Soucek': 81,
    'Yves Bissouma': 81,
    'Aaron Wan-Bissaka': 80,
    'Alexander Isak': 80,
    'Allan': 80,
    'Anthony Martial': 80,
    'Arthur': 80,
    'Bernd Leno': 80,
    'Boubacar Kamara': 80,
    'Dominic Calvert-Lewin': 80,
    'Emile Smith Rowe': 80,
    'Emiliano Buendia': 80,
    'Eric Dier': 80,
    'Fred': 80,
    'Gianluca Scamacca': 80,
    'Harvey Barnes': 80,
    'Jarrod Bowen': 80
}

# Add player weights to combined_data
combined_data['player_weight'] = combined_data['Player'].map(player_weights)


In [10]:
default_weight = 0.5

combined_data['player_weight'] = combined_data['Player'].map(player_weights).fillna(default_weight)

print(combined_data[['Player', 'player_weight']])


                Player  player_weight
0      Kevin De Bruyne           0.91
1              Ederson           0.89
2       Erling Haaland           0.88
3         Joao Cancelo           0.88
4           Ruben Dias           0.88
...                ...            ...
15150             Neto          81.00
15151             Neto          81.00
15152             Neto          81.00
15153             Neto          81.00
15154             Neto          81.00

[15155 rows x 2 columns]


In [11]:
team_weights = {
  'gf': 1.0,
  'sh': 0.7,
  'sot': 0.8,
  'poss': 0.6,
  'xg': 0.9,
  'fk': 0.3,
  'pk': 0.8,
  'pkatt': 0.2,
}



for col, weight in team_weights.items():
  combined_data[col] = combined_data[col] * weight

print(combined_data.head())


   Unnamed: 0        date   time            comp        round  day venue  \
0           1  2022-08-07  16:30  Premier League  Matchweek 1  Sun  Away   
1           1  2022-08-07  16:30  Premier League  Matchweek 1  Sun  Away   
2           1  2022-08-07  16:30  Premier League  Matchweek 1  Sun  Away   
3           1  2022-08-07  16:30  Premier League  Matchweek 1  Sun  Away   
4           1  2022-08-07  16:30  Premier League  Matchweek 1  Sun  Away   

  result   gf  ga  ...             Team   Tm  Rating  PAC  SHO PAS DRI DEF  \
0      W  2.0   0  ...  Manchester City  MCI      91   74   88  93  87  64   
1      W  2.0   0  ...  Manchester City  MCI      89   87   82  88  64  88   
2      W  2.0   0  ...  Manchester City  MCI      88   89   80  49  87  87   
3      W  2.0   0  ...  Manchester City  MCI      88   85   73  85  85  81   
4      W  2.0   0  ...  Manchester City  MCI      88   63   66  68  88  88   

  PHY  player_weight  
0  77           0.91  
1  88           0.89  
2  87

In [12]:
print(combined_data['result'])


0        W
1        W
2        W
3        W
4        W
        ..
15150    L
15151    W
15152    L
15153    L
15154    W
Name: result, Length: 15155, dtype: object


In [13]:
combined_data['result'] = combined_data['result'].map({'W': 1, 'L': 0})


In [14]:
#combined_data['result_binary'] = combined_data['result'].map({'W': 1, 'L': 0})
#y = combined_data['result_binary']


In [15]:
combined_data['result'] = combined_data['result'].astype(int)

In [16]:
print(combined_data['result'])


0        1
1        1
2        1
3        1
4        1
        ..
15150    0
15151    1
15152    0
15153    0
15154    1
Name: result, Length: 15155, dtype: int32


# Neural Network Feedforward Functional API

In [17]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Extract features and target variable
X = combined_data[['gf', 'sh', 'sot', 'poss', 'xg', 'fk', 'pk', 'pkatt', 'player_weight']]
y = combined_data['result']  # Use the 'won' column as the target variable

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Scale the features
X_test_scaled = scaler.transform(X_test)

# Define the model architecture using the Functional API
num_features = X_train_scaled.shape[1]

input_layer = tf.keras.layers.Input(shape=(num_features,))
hidden1 = tf.keras.layers.Dense(128, activation='relu')(input_layer)
dropout1 = tf.keras.layers.Dropout(0.3)(hidden1)
hidden2 = tf.keras.layers.Dense(64, activation='relu')(dropout1)
dropout2 = tf.keras.layers.Dropout(0.3)(hidden2)
hidden3 = tf.keras.layers.Dense(32, activation='relu')(dropout2)
player_weight_input = tf.keras.layers.Input(shape=(1,))
merged = tf.keras.layers.Concatenate()([hidden3, player_weight_input])
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

model = tf.keras.models.Model(inputs=[input_layer, player_weight_input], outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
model.fit([X_train_scaled, X_train[['player_weight']].values], y_train,
          epochs=10, batch_size=32,
          validation_data=([X_test_scaled, X_test[['player_weight']].values], y_test))

# Prediction
# For new predictions, you would preprocess the new data (let's call it new_data) in the same way as X_train
# Example:
# new_data_scaled = scaler.transform(new_data_features)
# predictions = model.predict([new_data_scaled, new_data_player_weight])


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 9)]                  0         []                            
                                                                                                  
 dense (Dense)               (None, 128)                  1280      ['input_1[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 128)                  0         ['dense[0][0]']               
                                                                                                  
 dense_1 (Dense)             (None, 64)                   8256      ['dropout[0][0]']             
                                                                                              

<keras.src.callbacks.History at 0x266bb82a210>

In [18]:
import numpy as np
from sklearn.preprocessing import StandardScaler

In [19]:
print(combined_data['team'].unique())
print(combined_data['team_opp'].unique())


['MCI' 'ARS' 'MUN' 'NEW' 'TOT' 'FUL' 'LIV' 'CHE' 'AVL' 'CRY' 'WOL' 'WHU'
 'EVE' 'BOU']
['West Ham' 'BOU' 'NEW' 'CRY' 'NFO' 'AVL' 'WOL' 'MUN' 'Southampton' 'LIV'
 'BRI' 'Leicester City' 'FUL' 'BRE' 'Leeds United' 'EVE' 'CHE' 'TOT' 'ARS'
 'Norwich City' 'BUR' 'Watford' 'SHU' 'West Brom' 'MCI']


Team Mapping to allow for Code Calling

In [22]:
# Function to retrieve team data
def get_team_data(team_name, combined_data):
    # Filter the DataFrame for rows matching the team name
    team_data_filtered = combined_data[(combined_data['team'] == team_name) | (combined_data['team_opp'] == team_name)]

    if team_data_filtered.empty:
        raise ValueError(f"No data found for team: {team_name}")

    numeric_columns = ['gf', 'sh', 'sot', 'poss', 'xg', 'fk', 'pk', 'pkatt', 'player_weight']
    team_data_numeric = team_data_filtered[numeric_columns]
    team_data_mean = team_data_numeric.mean()
    average_player_weight = combined_data[combined_data['Tm'] == team_name]['player_weight'].mean()
    return team_data_mean, average_player_weight

# Function to predict game outcome
def predict_game_outcome(team1_name, team2_name, combined_data, scaler, model):
    team1_data, team1_player_weight = get_team_data(team1_name, combined_data)
    team2_data, team2_player_weight = get_team_data(team2_name, combined_data)

    # Prepare features for scaling
    features_team1 = np.array([team1_data])
    features_team2 = np.array([team2_data])

    # Scale features
    features_team1_scaled = scaler.transform(features_team1)
    features_team2_scaled = scaler.transform(features_team2)

    # Reshape player weight for input
    team1_weight = np.array([[team1_player_weight]])
    team2_weight = np.array([[team2_player_weight]])

    # Predict using the model
    prediction_team1 = model.predict([features_team1_scaled, team1_weight])
    prediction_team2 = model.predict([features_team2_scaled, team2_weight])
    
    # Interpret the prediction for team 1
    if prediction_team1[0] > 0.5:
        return f"{team1_name} is predicted to win against {team2_name}"
    else:
        return f"{team2_name} is predicted to win against {team1_name}"


# User interaction and prediction call
team1 = input("Enter team one: ")
team2 = input("Enter team two: ")
prediction = predict_game_outcome(team1, team2, combined_data, scaler, model)
print(prediction)

def get_team_data(team_name, combined_data):
    # Filter the DataFrame for rows matching the team name
    team_data_filtered = combined_data[(combined_data['team'] == team_name) | (combined_data['team_opp'] == team_name)]

    if team_data_filtered.empty:
        raise ValueError(f"No data found for team: {team_name}")

    numeric_columns = ['gf', 'sh', 'sot', 'poss', 'xg', 'fk', 'pk', 'pkatt', 'player_weight']
    team_data_numeric = team_data_filtered[numeric_columns]
    team_data_mean = team_data_numeric.mean()
    average_player_weight = combined_data[combined_data['Tm'] == team_name]['player_weight'].mean()
    return team_data_mean, average_player_weight

# Function to predict game outcome
def predict_game_outcome(team1_name, team2_name, combined_data, scaler, model):
    team1_data, team1_player_weight = get_team_data(team1_name, combined_data)
    team2_data, team2_player_weight = get_team_data(team2_name, combined_data)

    features_team1 = [team1_data['gf'], team1_data['sh'], team1_data['sot'],
                      team1_data['poss'], team1_data['xg'], team1_data['fk'],
                      team1_data['pk'], team1_data['pkatt'], team1_player_weight]

    features_team2 = [team2_data['gf'], team2_data['sh'], team2_data['sot'],
                      team2_data['poss'], team2_data['xg'], team2_data['fk'],
                      team2_data['pk'], team2_data['pkatt'], team2_player_weight]

    input_features = np.array([features_team1, features_team2])
    input_features_scaled = scaler.transform(input_features)

    predictions = model.predict(input_features_scaled)

    # Interpret the prediction for team 1
    if predictions[0] > 0.5:
        return f"{team1_name} is predicted to win against {team2_name}"
    else:
        return f"{team2_name} is predicted to win against {team1_name}"


# User interaction and prediction call
team1 = input("Enter team one: ")
team2 = input("Enter team two: ")
prediction = predict_game_outcome(team1, team2, combined_data, scaler, model)
print(prediction)





LIV is predicted to win against MCI
