# JM0250 Data Visualization 
### Academic year 2022-2023

## FIFA World Cup 2022 Data Exploration
Data sources:

- FIFA World Cup 2022 Player Data (https://www.kaggle.com/datasets/swaptr/fifa-world-cup-2022-player-data)
- FIFA World Cup 2022 Match Data (https://www.kaggle.com/datasets/swaptr/fifa-world-cup-2022-match-data)
- FIFA World Cup 2022 Team Data (https://www.kaggle.com/datasets/swaptr/fifa-world-cup-2022-statistics)
- FIFA World Cup 2022 Twitter Dataset (https://www.kaggle.com/datasets/kumari2000/fifa-world-cup-twitter-dataset-2022)
- FIFA World Cup 2022 Prediction (https://www.kaggle.com/datasets/shilongzhuang/soccer-world-cup-challenge)
- FIFA World Cup 2022 Player Images (https://www.kaggle.com/datasets/soumendraprasad/fifa-2022-all-players-image-dataset)
- FIFA World Cup Historic (https://www.kaggle.com/datasets/piterfm/fifa-football-world-cup)
- FIFA World Cup Penalty Shootouts (https://www.kaggle.com/datasets/pablollanderos33/world-cup-penalty-shootouts, https://www.kaggle.com/datasets/jandimovski/world-cup-penalty-shootouts-2022)

Data dictionaries and additional info can be found in the respective data folders.

In [1]:
# Import libraries
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import os
import pprint as pp
# NOTE(review): this PrettyPrinter instance is created and immediately discarded;
# the pp.pprint(...) calls further down go through the module-level function,
# so indent=6 is presumably not applied anywhere -- confirm and remove or bind it.
pp.PrettyPrinter(indent=6)

# Do not truncate tables
pd.set_option('display.max_columns', None)

In [2]:
# Load the data
# All paths are relative ('../Data/...'), so they resolve against the
# notebook's current working directory.

# Match data
df_match_data = pd.read_csv('../Data/FIFA World Cup 2022 Match Data/data.csv', delimiter=',')

# Player data (one CSV per statistics category)
df_player_defense       = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_defense.csv', delimiter=',')
df_player_gca           = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_gca.csv', delimiter=',')
df_player_keepers       = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_keepers.csv', delimiter=',')
df_player_keepersadv    = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_keepersadv.csv', delimiter=',')
df_player_misc          = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_misc.csv', delimiter=',')
df_player_passing       = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_passing.csv', delimiter=',')
df_player_passing_types = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_passing_types.csv', delimiter=',')
df_player_playingtime   = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_playingtime.csv', delimiter=',')
df_player_possession    = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_possession.csv', delimiter=',')
df_player_shooting      = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_shooting.csv', delimiter=',')
df_player_stats         = pd.read_csv('../Data/FIFA World Cup 2022 Player Data/player_stats.csv', delimiter=',')

# Team data
df_team_data        = pd.read_csv('../Data/FIFA World Cup 2022 Team Data/team_data.csv', delimiter=',')
df_team_group_stats = pd.read_csv('../Data/FIFA World Cup 2022 Team Data/group_stats.csv', delimiter=',')

# Historic data
df_historic_fifa_ranking      = pd.read_csv('../Data/FIFA World Cup Historic/fifa_ranking_2022-10-06.csv', delimiter=',')
df_historic_matches_1930_2022 = pd.read_csv('../Data/FIFA World Cup Historic/matches_1930_2022.csv', delimiter=',')
df_historic_world_cup         = pd.read_csv('../Data/FIFA World Cup Historic/world_cup.csv', delimiter=',')

# Penalty shootouts
df_penalty_shootouts = pd.read_csv('../Data/FIFA World Cup Penalty Shootouts/WorldCupShootouts.csv', delimiter=',')

# Twitter data (these two files are read with ';' as delimiter, unlike the others)
df_tweets_01 = pd.read_csv('../Data/FIFA World Cup 2022 Twitter Dataset/tweets1.csv', delimiter=';')
df_tweets_02 = pd.read_csv('../Data/FIFA World Cup 2022 Twitter Dataset/tweets2.csv', delimiter=';')
# NOTE(review): concat is called without ignore_index, so each file keeps its
# own row labels in the combined frame -- confirm whether repeated labels matter.
df_tweets = pd.concat([df_tweets_01, df_tweets_02])

# Prediction data
df_prediction_groups  = pd.read_csv('../Data/FIFA World Cup 2022 Prediction/2022_world_cup_groups.csv', delimiter=',')
df_prediction_matches = pd.read_csv('../Data/FIFA World Cup 2022 Prediction/2022_world_cup_matches.csv', delimiter=',')
df_prediction_international_matches = pd.read_csv('../Data/FIFA World Cup 2022 Prediction/international_matches.csv', delimiter=',')
df_prediction_world_cup_matches = pd.read_csv('../Data/FIFA World Cup 2022 Prediction/world_cup_matches.csv', delimiter=',')
df_prediction_world_cups = pd.read_csv('../Data/FIFA World Cup 2022 Prediction/world_cups.csv', delimiter=',')

# Player images
def list_full_paths(directory):
    """Return the entries of *directory*, each joined onto the directory path.

    Entries come back in whatever order ``os.listdir`` yields them.
    """
    full_paths = []
    for entry_name in os.listdir(directory):
        full_paths.append(os.path.join(directory, entry_name))
    return full_paths

def img_reshape(img):
    """Open the image at *img*, convert it to RGB, resize it to 300x300 and
    return the pixels as a NumPy array."""
    rgb_image = Image.open(img).convert('RGB')
    return np.asarray(rgb_image.resize((300, 300)))

def showImages(group, land, player):
    """Draw up to 25 images of one player in a 5x5 grid.

    The image folder is built from the three arguments:
    ``.../Group <group>/<land> Players/Images_<player>``.

    Parameters:
        group:  group letter used in the folder name.
        land:   country name used in the folder name.
        player: player name used in the folder name.

    Only the first ``rows * cols`` files listed in the folder are decoded,
    because the grid cannot show more than that. Every cell has its axis
    switched off, so cells without an image stay blank instead of showing an
    empty ticked frame.
    """
    rows = 5
    cols = 5

    images = list_full_paths('../Data/FIFA World Cup 2022 Player Images/Images/Images/Group ' + group + '/' + land + ' Players/Images_' + player)
    # Decode only as many files as the grid has cells.
    img_arr = [img_reshape(image) for image in images[:rows * cols]]

    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(5, 5))

    # axes.flat walks the grid row by row, the same order as a nested i/j loop.
    for ax in axes.flat:
        ax.axis('off')
    for ax, img in zip(axes.flat, img_arr):
        ax.imshow(img)

    plt.subplots_adjust(wspace=0, hspace=0)
    

In [3]:
def null_description(df):
    """Pretty-print, for every column of *df*, the number of missing values
    (key ``<col>``) and that number divided by the row count
    (key ``<col>_null_ratio``)."""
    summary = {}
    row_count = len(df)
    for column in df.columns:
        missing = df[column].isna().sum()
        summary[column] = missing
        summary[f"{column}_null_ratio"] = missing / row_count
    pp.pprint(summary)

def name_of_null(df):
    """Pretty-print, for every column, the set of players with a missing value.

    *df* must have a ``'player'`` column. The printed dict maps
    ``<col>_null_players`` to the set of ``'player'`` values taken from the
    rows where ``<col>`` is null.

    Rows are selected with a positional boolean mask rather than by index
    label, so a frame whose index labels repeat (for example after
    ``pd.concat`` without ``ignore_index``) does not pull in rows that merely
    share a label with a null row.
    """
    data = {}
    for col in df.columns:
        # Plain NumPy mask: no index alignment, works with any index.
        missing = df[col].isnull().to_numpy()
        data[f"{col}_null_players"] = set(df.loc[missing, 'player'].values)
    pp.pprint(data)

def types(df):
    """Pretty-print the Python type of the first row's value in every column.

    The printed dict maps ``<col>_type`` to ``type(value)`` of the value in
    the first row. The first row is taken by position, so the frame does not
    need an index label ``0``. For a frame without rows there is no value to
    inspect and ``None`` is reported instead.
    """
    data = {}
    for col in df.columns:
        column = df[col]
        data[f"{col}_type"] = type(column.iloc[0]) if len(column) else None
    pp.pprint(data)

In [4]:
# Table names, listed in the same order as the frames in ``df_list``.
l = [
    "df_player_defense",
    "df_player_gca",
    "df_player_keepers",
    "df_player_keepersadv",
    "df_player_misc",
    "df_player_passing",
    "df_player_passing_types",
    "df_player_playingtime",
    "df_player_possession",
    "df_player_shooting",
    "df_player_stats",
]
df_list = [
    df_player_defense,
    df_player_gca,
    df_player_keepers,
    df_player_keepersadv,
    df_player_misc,
    df_player_passing,
    df_player_passing_types,
    df_player_playingtime,
    df_player_possession,
    df_player_shooting,
    df_player_stats,
]

# Print each table's name, then the per-column value types of that table.
for i, (table_name, df) in enumerate(zip(l, df_list)):
    print(table_name)
    types(df)
    

df_player_defense
{'age_type': <class 'str'>,
 'birth_year_type': <class 'numpy.int64'>,
 'blocked_passes_type': <class 'numpy.float64'>,
 'blocked_shots_type': <class 'numpy.float64'>,
 'blocks_type': <class 'numpy.float64'>,
 'clearances_type': <class 'numpy.float64'>,
 'dribble_tackles_pct_type': <class 'numpy.float64'>,
 'dribble_tackles_type': <class 'numpy.float64'>,
 'dribbled_past_type': <class 'numpy.float64'>,
 'dribbles_vs_type': <class 'numpy.float64'>,
 'errors_type': <class 'numpy.float64'>,
 'interceptions_type': <class 'numpy.int64'>,
 'minutes_90s_type': <class 'numpy.float64'>,
 'player_type': <class 'str'>,
 'position_type': <class 'str'>,
 'tackles_att_3rd_type': <class 'numpy.float64'>,
 'tackles_def_3rd_type': <class 'numpy.float64'>,
 'tackles_interceptions_type': <class 'numpy.float64'>,
 'tackles_mid_3rd_type': <class 'numpy.float64'>,
 'tackles_type': <class 'numpy.float64'>,
 'tackles_won_type': <class 'numpy.int64'>,
 'team_type': <class 'str'>}
df_player_gc

## Preprocessing

In [5]:
import re

In [6]:
# Cast every score to str so the regex match in filter_val always receives a string.
df_match_data['score'] = df_match_data['score'].astype(str)
# Optional "(n) " shoot-out prefix, "<home><dash><away>", optional " (n)" suffix.
# The separator may be the en dash (U+2013, chr(8211)) or the ASCII hyphen
# (chr(45)). A raw string keeps the backslashes as regex escapes instead of
# (invalid) Python string escapes.
_SCORE_PATTERN = re.compile(
    r"(?:\((?P<penalty_home>\d+)\) )?"
    r"(?P<home>\d+)[\u2013-](?P<away>\d+)"
    r"(?: \((?P<penalty_away>\d+)\))?"
)


def filter_val(score_string):
    """Split a score string into its numeric parts.

    Accepted shapes are ``"1-3"`` and ``"(1) 2-3 (4)"``, where the bracketed
    numbers are the shoot-out scores and the dash is either an en dash or a
    plain hyphen. Matching is anchored at the start of the string only.

    Parameters:
        score_string: the score text to parse.

    Returns:
        ``(score_penalty_home, score_home, score_away, score_penalty_away)``.
        The two regular scores are ints; each shoot-out score is an int when
        present and ``None`` otherwise. When the string does not match,
        ``"No match founds"`` is printed and ``None`` is returned.
    """
    match = _SCORE_PATTERN.match(score_string)
    if match is None:
        print("No match founds")
        return None

    def _optional_int(text):
        # Shoot-out groups are absent (None) for matches decided in play.
        return None if text is None else int(text)

    return (
        _optional_int(match.group("penalty_home")),
        int(match.group("home")),
        int(match.group("away")),
        _optional_int(match.group("penalty_away")),
    )

In [7]:
# Row count of the player defense table.
len(df_player_defense)

680

In [9]:
# First and last kick-off time. The values are strings (see the quoted output),
# so min/max compare them as text.
df_match_data['match_time'].min(), df_match_data['match_time'].max()

('2022-11-20 19:00:00', '2022-12-18 18:00:00')

In [8]:
# NOTE(review): the result of describe() is neither assigned nor displayed here,
# because it is not the last expression of the cell.
df_match_data.describe()

# Columns that receive the four values returned by filter_val, in that order.
new_columns = ['score_penalty_home', 'score_home', 'score_away', 'score_penalty_away']
# Parse the score of each row and write the parts into the new columns.
# `index` is used as a row *label* with .loc -- assumes the default 0..n-1
# index produced by read_csv; TODO confirm.
index = 0
while index < len(df_match_data):
    # NOTE(review): filter_val returns None when the score does not match its
    # pattern, in which case this unpacking raises TypeError.
    penalty_home, home, away, penalty_away = filter_val(df_match_data.loc[index, 'score'])
    df_match_data.loc[index, new_columns] = penalty_home, home, away, penalty_away
    index += 1

# Keep only the matches whose regular home and away scores differ.
df_non_penalty_matches = df_match_data[df_match_data['score_home'] != df_match_data['score_away']]
display(df_non_penalty_matches.head(1))

# Home possession against home goals, coloured by venue and sized by attendance.
fig = px.scatter(df_non_penalty_matches, x="home_possession", y="score_home",
                 color="venue", size="attendance",
                 width=1000, height=800)
fig.show()



Unnamed: 0,match,dayofweek,match_time,home_team,away_team,home_xg,away_xg,score,attendance,venue,referee,home_formation,away_formation,home_captain,away_captain,home_manager,away_manager,home_possession,away_possession,home_completed_passes,home_attempted_pases,away_completed_passes,away_attempted_pases,home_sot,away_sot,home_total_shots,away_total_shots,home_saves,away_saves,home_fouls,away_fouls,home_corners,away_corners,home_crosses,away_crosses,home_touches,away_touches,home_tackles,away_tackles,home_interceptions,away_interceptions,home_aerials_won,away_aerials_won,home_clearances,away_clearances,home_offsides,away_offsides,home_gks,away_gks,home_throw_ins,away_throw_ins,home_long_balls,away_long_balls,score_penalty_home,score_home,score_away,score_penalty_away
0,1,Sun,2022-11-20 19:00:00,Qatar,Ecuador,0.3,1.2,0–2,67372,Al Bayt Stadium,Daniele Orsato,5-3-2,4-4-2,Hassan Al-Haydos,Enner Valencia,Félix Sánchez,Gustavo Alfaro,47,53,377,469,430,522,0,2,5,5,1,0,15,15,1,3,8,14,549,601,10,14,2,12,14,13,18,7,3,4,4,7,20,17,51,70,,0.0,2.0,


In [None]:
# Print a heading and then the column index of every player table, in the
# order given here. The headings are printed verbatim.
for _heading, _frame in (
    ("Defence", df_player_defense),
    ("gca", df_player_gca),
    ("keepers", df_player_keepers),
    ("keepersadv", df_player_keepersadv),
    ("misc", df_player_misc),
    ("passing head", df_player_passing),
    ("passing type", df_player_passing_types),
    ("playingtime", df_player_playingtime),
    ("possession", df_player_possession),
    ("shooting", df_player_shooting),
    ("stats", df_player_stats),
):
    print(_heading)
    display(_frame.columns)


In [None]:
# Penalty shootouts: preview the first row
display(df_penalty_shootouts.head(1))

In [None]:
# Team data: preview the first row of the team table and of the group table
display(df_team_data.head(1))
display(df_team_group_stats.head(1))

In [None]:
# Historic data: first five ranking rows, then the first row of the
# matches table and of the world cup table
display(df_historic_fifa_ranking.head(5))
display(df_historic_matches_1930_2022.head(1))
display(df_historic_world_cup.head(1))

In [None]:
# Draw the image grid for one player; the arguments are the group letter,
# country and player name that showImages puts into the image folder path.
showImages('A', 'Netherlands', 'Matthijs de Ligt')

In [None]:
# Distinct values of the 'Tweet Posted Time' column in the combined tweet frame.
df_tweets['Tweet Posted Time'].unique()

In [None]:
# Show the historic matches table (rendered as the cell's last expression).
df_historic_matches_1930_2022

In [None]:
# Prediction data: preview the head of each of the five tables.
display(df_prediction_groups.head())
display(df_prediction_matches.head())
display(df_prediction_international_matches.head())
display(df_prediction_world_cup_matches.head())
display(df_prediction_world_cups.head())

# Multiprocessing sample code

In [10]:
from multiprocessing import Pool, Process
import os

def info(title):
    """Print *title*, then the module name, the parent process id and the
    current process id, one per line."""
    report = (
        title,
        f"module name: {__name__}",
        f"parent process: {os.getppid()}",
        f"process id: {os.getpid()}",
    )
    for line in report:
        print(line)
def name_f(name):
    """Print a one-line greeting for *name*."""
    greeting = f'hello {name}'
    print(greeting)

def f(x):
    """Return *x* multiplied by itself."""
    squared = x * x
    return squared

# Run the demo only when this module is the entry point.
if __name__=='__main__':
    # Map f over the three inputs with a pool of 5 worker processes and print the results.
    with Pool(5) as p:
        print(p.map(f,[1,2,3]))
    
    # `p` is rebound here: start one child process that runs name_f('bob') and wait for it.
    p = Process(target=name_f, args=('bob',))
    p.start()
    p.join()


In [None]:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Step size applied to every weight and bias update in logisticRegression_train. */
const double learningRate = 0.01;
/* Number of passes over the samples made by logisticRegression_train. */
const int epochs = 1000;
/* Length of each weight row allocated in logisticRegression_initialize. */
const int features = 2;
/* Number of weight rows and bias entries allocated in logisticRegression_initialize. */
const int classes = 2;

/*
 * Model parameters: one weight row and one bias value per class.
 *
 * The typedef makes the plain name `LogisticRegression` usable as a type.
 * The functions in this file spell it that way, and C (unlike C++) does not
 * accept a bare struct tag as a type name. `struct LogisticRegression`
 * remains valid as well.
 */
typedef struct LogisticRegression {
  double** weights; /* `classes` rows of `features` doubles each */
  double* bias;     /* `classes` doubles */
} LogisticRegression;

/*
 * Allocate and zero the parameters of `lr`: `classes` weight rows of
 * `features` doubles each, plus `classes` bias values.
 *
 * The row-pointer array holds exactly `classes` pointers (the previous
 * `classes * features` count over-allocated it).
 *
 * If any allocation fails, everything allocated so far is released and both
 * `lr->weights` and `lr->bias` are set to NULL, so the caller can detect the
 * failure by checking either pointer.
 */
void logisticRegression_initialize(LogisticRegression* lr) {
  lr->weights = malloc(classes * sizeof(double*));
  lr->bias = malloc(classes * sizeof(double));

  int ok = (lr->weights != NULL && lr->bias != NULL);
  int allocated = 0; /* number of weight rows successfully allocated */

  while (ok && allocated < classes) {
    lr->weights[allocated] = malloc(features * sizeof(double));
    if (lr->weights[allocated] == NULL) {
      ok = 0;
      break;
    }
    for (int j = 0; j < features; j++) {
      lr->weights[allocated][j] = 0;
    }
    lr->bias[allocated] = 0;
    allocated++;
  }

  if (!ok) {
    /* Roll back: free the rows that were allocated, then the two arrays. */
    if (lr->weights != NULL) {
      for (int k = 0; k < allocated; k++) {
        free(lr->weights[k]);
      }
    }
    free(lr->weights);
    free(lr->bias);
    lr->weights = NULL;
    lr->bias = NULL;
  }
}

/*
 * Runs `epochs` passes over the samples in `x`. For each sample it calls
 * classify(), derives an error from the label and the predicted class, and
 * adds learningRate * error (times the feature value for weights) to the
 * weight row and bias entry selected by the label.
 *
 * NOTE(review): this function does not compile as written:
 *   - `x.size` is applied to a `double**`, which has no members; the number
 *     of samples is not available from the parameters.
 *   - `y[i]` is a double but is used as an array subscript for
 *     `lr->weights` and `lr->bias`.
 *   - `classify` is called here before any declaration of it is visible.
 * Fixing the first point needs a sample-count parameter, i.e. a change to
 * the signature.
 */
void logisticRegression_train(LogisticRegression* lr, double** x, double* y) {
  for (int epoch = 0; epoch < epochs; epoch++) {
    /* NOTE(review): `correct` is counted but never read. */
    int correct = 0;
    for (int i = 0; i < x.size; i++) {
      int predictedClass = classify(lr, x[i]);
      if (predictedClass == y[i]) {
        correct++;
      }
      /* Half the difference between the label and the predicted class. */
      double error = (y[i] - predictedClass) / 2;
      for (int j = 0; j < features; j++) {
        lr->weights[y[i]][j] += learningRate * error * x[i][j];
      }
      lr->bias[y[i]] += learningRate * error;
    }
  }
}

/*
 * Walks the classes in order, adding sigmoid(w_c . x + b_c) to a running
 * total, and returns the first class index at which the total reaches 1.
 * Returns 0 when the total never reaches 1.
 *
 * NOTE(review): this is a cumulative threshold rather than a pick of the
 * highest-scoring class -- confirm that this is the intended decision rule.
 */
int classify(LogisticRegression* lr, double* x) {
  double cumulative = 0;
  int cls = 0;
  while (cls < classes) {
    cumulative += sigmoid(dotProduct(lr->weights[cls], x) + lr->bias[cls]);
    if (cumulative >= 1) {
      return cls;
    }
    cls++;
  }
  return 0;
}

double sigmoid(double x) {
  return 1 / (1 + exp(-x));
}

double dotProduct(double* w, double* x) {
  double sum = 0;
  for (int i = 0; i < x.size; i++) {
    sum += w[i] * x[i];
  }
  return sum;
}
