<a href="https://colab.research.google.com/github/JasaZnidar/Predvidenje-zmagovalca-vaterpolo/blob/main/Diplomska_naloga.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup environment and imports

## Setup

In [None]:
# Install the PyTorch Geometric companion packages from the wheel index given with -f
# (the index URL targets torch 2.1.0 + cu121), then torch-geometric and scikit-plot.
# NOTE(review): the wheel index is tied to one torch/CUDA build and no package version is
# pinned -- confirm it still matches the torch version of the runtime.
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
!pip install torch-geometric
!pip install scikit-plot

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/pyg_lib-0.4.0%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_scatter
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_scatter-2.1.2%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_sparse
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_sparse-0.6.18%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_cluster
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_cluster-1.6.3%2Bp

## Imports

In [None]:
import json
from urllib.request import urlopen
import networkx as nx
import torch
import torch_geometric
from torch_geometric.utils.convert import from_networkx
from torch_geometric import nn
from torch_geometric.data import HeteroData
from torch_geometric import transforms as T
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import scikitplot as skplt
import matplotlib.pyplot as plt
%matplotlib inline

## Other

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Analyzing scraped data and creating the graph



## Get scraped data from github repository

In [None]:
# Download the scraped match data (JSON) from a fixed commit of the scraper repository,
# so the notebook always sees the same snapshot.
DATA_URL = "https://raw.githubusercontent.com/JasaZnidar/totalwaterpolo-web-scraper/523ea70f7c183c38866dc346807f7b59f35b539a/data.json"

with urlopen(DATA_URL) as response:
    scraped_data = json.load(response)

## Organize data

In [None]:
players = {}  # player_id: {total data (not per match)}
teams = {}    # team_name: {wins, matches}
matches = {}  # match_id: {home_team_name, away_team_name, winner [home, away, tie], home_lineup, away_lineup}


def _new_player(name):
  """Return the cumulative stat record for a player seen for the first time."""
  return {
      'name': name,
      'goals': 0,
      'shots': 0,
      'assists': 0,
      'blocks': 0,
      'played': 1,  # the match in which the player is first seen already counts
      'saves': 0,   # never incremented below: goalkeeper plays are skipped for now
      'exclusions': 0,
      'penalties': 0,
      'suspensions': 0,
      'brutalities': 0,
      'sprints': 0,
      'sprints_won': 0
  }


def _lineup_player_id(lineup, number):
  """Return the 'id' of cap `number` in `lineup`, or 0 when it cannot be resolved.

  0 is the "no player" marker the play records themselves use. It is returned when
  the cap number is missing from the lineup or the entry has no 'id' key (entries
  without an id are treated as goalkeepers throughout this cell), so callers never
  hit a KeyError on such plays.
  """
  return lineup.get(str(number), {}).get('id', 0)


for competition in scraped_data['competitions']:
  for game in competition['matches']:
    home_score = game['result']['home']
    away_score = game['result']['away']
    if home_score > away_score:
      winner = "home"
    elif home_score < away_score:
      winner = "away"
    else:
      winner = "tie"

    match_record = {
        'home': game['teams']['home'],
        'away': game['teams']['away'],
        'winner': winner,
        'home_lineup': [],
        'away_lineup': []
    }
    matches[game['id']] = match_record

    for team in ['away', 'home']:
      # create the team on first sight, then update its stats (a tie is a match but not a win)
      team_record = teams.setdefault(game['teams'][team], {'wins': 0, 'matches': 0})
      team_record['matches'] += 1
      if winner == team:
        team_record['wins'] += 1

      lineup = game['lineup'][team]
      for number in lineup:
        entry = lineup[number]
        if 'id' not in entry:
          # it's a goalkeeper
          continue
        player_id = entry['id']

        # add player to match lineup list
        match_record[f"{team}_lineup"].append(player_id)

        # first appearance creates the record, later ones only count the match
        if player_id not in players:
          players[player_id] = _new_player(entry['name'])
        else:
          players[player_id]['played'] += 1

    for play in game['plays']:
      # check if a player was marked
      if play['player_1'] == 0:
        continue

      # find teams
      team_1 = play['team']
      team_2 = "home" if team_1 == "away" else "away"

      # skip plays whose main player is a goalkeeper or an unknown cap number (IGNORE FOR NOW)
      id_1 = _lineup_player_id(game['lineup'][team_1], play['player_1'])
      if id_1 == 0:
        continue

      # Depending on the play, the second player is an opponent (index 0) or a
      # team-mate (index 1). Both candidates are resolved; whichever cannot be
      # resolved stays 0 and is ignored below.
      id_2 = [0, 0]
      if play['player_2'] != 0:
        id_2[0] = _lineup_player_id(game['lineup'][team_2], play['player_2'])
        id_2[1] = _lineup_player_id(game['lineup'][team_1], play['player_2'])

      # detect play type
      action = play['action']
      if "goal scored" in action:
        players[id_1]['shots'] += 1
        players[id_1]['goals'] += 1

        # was there an assist
        if id_2[1] != 0:
          players[id_2[1]]['assists'] += 1
      elif "exclusion" in action:
        players[id_1]['exclusions'] += 1
      elif "penalty foul" in action:
        players[id_1]['penalties'] += 1
      elif "shot missed" in action:
        players[id_1]['shots'] += 1
      elif "shot saved" in action:
        players[id_1]['shots'] += 1
      elif "shot blocked" in action:
        players[id_1]['shots'] += 1
        if id_2[0] != 0:
          players[id_2[0]]['blocks'] += 1
      # NOTE(review): "suspention" is kept exactly as it was written; confirm it matches
      # the spelling of the action labels in the scraped data.
      elif "suspention" in action:
        players[id_1]['suspensions'] += 1
      elif "brutality" in action:
        players[id_1]['brutalities'] += 1
      elif "sprint won" in action:
        players[id_1]['sprints_won'] += 1
        players[id_1]['sprints'] += 1

        # other player sprinting for the ball
        if id_2[0] != 0:
          players[id_2[0]]['sprints'] += 1

# Prepare data for learning

Some code in this and the Machine learning section was written based on [this article](https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70).

In [None]:
# Empty heterogeneous graph; the 'player' and 'team' node features and the
# 'result' / 'play' edges are attached to it in the cells below.
data = HeteroData()

## Normalize data and update it to be per match

In [None]:
# Running min/max per player feature, used later for min-max normalization.
# Every feature starts with the same placeholders (min 1000, max 0), which are
# narrowed while the per-match values are computed.
# NOTE(review): the 1000 placeholder assumes no feature value ever exceeds it -- confirm.
normalize = {
    feature: {'min': 1000, 'max': 0}
    for feature in (
        'goals', 'shots', 'assists', 'blocks', 'saves', 'exclusions',
        'penalties', 'suspensions', 'brutalities', 'sprints', 'matches',
    )
}

### Player data to per match and get min and max values for normalization

In [None]:
# Convert each player's cumulative counters into per-match averages and track
# the min/max of every feature for the normalization step.
perMatch = {}

# counters that are simply averaged over the matches the player appeared in
PER_MATCH_AVERAGED = ('goals', 'shots', 'assists', 'blocks', 'saves',
                      'exclusions', 'penalties', 'suspensions', 'brutalities')

for player_id in players:
  totals = players[player_id]
  played = totals['played']  # at least 1: a record is only created from a lineup appearance

  stats = {key: totals[key]/played for key in PER_MATCH_AVERAGED}
  # 'sprints' is the share of contested sprints the player won, not a per-match average.
  # The counter is stored under 'sprints_won' (the former 'sprints won' key raised KeyError).
  stats['sprints'] = 0.0 if totals['sprints'] == 0 else totals['sprints_won']/totals['sprints']
  stats['matches'] = played
  perMatch[player_id] = stats

  # get min and max for normalization
  for key in normalize:
    normalize[key]['min'] = min(normalize[key]['min'], stats[key])
    normalize[key]['max'] = max(normalize[key]['max'], stats[key])

### Normalize data

In [None]:
# Min-max normalize every per-match feature into [0, 1]:
#   (value - min) / (max - min)
# A feature whose min equals its max carries no information and is set to 0.0.
norm = {}

for player_id in perMatch:
  norm[player_id] = {}
  for key in perMatch[player_id]:
    lowest = float(normalize[key]['min'])
    highest = float(normalize[key]['max'])
    if lowest == highest:
      norm[player_id][key] = 0.0
    else:
      # the denominator used to be (min - max), which flipped the sign of every value
      norm[player_id][key] = (float(perMatch[player_id][key]) - lowest)/(highest - lowest)

### Calculate team win/loss ratio

In [None]:
# Share of its matches each team won (wins / matches), keyed by team name.
WL_ratio = {
    team_name: float(record['wins'])/float(record['matches'])
    for team_name, record in teams.items()
}

## Load data into HeteroData()

### Player data

In [None]:
# Row i of the feature matrix belongs to player_list[i]; 'play' edges refer to these row indices.
player_list = list(norm.keys())

NUM_PLAYER_FEATURES = 11  # number of normalized stats stored per player

# Build the matrix in one go. It must be floating point: with an integer dtype the
# normalized values are truncated when written into the tensor.
player_rows = [list(norm[player_id].values()) for player_id in player_list]
player_matrix = torch.tensor(player_rows, dtype=torch.float32).reshape(len(player_rows), NUM_PLAYER_FEATURES)

data['player'].x = player_matrix

### Match data

In [None]:
# Build the 'team' nodes and the edges of the graph.
# Every match contributes two 'team' nodes (home, then away) with the features
# [team win ratio, side flag (0.0 = home, 1.0 = away)], one 'result' edge
# home node -> away node, and one 'play' edge per listed player -> team node.
match_list = list(matches.keys())

# position of every player in player_list (row index in data['player'].x)
player_index = {player_id: row for row, player_id in enumerate(player_list)}

# label stored on the result edge; anything that is not a home/away win is a tie (0.5)
RESULT_LABEL = {"home": 0.0, "away": 1.0}
TIE_LABEL = 0.5

team_rows = []                          # one [win ratio, side flag] row per team node
played_src, played_dst = [], []         # player row index -> team node index
result_src, result_dst = [], []         # home team node index -> away team node index
result_labels = []

for match in matches:
  match_info = matches[match]
  node_of_side = {}

  for side, side_flag in (('home', 0.0), ('away', 1.0)):
    node_of_side[side] = len(team_rows)
    team_rows.append([WL_ratio[match_info[side]], side_flag])

    for player in match_info[f"{side}_lineup"]:
      played_src.append(player_index[player])
      played_dst.append(node_of_side[side])

  # result relation
  result_src.append(node_of_side['home'])
  result_dst.append(node_of_side['away'])
  result_labels.append(RESULT_LABEL.get(match_info['winner'], TIE_LABEL))

# Features are floating point (an integer dtype truncates the win ratios);
# edge indices are int64, as before.
teams_matrix = torch.tensor(team_rows, dtype=torch.float32).reshape(len(team_rows), 2)
result_matrix = torch.tensor([result_src, result_dst], dtype=torch.int64)
played_matrix = torch.tensor([played_src, played_dst], dtype=torch.int64)
# NOTE(review): kept in the original [1, num_edges] layout; PyG documents edge_attr as
# [num_edges, num_edge_features] -- confirm which layout downstream code expects.
result_attr = torch.tensor([result_labels], dtype=torch.float32)

data['team'].x = teams_matrix
data['team', 'result', 'team'].edge_index = result_matrix
data['team', 'result', 'team'].edge_attr = result_attr
data['player', 'play', 'team'].edge_index = played_matrix

print(data)

HeteroData(
  player={ x=[2551, 11] },
  team={ x=[1236, 2] },
  (team, result, team)={
    edge_index=[2, 618],
    edge_attr=[1, 618],
  },
  (player, play, team)={ edge_index=[2, 13827] }
)


### Final adjustments

In [None]:
# make undirected
data_undirected = T.ToUndirected()(data)

result_edge = ("team", "result", "team")

# When num_neighbors is a dict it needs an entry for every edge type of the graph.
# The dict used to list only the result edges, so building the loader failed with
# "Missing number of neighbors for edge type '('player', 'play', 'team')'".
# NOTE(review): the same [14, 1, 20] fan-out is now applied to every edge type --
# adjust per edge type if a different sampling depth is intended.
neighbors = {edge_type: [14, 1, 20] for edge_type in data.edge_types}
loader = LinkNeighborLoader(
    data,
    num_neighbors=neighbors,
    batch_size=128,
    edge_label_index=(result_edge, data[result_edge].edge_index)
)

# split data into training, validation and testing
transform = T.RandomLinkSplit(
    num_val=0.3,
    num_test=0.0,
    is_undirected=True,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=result_edge
)
train_data, val_data, test_data = transform(data_undirected)

# update data to include neighbors
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[11],
    neg_sampling_ratio=2.0,
    edge_label_index=(result_edge, train_data[result_edge].edge_label_index),
    edge_label=train_data[result_edge].edge_label,
    batch_size=128,
    shuffle=True
)
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[11],
    edge_label_index=(result_edge, val_data[result_edge].edge_label_index),
    edge_label=val_data[result_edge].edge_label,
    batch_size=3*128,
    shuffle=False
)

# show one sampled mini-batch as a sanity check
print(next(iter(loader)))

ValueError: Missing number of neighbors for edge type '('player', 'play', 'team')'

# Machine learning

## GNN module

In [None]:
class GNN(torch.nn.Module):
  """Two GCNConv layers followed by a linear layer that outputs one value per node."""

  def __init__(self, hidden_channels):
    """Create the layers; both convolutions map hidden_channels -> hidden_channels."""
    super().__init__()
    width = hidden_channels
    self.conv1 = nn.GCNConv(width, width)
    self.conv2 = nn.GCNConv(width, width)
    self.linear1 = torch.nn.Linear(width, 1)

  def forward(self, x, edge_index):
    """Run conv1 -> ReLU -> conv2 -> linear1 on node features `x` over `edge_index`."""
    hidden = self.conv1(x, edge_index).relu()
    hidden = self.conv2(hidden, edge_index)
    return self.linear1(hidden)

## GCN

In [None]:
class GCN(torch.nn.Module):
  """Stack of RGCNConv layers (ReLU in between) followed by a linear layer to one value per node."""

  def __init__(self, hidden_channels, conv_layers=2, num_relations=1):
    """
    hidden_channels: input and output width of every convolution.
    conv_layers:     number of RGCNConv layers.
    num_relations:   number of edge relation types; RGCNConv requires this argument.
    """
    super().__init__()
    # ModuleList registers the layers, so their weights appear in .parameters()
    # and follow .to(device); a plain Python list does neither.
    self.conv = torch.nn.ModuleList(
        [nn.RGCNConv(hidden_channels, hidden_channels, num_relations) for _ in range(conv_layers)]
    )
    self.linear = torch.nn.Linear(hidden_channels, 1)

  def forward(self, x, edge_index, edge_type=None):
    """
    x:          node features with hidden_channels columns.
    edge_index: [2, num_edges] edge indices.
    edge_type:  relation index per edge; when omitted, every edge is treated as relation 0.
    """
    if edge_type is None:
      edge_type = torch.zeros(edge_index.size(1), dtype=torch.long, device=edge_index.device)

    for layer_number, conv in enumerate(self.conv):
      # ReLU between convolutions, but not before the first or after the last one
      if layer_number > 0:
        x = torch.nn.functional.relu(x)
      x = conv(x, edge_index, edge_type)

    return self.linear(x)

## Classifier
Used to create edge-level predictions

In [None]:
class Classifier(torch.nn.Module):
  """Scores each candidate edge with the dot product of its two endpoint feature rows."""

  def forward(self, x_home, x_away, edge_label_index):
    """
    x_home:           feature rows indexed by edge_label_index[0].
    x_away:           feature rows indexed by edge_label_index[1].
    edge_label_index: pair of index rows, one column per edge to score.

    Returns one score per edge.
    """
    edge_feat_home = x_home[edge_label_index[0]]
    edge_feat_away = x_away[edge_label_index[1]]

    # element-wise product summed over the feature dimension = dot product per edge
    return (edge_feat_home * edge_feat_away).sum(dim=1)

## Model

In [None]:
class Model(torch.nn.Module):
  """Link-prediction model for ('team', 'result', 'team') edges.

  Team nodes are encoded as a linear projection of their features plus a learned
  per-node embedding, passed through the GNN over the result edges, and every
  edge in `edge_label_index` is scored by the Classifier.

  NOTE(review): player nodes do not take part in message passing yet;
  `player_emb` is kept from the original definition but is not used in forward.
  """

  def __init__(self, hidden_channels):
    super().__init__()

    # embedding tables are sized from the full graph held in the notebook-level `data`
    self.team_emb = torch.nn.Embedding(data['team'].num_nodes, hidden_channels)
    self.player_emb = torch.nn.Embedding(data['player'].num_nodes, hidden_channels)
    # lifts the raw team features to the width the GNN expects
    self.team_lin = torch.nn.Linear(data['team'].x.size(-1), hidden_channels)

    self.gnn = GNN(hidden_channels)

    self.classifier = Classifier()

  def forward(self, data):
    """Return one logit per edge in data['team', 'result', 'team'].edge_label_index."""
    team = data['team']
    result = data['team', 'result', 'team']

    # Mini-batches from the neighbor loaders carry the global node ids in n_id;
    # without it the local node order is used for the embedding lookup.
    n_id = getattr(team, 'n_id', None)
    if n_id is None:
      n_id = torch.arange(team.num_nodes, device=team.x.device)

    x = self.team_lin(team.x.float()) + self.team_emb(n_id)

    # The GNN already ends in its own linear layer. The extra `self.linear1` call
    # that used to follow referenced an attribute this class never defined.
    node_out = self.gnn(x, result.edge_index)

    pred = self.classifier(node_out, node_out, result.edge_label_index)
    return pred

## Training

In [None]:
# Train the link-prediction model with binary cross-entropy on the logits.
model = Model(64)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(16):
  model.train()
  total_loss = total_examples = 0
  for sampled_data in tqdm.tqdm(train_loader):
    optimizer.zero_grad()
    sampled_data = sampled_data.to(device)
    pred = model(sampled_data)
    ground_truth = sampled_data['team', 'result', 'team'].edge_label
    loss = torch.nn.functional.binary_cross_entropy_with_logits(pred, ground_truth)
    loss.backward()
    optimizer.step()
    # weight the batch loss by its number of predictions so the epoch value is a per-example mean
    total_loss += float(loss) * pred.numel()
    total_examples += pred.numel()

  # an empty loader would otherwise raise ZeroDivisionError
  epoch_loss = total_loss / total_examples if total_examples else float('nan')
  print(f"Epoch: {epoch+1:03d}, Loss: {epoch_loss:.4f}")

## Validate

In [None]:
# Score every validation edge, then report the ROC curve and its AUC.
preds = []
ground_truths = []

model.eval()
with torch.no_grad():
    for sampled_data in tqdm.tqdm(val_loader):
        sampled_data = sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data['team', 'result', 'team'].edge_label)

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
fpr, tpr, _ = roc_curve(ground_truth, pred)

print()
plt.plot([0, 1], [0, 1], color="red", lw=2, linestyle="--")  # chance level
plt.plot(fpr, tpr, color="navy")
plt.title('ROC curve (validation)')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print(f"Validation AUC: {auc:.4f}")