<a href="https://colab.research.google.com/github/JasaZnidar/Predvidenje-zmagovalca-vaterpolo/blob/mathematics-all3/Diplomska_naloga.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
# @title Imports
import json
import requests
from zipfile import ZipFile
from io import BytesIO
import numpy as np
import random

import warnings

# Support functions

In [2]:
# @title Event data to vector data
def eventToVector(event: dict) -> tuple[list[float], list[float]]:
  player1Statistic = [0.0]*11
  player2Statistic = [0.0]*11

  if "goal scored" in event['action']:
    # goals
    player1Statistic[0] += 1
    # shots
    player1Statistic[1] += 1
    # assists
    player2Statistic[2] += 1

  elif "exclusion" in event['action']:
    # exclusion
    player1Statistic[5] += 1

  elif "penalty foul" in event['action']:
    # penalty
    player1Statistic[6] += 1

  elif "shot missed" in event['action']:
    # shots
    player1Statistic[1] += 1

  elif "shot saved" in event['action']:
    # shots
    player1Statistic[1] += 1

  elif "shot blocked" in event['action']:
    # shots
    player1Statistic[1] += 1
    # blocks
    player2Statistic[3] += 1

  elif "suspention" in event['action']:
    # suspensions
    player1Statistic[7] += 1

  elif "brutality" in event['action']:
    # brutalities
    player1Statistic[8] += 1

  elif "sprint won" in event['action']:
    # sprint won
    player1Statistic[9] += 1
    # sprint
    player1Statistic[10] += 1
    # sprint
    player2Statistic[10] += 1

  return (player1Statistic, player2Statistic);

In [3]:
# @title Is second player ally
def isAlly(event: dict) -> bool:
  if 'goal scored' in event['action']:
    return True
  return False

In [4]:
# @title Update list
def Update(original: list, update: list) -> list:
  assert len(original) == len(update)

  return[original[x] + update[x] for x in range(len(original))]

# Data Setup

In [5]:
# @title Data settings
data_file = "test"
train_rate = 0.7
val_rate = 1.0 - train_rate

# goals, shots, assists, blocks, saves, exclusion, penalty foul, suspention, brutality, sprint won, sprints
used_features = [1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0]
num_playerInMatch_features = sum(used_features)
# birth, hand height, position, weight
used_player_stats = [1, 1, 1, 1, 1]
num_player_features = sum(used_player_stats)

In [6]:
# @title Get JSON data from github
with requests.get(f"https://github.com/JasaZnidar/totalwaterpolo-web-scraper/raw/master/{data_file}.zip", ) as r:
  ZipFile(BytesIO(r.content), "r").extractall()

In [7]:
# @title Extract data to json object
with open(f"/content/{data_file}.json") as f:
    scraped_data = json.load(f)

In [8]:
# @title Team and Match data
def getData(matches: dict[str, dict], stop: int = None, skipTie: bool = False):
  match_list = [match_id for match_id in matches]
  if stop is not None:
    match_list = match_list[:stop]

  match_data = {}
  team_data = {}

  for match_id in match_list:
    #=============================================================================
    # Collect match data
    #=============================================================================
    match_data[match_id] = [
        matches[match_id]['result']['away'] - matches[match_id]['result']['home'],
        matches[match_id]['name']['home'],
        matches[match_id]['name']['away'],
        [],
        []
    ]

    #=============================================================================
    # Check if ties are skipped
    #=============================================================================
    if skipTie and match_data[match_id][0] == 0:
      del match_data[match_id]
      continue

    #=============================================================================
    # Prepare statistics dictionary for a teams players
    #=============================================================================
    statistics = {
        'home': {int(x): [0.0]*11 for x in matches[match_id]['lineup']['home']},
        'away': {int(x): [0.0]*11 for x in matches[match_id]['lineup']['away']}
    }

    if len(statistics['home']) < 7 or len(statistics['away']) < 7:
      del match_data[match_id]
      continue

    #=============================================================================
    # Add missing teams in team_data
    #=============================================================================
    for team in ['home', 'away']:
      if not matches[match_id]['name'][team] in team_data:
        team_data[matches[match_id]['name'][team]] = [[0.0]*11, 0]

    #=============================================================================
    # Loop through all events and update player statistics
    #=============================================================================
    for event in matches[match_id]['plays']:
      num_1 = event['player_1']
      num_2 = event['player_2']
      primary_team = event['team']
      secondary_team = primary_team if isAlly(event) else 'away' if primary_team == 'home' else 'home'

      # no player was recorded for this event
      if num_1 == 0:
        continue

      # no secondary player was recorded for this event
      elif num_2 == 0:
        data_1, _ = eventToVector(event)
        statistics[primary_team][num_1] = Update(statistics[primary_team][num_1], data_1)

      # there are 2 players recorded for this event
      else:
        data_1, data_2 = eventToVector(event)
        statistics[primary_team][num_1] = Update(statistics[primary_team][num_1], data_1)
        statistics[secondary_team][num_2] = Update(statistics[secondary_team][num_2], data_2)

    #=============================================================================
    # Update match data
    #=============================================================================
    match_data[match_id][3] = team_data[matches[match_id]['name']['home']]
    match_data[match_id][4] = team_data[matches[match_id]['name']['away']]

    #=============================================================================
    # Update team overall data
    #=============================================================================
    for team in ['home', 'away']:
      team_name = matches[match_id]['name'][team]

      for player in statistics[team]:
        team_data[team_name][0] = Update(team_data[team_name][0], statistics[team][player])
        team_data[team_name][1] += 1

  return (match_data, team_data)

match_data, team_data = getData(scraped_data['matches'])
noTieMatch_data, noTieTeam_data = getData(scraped_data['matches'], skipTie=True)

# Models

In [9]:
# @title Statistics
class Statistics:
  def __init__(self, match_data: dict[str, list[int, str, str, list[float], list[float]]]):
    self.match_data = match_data
    self.resultStats = [0, 0, 0] # from home perspective: [win, tie, loss]

    for match_id in scraped_data['matches']:
      lineup = scraped_data['matches'][match_id]['lineup']
      if len(lineup['home']) < 7 or len(lineup['away']) < 7:
        continue

      diff = scraped_data['matches'][match_id]['result']['home'] - scraped_data['matches'][match_id]['result']['away']
      if diff > 0:
        self.resultStats[0] += 1
      elif diff < 0:
        self.resultStats[2] += 1
      else:
        self.resultStats[1] += 1

  def Predict(self):
    stats = [
        self.resultStats[0] / sum(self.resultStats),
        self.resultStats[1] / sum(self.resultStats),
        self.resultStats[2] / sum(self.resultStats)
    ]
    values = [1, 0, -1]

    return values[np.argmax(stats)]



In [10]:
# @title KNN
class KNN:
  def __init__(self, k: int, team_data: dict[str, list[list[float], int]], match_history: dict[str, list[int, str, str, list[float], list[float]]]):
    self.k = k

    teams = list(team_data.keys())
    self.team_data = {
        team: np.array(team_data[team][0])/team_data[team][1]
        for team in team_data
    }
    self.match_history = match_history

  def Predict(self, friendly_team: str, enemy_team: str, friendly_home: bool = None) -> float:
    if friendly_team not in self.team_data and enemy_team not in self.team_data:
      # both teams are unknown, return tie
      return 0

    if enemy_team not in self.team_data:
      # enemy team has no history, reverse the question and then the answer
      return -self.Predict(enemy_team, friendly_team, not friendly_home)

    # collect all teams who played against the enemy team
    neighbors = set([])
    neighbor_results = {}
    for match_id in self.match_history:
      history_match = self.match_history[match_id]

      # friendly team is not home team => enemy team is home
      if friendly_home != True and history_match[1] == enemy_team and history_match[2] != friendly_team:
        neighbors.add(history_match[2])
        if history_match[2] not in neighbor_results:
          neighbor_results[history_match[2]] = []
        neighbor_results[history_match[2]].append(history_match[0])

      # friendly team is not away team => enemy team is away
      if friendly_home != False and history_match[2] == enemy_team and history_match[1] != friendly_team:
        neighbors.add(history_match[1])
        if history_match[1] not in neighbor_results:
          neighbor_results[history_match[1]] = []
        neighbor_results[history_match[1]].append(history_match[0])

    # form a neighbor matrix for distnce
    neighbor_matrix = np.zeros((len(neighbors), 11))
    neighbor_index = []
    for neighbor in neighbors:
      neighbor_matrix[len(neighbor_index), :] = self.team_data[neighbor]
      neighbor_index.append(neighbor)

    # calculate distance
    if friendly_team not in self.team_data:
      neighbor_distance = (neighbor_matrix)**2
    else:
      neighbor_distance = (neighbor_matrix - self.team_data[friendly_team])**2
    neighbor_distance = np.sum(neighbor_distance, axis=1)

    # select k nearest neighbors, if possible
    modifier = 1
    if len(neighbor_distance) < self.k:
      knn_index = [n for n in range(len(neighbor_distance))]
      modifier = len(neighbor_distance)/self.k  # normalize data to match uncertanty
      if len(neighbor_distance) == 0:
        # enemy team has no history, reverse the question and then the answer
        return -self.Predict(enemy_team, friendly_team, not friendly_home)
      #warnings.warn("Not enough neighbors to make a full prediction. Prediction will be made with {} neighbors instead of {}.".format(len(neighbor_distance), self.k))
    elif len(neighbor_distance) == self.k:
      knn_index = [n for n in range(len(neighbor_distance))]
    else:
      knn_index = np.argpartition(neighbor_distance, self.k)[:self.k]

    # calculate prediction
    pred = 0.0
    weight = 0.0
    for i in knn_index:
      for result in neighbor_results[neighbor_index[i]]:
        pred += result * (1/neighbor_distance[i])
        weight += 1/neighbor_distance[i]
    pred /= weight

    return pred

In [11]:
# @title Linear regresion
def gradient(X: np.ndarray, y: np.ndarray, theta: np.ndarray, l2_lambda: float) -> np.array:
    _, m = X.shape
    L2 = np.ones((m,)) * l2_lambda
    L2[0] = 0.0

    return np.dot(np.dot(X, theta) - y, X) + L2*theta


def gradient_descent(X: np.ndarray, y: np.ndarray, l2_lambda: float, lr=0.01, tol=1e-6, max_iter=100_000) -> np.array:
    _, m = X.shape
    theta = np.ones((m, ))

    for _ in range(max_iter):
        theta_next = theta - lr*gradient(X, y, theta, l2_lambda)

        if np.isinf(theta_next).any() or np.isnan(theta_next).any():
            return theta

        diff = np.linalg.norm(gradient(X, y, theta_next, l2_lambda))
        if np.isinf(diff):
          raise Exception("Diff is inf.")
        if np.isnan(diff):
          raise Exception("Diff is nan.")

        if diff < tol:
            return theta_next

        theta = theta_next

    return theta

class LinReg:
  def __init__(self, match_history: dict[str, list[int, str, str, list[float], list[float]]]):
    # prepare learning data
    X = np.zeros((len(match_history), 22))
    y = np.zeros((len(match_history)))

    # fill out learning data
    match_list = list(match_history.keys())
    for i in range(len(match_list)):
      match_id = match_list[i]
      home_data = np.array(match_history[match_id][3][0])/match_history[match_id][3][1]
      away_data = np.array(match_history[match_id][4][0])/match_history[match_id][4][1]

      X[i, :] = np.concatenate((home_data, away_data))
      y[i] = match_history[match_id][0] # result

    # add coeficient to X
    X_ = np.concatenate((np.ones((len(match_history), 1)), X), axis=1)

    # gradient descent
    theta = gradient_descent(X_, y, 0.0, 1e-5, 1e-8, 1_000_000)

    self.coefs = theta[1:]
    self.intercept = theta[0]

  def Predict(self, home_team: str, away_team: str, team_data: dict[str, list[float]]):
    if home_team not in team_data or away_team not in team_data:
      home = np.zeros((11,))
    else:
      home = np.array(team_data[home_team][0])/team_data[home_team][1]

    if away_team not in team_data:
      away = np.zeros((11,))
    else:
      away = np.array(team_data[away_team][0])/team_data[away_team][1]

    home = np.reshape(home, (1, -1))
    away = np.reshape(away, (1, -1))

    X_ = np.concatenate((np.ones((1, 1)), home, away), axis=1)
    theta = np.insert(self.coefs, 0, self.intercept)

    return np.dot(X_, theta)[0]

# Train and test

In [12]:
N = 100

In [13]:
# @title Classification
def classify(val: float, allowTie: bool = True):
  if allowTie:
    if val >= 0.5:
      return 1
    elif val <= -0.5:
      return -1
    return 0
  else:
    if val >= 0:
      return 1
    return -1

In [14]:
# @title Prepare train and test data
train_match_data, train_team_data = getData(scraped_data['matches'], -N)
train_noTieMatch_data, train_noTieTeam_data = getData(scraped_data['matches'], -N, True)

test_match_data = {}
for match_id in match_data:
  if match_id not in train_match_data:
    test_match_data[match_id] = match_data[match_id]
ground = [classify(test_match_data[match_id][0]) for match_id in test_match_data]

test_noTieMatch_data = {}
for match_id in noTieMatch_data:
  if match_id not in train_noTieMatch_data:
    test_noTieMatch_data[match_id] = noTieMatch_data[match_id]
noTieGround = [classify(test_noTieMatch_data[match_id][0]) for match_id in test_noTieMatch_data]

In [15]:
# @title Statistics
# with ties
stats = Statistics(train_match_data)
stats_pred = []

for match_id in test_match_data:
  stats_pred.append(classify(stats.Predict()))

print("Ties included:", np.average(np.array(stats_pred) == np.array(ground)))

# without ties
noTieStats = Statistics(train_noTieMatch_data)
noTieStats_pred = []

for match_id in test_noTieMatch_data:
  noTieStats_pred.append(classify(noTieStats.Predict(), False))

print("Ties not included:", np.average(np.array(noTieStats_pred) == np.array(noTieGround)))

Ties included: 0.4897959183673469
Ties not included: 0.5052631578947369


In [16]:
# @title KNN - 3
# with ties
knn3 = KNN(3, train_team_data, train_match_data)
knn3_pred = []

for match_id in test_match_data:
  knn3_pred.append(classify(knn3.Predict(test_match_data[match_id][1], test_match_data[match_id][2], True)))

print("Ties included:", np.average(np.array(knn3_pred) == np.array(ground)))

# without ties
noTieKnn3 = KNN(3, train_noTieTeam_data, train_noTieMatch_data)
noTieKnn3_pred = []

for match_id in test_noTieMatch_data:
  noTieKnn3_pred.append(classify(noTieKnn3.Predict(test_noTieMatch_data[match_id][1], test_noTieMatch_data[match_id][2], True), False))

print("Ties not included:", np.average(np.array(noTieKnn3_pred) == np.array(noTieGround)))

Ties included: 0.5510204081632653
Ties not included: 0.6210526315789474


In [17]:
# @title Linear regression
# with ties
linreg = LinReg(train_match_data)
linreg_pred = []

for match_id in test_match_data:
  linreg_pred.append(classify(linreg.Predict(test_match_data[match_id][1], test_match_data[match_id][2], team_data)))

print("Ties included:", np.average(np.array(linreg_pred) == np.array(ground)))

# without ties
noTieLinreg = LinReg(train_match_data)
noTieLinreg_pred = []

for match_id in test_noTieMatch_data:
  noTieLinreg_pred.append(classify(linreg.Predict(test_noTieMatch_data[match_id][1], test_noTieMatch_data[match_id][2], train_noTieTeam_data), False))

print("Ties not included:", np.average(np.array(noTieLinreg_pred) == np.array(noTieGround)))

Ties included: 0.6938775510204082
Ties not included: 0.6947368421052632
