In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive
%cd 'Colab Notebooks'
%cd 'nlp'

/content/drive/MyDrive
/content/drive/MyDrive/Colab Notebooks
/content/drive/MyDrive/Colab Notebooks/nlp


In [None]:
!pip install stanza
import stanza
# Import client module
from stanza.server import CoreNLPClient

In [None]:
import pandas as pd
from collections import defaultdict
from heapq import nlargest
import string
import nltk
from nltk.tree import ParentedTree
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datetime import datetime as dt
import re

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the tab-separated match dataset from a path relative to the current
# working directory (set by the %cd cell earlier in the notebook).
df = pd.read_csv("cleaned-dataset.tsv", sep='\t')

In [None]:
# extract season from match date
def process_season(date):
  """Map a match date such as "Aug-13" to its season label, e.g. "13/14".

  ``date`` is parsed with the "%b-%y" format (abbreviated month name and a
  two-digit year). Months before June are attributed to the season that
  started in the previous calendar year; June onwards opens a new season.
  """
  parsed = dt.strptime(date, "%b-%y")
  # year expressed as an offset from 2000 (2013 -> 13)
  start_year = parsed.year - 2000
  if parsed.month < 6:
    # January-May fixtures belong to the season that began the year before
    start_year -= 1
  return f"{start_year}/{start_year + 1}"

# derive the season label for every match from its "Date" value
df["Season"] = df["Date"].map(process_season)

In [None]:
# every club that appears as either the home or the away side
all_teams = set(df["HomeTeam"].unique()) | set(df["AwayTeam"].unique())

In [None]:
# Persist the frame (now carrying the "Season" column) back to the file it was
# loaded from. index=False keeps the positional index out of the file: the
# loader reads it without index_col, so writing the index would add one more
# "Unnamed: N" column to the dataset on every run of the notebook.
df.to_csv("cleaned-dataset.tsv", sep='\t', index=False)

In [None]:
# pre-processing
def process_text(text):
  """Lower-case ``text``, strip ASCII punctuation and drop English stopwords.

  Returns the remaining tokens as a list. Tokens are produced by splitting on
  single spaces, so consecutive spaces yield empty-string tokens.
  """
  # a set gives constant-time membership tests instead of scanning the whole
  # stopword list once per token
  stopwords = set(nltk.corpus.stopwords.words('english'))
  # remove every character listed in string.punctuation in a single pass
  text = text.lower().translate(str.maketrans('', '', string.punctuation))
  return [token for token in text.split(' ') if token not in stopwords]

In [None]:
# datasets containing player names for each season
# used to build key-term dictionaries

def fifa_dataset_path(i):
  """Return the relative CSV path of the FIFA edition ``i`` player dataset."""
  return "fifa-dataset/Fifa{}-Players.csv".format(i)

# one player table per FIFA edition, 13 through 17
player_df_13, player_df_14, player_df_15, player_df_16, player_df_17 = (
    pd.read_csv(fifa_dataset_path(edition)) for edition in range(13, 18)
)

In [None]:
# make sure club names match
# club spellings rewritten in the FIFA 15-17 tables before comparing with all_teams
team_translate = {
    "West Ham": "West Ham United",
    "QPR": "Queens Park Rangers",
    "Newcastle Utd": "Newcastle United",
    "Cardiff City": "Cardiff",
    "Spurs": "Tottenham Hotspur",
    "Manchester Utd": "Manchester United",
}

# teams from the match data that have no player rows in at least one table
diff_names = set()
for player_df in (player_df_15, player_df_16, player_df_17):
  player_df["Club"] = player_df["Club"].replace(team_translate)
  for team in all_teams:
    has_players = (player_df["Club"] == team).any()
    if not has_players:
      diff_names.add(team)

In [None]:
# make sure club names match
# club spellings rewritten in the FIFA 13-14 tables before comparing with all_teams
team_translate = {
    "West Bromwich Albion": "West Brom",
    "Huddersfield Town": "Huddersfield",
    "Brighton &amp; Hove Albion": "Brighton",
    "Cardiff City": "Cardiff",
}

# teams from the match data that have no player rows in at least one table
# (this restarts the set; results of the previous check are discarded)
diff_names = set()
for player_df in (player_df_13, player_df_14):
  player_df["Club"] = player_df["Club"].replace(team_translate)
  for team in all_teams:
    has_players = (player_df["Club"] == team).any()
    if not has_players:
      diff_names.add(team)

In [None]:
# team -> list of player names, taken from the FIFA 13 table
squads_13_14 = {
    team: list(player_df_13.loc[player_df_13["Club"] == team, "Name"])
    for team in all_teams
}

with open("squads_13_14.json", "w") as f:
  json.dump(squads_13_14, f)

In [None]:
# team -> list of player names, taken from the FIFA 14 table
squads_14_15 = {
    team: list(player_df_14.loc[player_df_14["Club"] == team, "Name"])
    for team in all_teams
}

with open("squads_14_15.json", "w") as f:
  json.dump(squads_14_15, f)

In [None]:
# team -> list of player names, taken from the FIFA 15 table
squads_15_16 = {
    team: list(player_df_15.loc[player_df_15["Club"] == team, "Name"])
    for team in all_teams
}

with open("squads_15_16.json", "w") as f:
  json.dump(squads_15_16, f)

In [None]:
# team -> list of player names, taken from the FIFA 16 table
squads_16_17 = {
    team: list(player_df_16.loc[player_df_16["Club"] == team, "Name"])
    for team in all_teams
}

with open("squads_16_17.json", "w") as f:
  json.dump(squads_16_17, f)

In [None]:
# team -> list of player names, taken from the FIFA 17 table
squads_17_18 = {
    team: list(player_df_17.loc[player_df_17["Club"] == team, "Name"])
    for team in all_teams
}

with open("squads_17_18.json", "w") as f:
  json.dump(squads_17_18, f)

In [None]:
def _read_json(path):
  """Parse and return the JSON document stored at ``path``."""
  with open(path, "r") as handle:
    return json.load(handle)


# reload the per-season squad dictionaries written by the cells above
squads_13_14 = _read_json("squads_13_14.json")
squads_14_15 = _read_json("squads_14_15.json")
squads_15_16 = _read_json("squads_15_16.json")
squads_16_17 = _read_json("squads_16_17.json")
squads_17_18 = _read_json("squads_17_18.json")

In [None]:
!pip install stanza

In [None]:
# reference https://stanfordnlp.github.io/CoreNLP/
# accessed 06/04/2023
# the Stanford CoreNLP client is used for OpenIE and parse tree extractions
import os
os.environ["CORENLP_HOME"] = "/content/drive/MyDrive/Colab Notebooks/nlp/stanford-corenlp-full-2018-10-05"

In [None]:
# Create the stanza CoreNLP client used by the extraction cells below and
# start it against http://localhost:9002.
# NOTE(review): 'parse' appears twice in the annotators list and 'openie' is
# listed before the annotators it presumably depends on -- confirm the server
# tolerates this, then consider de-duplicating/reordering the list.
# NOTE(review): timeout is presumably in milliseconds (~41 hours) -- confirm
# against the stanza CoreNLPClient documentation.
client = CoreNLPClient(timeout=150000000, be_quiet=True, annotators=['openie', 'parse', 'tokenize', 'pos', 'lemma', 'parse', 'depparse'], 
endpoint='http://localhost:9002')
client.start()

In [None]:
# reference, adapted from https://github.com/rahulkg31/sentence-to-clauses
# accessed 06/04/2023
# these functions print phrases from a parse tree generated by the CoreNLP client

def get_verb_phrases(t):
    """Return the verb phrases found under tree node ``t`` as space-joined strings.

    - A non-VP node is searched through all of its children.
    - A VP node with more than one VP child is searched through those VP
      children only.
    - Any other VP node contributes its own leaves as a single phrase.

    Children are only descended into when their height exceeds 2.
    """
    kids = [t[idx] for idx in range(len(t))]
    vp_kids = [kid for kid in kids if kid.label() == "VP"]

    if t.label() != "VP":
        to_search = kids
    elif len(vp_kids) > 1:
        to_search = vp_kids
    else:
        # a VP with at most one VP child is emitted whole
        return [' '.join(t.leaves())]

    found = []
    for kid in to_search:
        if kid.height() > 2:
            found.extend(get_verb_phrases(kid))
    return found


def get_pos(t):
    """Collect the tree positions that ``print_clauses`` deletes from a clause.

    Returns a ``(vp_pos, sub_conj_pos)`` tuple of lists holding the
    ``treeposition()`` of:

    - ``vp_pos``: VP children of a node that has no clause-level child;
    - ``sub_conj_pos``: the non-clause siblings of clause-level children
      (e.g. a subordinating conjunction next to an embedded S).

    Clause-level labels are S, SBAR, SBARQ, SINV and SQ.
    """
    clause_labels = ("S", "SBAR", "SBARQ", "SINV", "SQ")
    vp_pos = []
    sub_conj_pos = []
    children = [t[i] for i in range(len(t))]
    labels = [child.label() for child in children]

    # Compare whole labels. A regex/substring search for "S" over the joined
    # labels also fires on POS tags such as NNS, NNPS, JJS or POS, which made
    # ordinary noun phrases look like clauses and marked their tokens for
    # deletion.
    has_clause_child = any(label in clause_labels for label in labels)

    if has_clause_child:
        # drop this branch's else-part to keep subordinating conjunctions
        for child in children:
            if child.label() in clause_labels:
                inner_vp, inner_sub = get_pos(child)
                vp_pos.extend(inner_vp)
                sub_conj_pos.extend(inner_sub)
            else:
                sub_conj_pos.append(child.treeposition())
    elif "VP" in labels:
        for child in children:
            if child.label() == "VP":
                vp_pos.append(child.treeposition())
    else:
        # no VP and no clause at this level: look deeper, skipping
        # pre-terminal nodes (height <= 2)
        for child in children:
            if child.height() > 2:
                inner_vp, inner_sub = get_pos(child)
                vp_pos.extend(inner_vp)
                sub_conj_pos.extend(inner_sub)

    return (vp_pos, sub_conj_pos)


def print_clauses(parse_str):
    """Split a bracketed parse string into simple clauses, print and return them.

    The parse is loaded as a ``ParentedTree``. Clause-level subtrees
    (S, SBAR, SBARQ, SINV, SQ) whose parent is not itself clause-level are
    detached from the sentence tree; for each one, every verb phrase found by
    ``get_verb_phrases`` is paired with the leaves that remain after the
    positions reported by ``get_pos`` are deleted.

    Args:
        parse_str: bracketed parse of one sentence.

    Returns:
        List of strings of the form "<remaining leaves> <verb phrase>".
    """
    sent_tree = ParentedTree.fromstring(parse_str)
    clause_level_list = ["S", "SBAR", "SBARQ", "SINV", "SQ"]
    clause_list = []
    sub_trees = []

    # break the tree into subtrees of clauses using
    # clause levels "S","SBAR","SBARQ","SINV","SQ"
    # The subtree list is materialised up front and walked in reverse, so
    # deleting from sent_tree inside the loop does not disturb the iteration.
    for sub_tree in reversed(list(sent_tree.subtrees())):
        if sub_tree.label() in clause_level_list:
            # NOTE(review): assumes a clause-level node always has a parent
            # (e.g. a ROOT wrapper); presumably parent() is None for a root
            # node, which would raise here -- confirm against the parser output.
            if sub_tree.parent().label() in clause_level_list:
                continue

            # skip an S that consists solely of a VP; the parent check in this
            # condition is already guaranteed by the `continue` above
            if (len(sub_tree) == 1 and sub_tree.label() == "S" and sub_tree[0].label() == "VP"
                    and not sub_tree.parent().label() in clause_level_list):
                continue

            sub_trees.append(sub_tree)
            del sent_tree[sub_tree.treeposition()]

    # for each clause level subtree, extract relevant simple sentence
    for t in sub_trees:
        # get verb phrases from the new modified tree
        verb_phrases = get_verb_phrases(t)

        # get tree without verb phrases (mainly subject)
        # remove subordinating conjunctions
        vp_pos, sub_conj_pos = get_pos(t)
        # positions are deleted in reverse order of the returned lists
        for i in reversed(vp_pos):
            del t[i]
        for i in reversed(sub_conj_pos):
            del t[i]

        subject_phrase = ' '.join(t.leaves())

        # update the clause_list
        for i in verb_phrases:
            clause_list.append(subject_phrase + " " + i)

    # the clause list is printed on every call as well as returned
    print(clause_list)
    return clause_list

In [None]:
# parse tree phrase extractions

# one list of extracted clauses per match commentary
clauses_per_game = []
for text in df["Commentary"]:
  document = client.annotate(text, output_format='json')
  # collapse every whitespace run in the bracketed parses to a single space
  parse_trees = [' '.join(sentence['parse'].split()) for sentence in document['sentences']]

  curr_clauses = []
  for parse_tree in parse_trees:
    curr_clauses.extend(print_clauses(parse_tree))

  clauses_per_game.append(curr_clauses)

with open("clause-extractions", "wb") as f:
  pickle.dump(clauses_per_game, f)

In [None]:
# Stanford OpenIE extractions

# one list of "subject relation object" strings per match commentary
# NOTE(review): unlike the other extraction cells, this result is not written
# to disk here -- confirm where the openie extractions file comes from.
tuples_per_game = []
for text in df["Commentary"]:
  document = client.annotate(text, output_format='json')
  triples = [
      f"{triple['subject']} {triple['relation']} {triple['object']}"
      for sentence in document['sentences']
      for triple in sentence['openie']
  ]
  tuples_per_game.append(triples)

In [None]:
# ClausIE extractions
# reference, adapted from https://github.com/mmxgn/spacy-clausie
# accessed 06/04/2023
# this module implements the ClausIE algorithm (https://resources.mpi-inf.mpg.de/d5/clausie/clausie-www13.pdf) in Python

python -m pip install git+https://github.com/mmxgn/spacy-clausie.git
!python -m spacy download en_core_web_sm

import spacy
import claucy

clauses_per_game = []
nlp = spacy.load("en_core_web_sm")
claucy.add_to_pipe(nlp)

for text in df["Commentary"]:
  doc = nlp(text)
  clause_list = []
  for clause in doc._.clauses:
    props = clause.to_propositions(inflect=None)
    for tup in props:
      clause_list.append(" ".join([str(x) for x in tup]))

  clauses_per_game.append(clause_list)

with open("processed_data/spacy-clausie-extractions", "wb") as f:
  pickle.dump(clauses_per_game, f)

In [None]:
def _unpickle(path):
  """Return the object pickled at ``path`` (only use on files this project wrote)."""
  with open(path, "rb") as handle:
    return pickle.load(handle)


with open("processed_data/season_to_team_wordlist.json", "r") as f:
  season_to_team_wordlist = json.load(f)

# previously saved extraction results, one entry per match
clauses_per_game = _unpickle("processed_data/extractions/clause-extractions")
tuples_per_game = _unpickle("processed_data/extractions/openie-extractions")
spacy_clauses_per_game = _unpickle("processed_data/extractions/spacy-clausie-extractions")

In [None]:
!pip install unidecode
import unidecode

In [None]:
# replace accented characters with non-accented

from unidecode import unidecode
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# vocabulary of lower-cased, ASCII-transliterated tokens over all commentaries
unique_words = set()
for text in df["Commentary"]:
  for word in word_tokenize(text):
    # unidecode's second positional parameter is `errors`, not an encoding;
    # passing "utf-8" is an invalid value that raises for characters without a
    # transliteration. One call with the default is enough -- its output is
    # already ASCII, so a second pass is a no-op.
    unique_words.add(unidecode(word.lower()))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Enumerate the vocabulary in sorted order: iterating a set of strings has no
# stable order between interpreter sessions, so indices taken straight from the
# set would differ on every rebuild.
word_to_idx = {w: i for i, w in enumerate(sorted(unique_words))}
with open("processed_data/word_to_idx", "wb") as f:
  pickle.dump(word_to_idx, f)

In [None]:
# Reload the saved word -> index mapping so later cells can run without
# rebuilding the vocabulary. pickle.load executes arbitrary code from the
# file, so only load files produced by this project.
with open("processed_data/word_to_idx", "rb") as f:
  word_to_idx = pickle.load(f)

In [None]:
def probability_from_wordlist(sentence, key_terms):
  """Return the fraction of tokens in ``sentence`` that occur in ``key_terms``.

  Args:
    sentence: collection of lower-case words.
    key_terms: container of terms supporting the ``in`` operator.

  Returns:
    A float in [0, 1]; 0.0 when ``sentence`` is empty (instead of raising
    ZeroDivisionError).
  """
  if not sentence:
    return 0.0
  return sum(w in key_terms for w in sentence) / len(sentence)


def allocate_sentence_to_team(sentence, home, away, team_wordlist):
  """Decide which team a tokenised sentence refers to.

  Each team's key terms (``team_wordlist[team]``) are lower-cased and scored
  against ``sentence`` with ``probability_from_wordlist``. Returns ``home`` or
  ``away`` for the strictly higher score and ``None`` on a tie.
  """
  home_terms = [term.lower() for term in team_wordlist[home]]
  home_score = probability_from_wordlist(sentence, home_terms)

  away_terms = [term.lower() for term in team_wordlist[away]]
  away_score = probability_from_wordlist(sentence, away_terms)

  if home_score > away_score:
    return home
  if away_score > home_score:
    return away
  # equal scores: the sentence cannot be attributed to one side
  return None

In [None]:
# count vectorizer (bag of words)
# Bag-of-words vectoriser with a fixed vocabulary: the column layout is taken
# from word_to_idx rather than learned from the text passed to it.
# Tokenisation uses nltk's word_tokenize and English stopwords are removed.
cv = CountVectorizer(strip_accents='unicode', stop_words=nltk.corpus.stopwords.words('english'), 
                              tokenizer = word_tokenize, vocabulary=word_to_idx)

def process_sentences(home, away, sentence_list, team_wordlist, alpha=1, l=7576):
  """Build summed bag-of-words vectors for the home and away team of one match.

  Each sentence is tokenised, lower-cased and transliterated to ASCII, then
  allocated to a team with ``allocate_sentence_to_team``. Its count vector is
  added to that team's total; unallocated sentences count for both teams.

  Args:
    home, away: team names, used as keys into ``team_wordlist``.
    sentence_list: extracted sentences/clauses for the match.
    team_wordlist: mapping of team name to key terms.
    alpha: multiplier applied to the home vector only.
    l: vector length; when falsy it is taken from the first vector produced.

  Returns:
    Array of shape (2, l): row 0 is the home vector, row 1 the away vector.
  """
  home_vectors = []
  away_vectors = []
  for s in sentence_list:
    # unidecode's second positional parameter is `errors`, not an encoding;
    # passing "utf-8" is an invalid value that raises for characters without a
    # transliteration. A single call with the default already yields ASCII.
    new_words = [unidecode(word.lower()) for word in word_tokenize(s)]

    allocation = allocate_sentence_to_team(new_words, home, away, team_wordlist)
    vector = cv.fit_transform([" ".join(new_words)]).toarray()
    if not l:
      l = vector.shape[1]
    vector = vector.reshape((1, l))

    # an unallocated sentence (falsy allocation) is credited to both sides
    if not allocation or allocation == home:
      home_vectors.append(vector)
    if not allocation or allocation == away:
      away_vectors.append(vector)

  res = np.zeros((2, l)) # res[0] = home team vector, res[1] = away team vector

  # np.concatenate rejects an empty list, so always include a zero row
  home_vectors.append(np.zeros((1, l)))
  away_vectors.append(np.zeros((1, l)))
  res[0] = alpha * np.sum(np.concatenate(home_vectors), axis=0)
  res[1] = np.sum(np.concatenate(away_vectors), axis=0)

  return res


def process_match(idx, df, all_tuples, season_to_team_wordlist):
  """Vectorise the extractions of the match at positional row ``idx``.

  Reads the match's home team, away team and season from ``df`` and passes
  ``all_tuples[idx]`` together with that season's team word lists to
  ``process_sentences``, returning its result.
  """
  home, away, season = (
      df[column].iloc[idx] for column in ("HomeTeam", "AwayTeam", "Season")
  )
  return process_sentences(home, away, all_tuples[idx], season_to_team_wordlist[season])

In [None]:
# bag-of-words vectors for ClausIE
# one (2, vocab) home/away array per match, built from the ClausIE clauses
spacy_clausie_vectors = [
    process_match(i, df, spacy_clauses_per_game, season_to_team_wordlist)
    for i in range(len(df))
]

# arrays are converted to nested lists before pickling
new_clausie = [x.tolist() for x in spacy_clausie_vectors]
with open("processed_data/text_vectors/clausie_vectors_per_game", "wb") as f:
  pickle.dump(new_clausie, f)

In [None]:
# bag-of-words vectors for OpenIE
# one (2, vocab) home/away array per match, built from the OpenIE triples
stanford_openie_vectors = [
    process_match(i, df, tuples_per_game, season_to_team_wordlist)
    for i in range(len(df))
]

# arrays are converted to nested lists before pickling
new_openie = [x.tolist() for x in stanford_openie_vectors]
with open("processed_data/text_vectors/openie_vectors_per_game", "wb") as f:
  pickle.dump(new_openie, f)

In [None]:
# bag-of-words vectors for parse tree clauses
# one (2, vocab) home/away array per match, built from the parse-tree clauses
stanford_tree_vectors = [
    process_match(i, df, clauses_per_game, season_to_team_wordlist)
    for i in range(len(df))
]

# arrays are converted to nested lists before pickling
new_tree = [x.tolist() for x in stanford_tree_vectors]
with open("processed_data/text_vectors/parse_tree_vectors_per_game", "wb") as f:
  pickle.dump(new_tree, f)

In [None]:
# creating dataframe of extracted segment to team allocation
# dataset columns followed by the two per-sentence columns added below
new_header = [*df.columns.values.tolist(), "Sentence", "Allocation"]

In [None]:
# OpenIE

# one output row per OpenIE triple: the match's columns, the normalised
# sentence and the team it was allocated to (None when undecided)
new_rows = []
for i, row in df.iterrows():
  sentence_list = tuples_per_game[i]
  home, away, season = row["HomeTeam"], row["AwayTeam"], row["Season"]
  row_values = list(row)
  for s in sentence_list:
    # unidecode's second positional parameter is `errors`, not an encoding;
    # passing "utf-8" is an invalid value that raises for characters without a
    # transliteration. A single call with the default already yields ASCII.
    new_words = [unidecode(word.lower()) for word in word_tokenize(s)]

    allocation = allocate_sentence_to_team(new_words, home, away, season_to_team_wordlist[season])
    new_rows.append(row_values + [" ".join(new_words), allocation])

pd.DataFrame(data=new_rows, columns=new_header).to_csv("openie_allocations.tsv", sep='\t')

In [None]:
# Parse tree

# one output row per parse-tree clause: the match's columns, the normalised
# sentence and the team it was allocated to (None when undecided)
new_rows = []
for i, row in df.iterrows():
  sentence_list = clauses_per_game[i]
  home, away, season = row["HomeTeam"], row["AwayTeam"], row["Season"]
  row_values = list(row)
  for s in sentence_list:
    # unidecode's second positional parameter is `errors`, not an encoding;
    # passing "utf-8" is an invalid value that raises for characters without a
    # transliteration. A single call with the default already yields ASCII.
    new_words = [unidecode(word.lower()) for word in word_tokenize(s)]

    allocation = allocate_sentence_to_team(new_words, home, away, season_to_team_wordlist[season])
    new_rows.append(row_values + [" ".join(new_words), allocation])

pd.DataFrame(data=new_rows, columns=new_header).to_csv("parse_tree_clause_allocations.tsv", sep='\t')

In [None]:
# ClausIE

# one output row per ClausIE clause: the match's columns, the normalised
# sentence and the team it was allocated to (None when undecided)
new_rows = []
for i, row in df.iterrows():
  sentence_list = spacy_clauses_per_game[i]
  home, away, season = row["HomeTeam"], row["AwayTeam"], row["Season"]
  row_values = list(row)
  for s in sentence_list:
    # unidecode's second positional parameter is `errors`, not an encoding;
    # passing "utf-8" is an invalid value that raises for characters without a
    # transliteration. A single call with the default already yields ASCII.
    new_words = [unidecode(word.lower()) for word in word_tokenize(s)]

    allocation = allocate_sentence_to_team(new_words, home, away, season_to_team_wordlist[season])
    new_rows.append(row_values + [" ".join(new_words), allocation])

pd.DataFrame(data=new_rows, columns=new_header).to_csv("clausie_allocations.tsv", sep='\t')

In [None]:
# dataframe of aggregated parse tree extractions

# two output rows per match: all sentences attributed to the home team and all
# sentences attributed to the away team (undecided sentences go to both)
new_rows = []
for i, row in df.iterrows():
  sentence_list = clauses_per_game[i]
  home, away, season = row["HomeTeam"], row["AwayTeam"], row["Season"]
  home_sentences = []
  away_sentences = []
  for s in sentence_list:
    # unidecode's second positional parameter is `errors`, not an encoding;
    # passing "utf-8" is an invalid value that raises for characters without a
    # transliteration. A single call with the default already yields ASCII.
    new_words = [unidecode(word.lower()) for word in word_tokenize(s)]

    allocation = allocate_sentence_to_team(new_words, home, away, season_to_team_wordlist[season])
    sentence = " ".join(new_words)
    if allocation == home:
      home_sentences.append(sentence)
    if allocation == away:
      away_sentences.append(sentence)
    if allocation is None:
      home_sentences.append(sentence)
      away_sentences.append(sentence)

  row_values = list(row)
  new_rows.append(row_values + [" ".join(home_sentences), home])
  new_rows.append(row_values + [" ".join(away_sentences), away])

pd.DataFrame(data=new_rows, columns=new_header).to_csv("parse_tree_clause_allocations_aggregated.tsv", sep='\t')

In [None]:
# dataframe of aggregated clausie extractions

clausie_allocations = pd.read_csv('processed_data/extractions/clausie_allocations.tsv', sep='\t').drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
clausie_allocations = clausie_allocations.set_index(["Season", "HomeTeam", "AwayTeam"])
games = clausie_allocations.index.unique()

final = []
# Group once instead of calling .loc[game] per match: .loc on an unsorted
# MultiIndex rescans the frame for every game, and it returns a Series (which
# has no iterrows) when the index is unique. sort=False keeps the games in
# order of first appearance, matching index.unique().
for game, rows in clausie_allocations.groupby(level=["Season", "HomeTeam", "AwayTeam"], sort=False):
  season, home, away = game
  home_sentences = []
  away_sentences = []
  for allocation, sentence in zip(rows["Allocation"], rows["Sentence"]):
    if allocation == home:
      home_sentences.append(sentence)
    elif allocation == away:
      away_sentences.append(sentence)
    else:
      # no allocation recorded: the sentence counts for both teams
      home_sentences.append(sentence)
      away_sentences.append(sentence)

  final.append([season, home, away, ' '.join(home_sentences), home])
  final.append([season, home, away, ' '.join(away_sentences), away])

# NOTE(review): this rebinds the notebook-wide `df` (the match dataset) to the
# aggregated frame; consider a dedicated name once other users of `df` are checked.
df = pd.DataFrame(final, columns=["Season", "HomeTeam", "AwayTeam", "Sentence", "Allocation"])
df.to_csv('processed_data/extractions/clausie_allocations_aggregated.tsv', sep='\t')

  rows = clausie_allocations.loc[game]


In [None]:
# dataframe of aggregated openie extractions

openie_allocations = pd.read_csv('processed_data/extractions/openie_allocations.tsv', sep='\t').drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
openie_allocations = openie_allocations.set_index(["Season", "HomeTeam", "AwayTeam"])
games = openie_allocations.index.unique()
final = []
# Group once instead of calling .loc[game] per match: .loc on an unsorted
# MultiIndex rescans the frame for every game, and it returns a Series (which
# has no iterrows) when the index is unique. sort=False keeps the games in
# order of first appearance, matching index.unique().
for game, rows in openie_allocations.groupby(level=["Season", "HomeTeam", "AwayTeam"], sort=False):
  season, home, away = game
  home_sentences = []
  away_sentences = []
  for allocation, sentence in zip(rows["Allocation"], rows["Sentence"]):
    if allocation == home:
      home_sentences.append(sentence)
    elif allocation == away:
      away_sentences.append(sentence)
    else:
      # no allocation recorded: the sentence counts for both teams
      home_sentences.append(sentence)
      away_sentences.append(sentence)

  final.append([season, home, away, ' '.join(home_sentences), home])
  final.append([season, home, away, ' '.join(away_sentences), away])

# NOTE(review): this rebinds the notebook-wide `df` (the match dataset) to the
# aggregated frame; consider a dedicated name once other users of `df` are checked.
df = pd.DataFrame(final, columns=["Season", "HomeTeam", "AwayTeam", "Sentence", "Allocation"])
df.to_csv('processed_data/extractions/openie_allocations_aggregated.tsv', sep='\t')

  rows = openie_allocations.loc[game]
