In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

In [2]:
# setting up URL, request and the scraping for the JSON scripts
url = 'https://understat.com/match/23168'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
scripts = soup.find_all('script')

In [3]:
string_with_json_obj_roster = ''
string_with_json_obj_shots = ''

# Extract relevant script text for rosters and shots
for script in scripts:
    if 'rostersData' in script.text:
        string_with_json_obj_roster = script.text.strip()
    if 'shotsData' in script.text:
        string_with_json_obj_shots = script.text.strip()

# Process and load JSON data for rosters and shots
ind_start = string_with_json_obj_roster.index("('")+2
ind_end = string_with_json_obj_roster.index("')")
json_data = string_with_json_obj_roster[ind_start:ind_end]
rosters_json_data = json_data.encode('utf8').decode('unicode_escape')
ind_start = string_with_json_obj_shots.index("('")+2
ind_end = string_with_json_obj_shots.index("')")
json_data = string_with_json_obj_shots[ind_start:ind_end]
shots_json_data = json_data.encode('utf8').decode('unicode_escape')
rosters_data = json.loads(rosters_json_data)
shots_data = json.loads(shots_json_data)

In [4]:
# function for player data mapping in sentences
def player_data(player_data):
    data = []
    for player_id, player_info in player_data.items():
        player_data = {
            "id": player_info.get("id", None),
            "goals": player_info.get("goals", None),
            "own_goals": player_info.get("own_goals", None),
            "shots": player_info.get("shots", None),
            "xG": player_info.get("xG", None),
            "time": player_info.get("time", None),
            "player_id": player_info.get("player_id", None),
            "team_id": player_info.get("team_id", None),
            "position": player_info.get("position", None),
            "player": player_info.get("player", None),
            "h_a": player_info.get("h_a", None),
            "yellow_card": player_info.get("yellow_card", None),
            "red_card": player_info.get("red_card", None),
            "roster_in": player_info.get("roster_in", None),
            "roster_out": player_info.get("roster_out", None),
            "key_passes": player_info.get("key_passes", None),
            "assists": player_info.get("assists", None),
            "xA": player_info.get("xA", None),
            "xGChain": player_info.get("xGChain", None),
            "xGBuildup": player_info.get("xGBuildup", None),
            "positionOrder": player_info.get("positionOrder", None),
        }
        data.append(player_data)
    return data

home_team_data = player_data(rosters_data["h"])
away_team_data = player_data(rosters_data["a"])

In [5]:
home_team = shots_data['h'][0]['h_team']
away_team = shots_data['a'][0]['a_team']

# function that generates sentences for all player events
def shots_sentences(shots_data):
    sentences = []

    for team in ['h', 'a']:
        for event in shots_data[team]:
            team = event['h_a']
            if team == 'h':
                team_name = home_team
            elif team == 'a':
                team_name = away_team
            sentence = f"{event['player']} in the {event['minute']}th minute: "
            if event['result'] == 'Goal':
                sentence += f"Scored a goal for {team_name}!"
            elif event['result'] == 'MissedShots':
                sentence += f"Missed a shot for {team_name}."
            elif event['result'] == 'BlockedShot':
                sentence += f"Attempted a shot, but it was blocked for {team_name}."
            elif event['result'] == 'SavedShot':
                sentence += f"Shot on target, but saved for {team_name}."
            elif event['result'] == 'ShotOnPost':
                sentence += f"Shot on post, but missed for {team_name}."
        
            sentences.append(sentence)
    home_sentence = f"The home team is {home_team}."
    away_sentence = f"The away team is {away_team}."
    against_sentence = f"{home_team} played against {away_team}"
    date_sentence = f"The game took place on {event['date']}."
    sentences.extend([home_sentence, away_sentence, against_sentence, date_sentence])
    
    return sentences

In [6]:
all_player_sentences = []

# for loop for player sentences
for player in home_team_data + away_team_data:
    team = home_team if player['h_a'] == 'h' else away_team
    sentence_goal = f"{player['player']} has {player['goals']} goals and {player['assists']} assists."
    sentence_own_goal = f"{player['player']} has {player['own_goals']} own goals."
    sentence_shots = f"{player['player']} has {player['shots']} shots."
    sentence_xGoals = f"{player['player']} has {player['xG']} expected goals."
    sentence_minutes = f"{player['player']} has played {player['time']} minutes."
    sentence_position = f"{player['player']} plays as a {player['position']}."
    sentence_y_card = f"{player['player']} has {player['yellow_card']} yellow cards." 
    sentence_r_card = f"{player['player']} has {player['red_card']} red cards."
    sentence_keypasses = f"{player['player']} made {player['key_passes']} keypasses."
    sentence_xAssists = f"{player['player']} has {player['xA']} expected assists."
    sentence_team = f"{player['player']} plays for {team}."
    
    all_player_sentences.extend([sentence_goal, sentence_own_goal, sentence_shots, sentence_xGoals, sentence_minutes, sentence_position, sentence_y_card, sentence_r_card, sentence_keypasses, sentence_xAssists, sentence_team])
    
for sentence in all_player_sentences:
    print(sentence)

Koen Casteels has 0 goals and 0 assists.
Koen Casteels has 0 own goals.
Koen Casteels has 0 shots.
Koen Casteels has 0 expected goals.
Koen Casteels has played 90 minutes.
Koen Casteels plays as a GK.
Koen Casteels has 1 yellow cards.
Koen Casteels has 0 red cards.
Koen Casteels made 0 keypasses.
Koen Casteels has 0 expected assists.
Koen Casteels plays for Wolfsburg.
Cédric Zesiger has 0 goals and 0 assists.
Cédric Zesiger has 0 own goals.
Cédric Zesiger has 1 shots.
Cédric Zesiger has 0.0340309664607048 expected goals.
Cédric Zesiger has played 90 minutes.
Cédric Zesiger plays as a DC.
Cédric Zesiger has 1 yellow cards.
Cédric Zesiger has 0 red cards.
Cédric Zesiger made 0 keypasses.
Cédric Zesiger has 0 expected assists.
Cédric Zesiger plays for Wolfsburg.
Sebastiaan Bornauw has 0 goals and 0 assists.
Sebastiaan Bornauw has 0 own goals.
Sebastiaan Bornauw has 0 shots.
Sebastiaan Bornauw has 0 expected goals.
Sebastiaan Bornauw has played 50 minutes.
Sebastiaan Bornauw plays as a DC.

In [7]:
event_sentences = shots_sentences(shots_data)
for sentence in event_sentences:
    print(sentence)

Lovro Majer in the 5th minute: Attempted a shot, but it was blocked for Wolfsburg.
Jonas Wind in the 5th minute: Attempted a shot, but it was blocked for Wolfsburg.
Mattias Svanberg in the 5th minute: Shot on target, but saved for Wolfsburg.
Jonas Wind in the 8th minute: Scored a goal for Wolfsburg!
Mattias Svanberg in the 43th minute: Missed a shot for Wolfsburg.
Cédric Zesiger in the 48th minute: Missed a shot for Wolfsburg.
Mattias Svanberg in the 52th minute: Attempted a shot, but it was blocked for Wolfsburg.
Václav Cerny in the 61th minute: Missed a shot for Wolfsburg.
Rogerio in the 65th minute: Scored a goal for Wolfsburg!
Kevin Paredes in the 80th minute: Shot on target, but saved for Wolfsburg.
Mohamed Simakan in the 3th minute: Missed a shot for RasenBallsport Leipzig.
Loïs Openda in the 16th minute: Shot on target, but saved for RasenBallsport Leipzig.
Loïs Openda in the 16th minute: Missed a shot for RasenBallsport Leipzig.
Yussuf Poulsen in the 19th minute: Shot on target

In [1]:
import os
import openai
%env OPENAI_API_KEY = Key
# Use of own Openai API Key for ChatGPT usage
OPENAI_API_KEY  = os.environ['OPENAI_API_KEY']

env: OPENAI_API_KEY=Key


In [9]:
# connection all sentences and turning them into a DataFrame
liste = all_player_sentences + event_sentences
df = pd.DataFrame(liste)
print(df)

                                                     0
0             Koen Casteels has 0 goals and 0 assists.
1                       Koen Casteels has 0 own goals.
2                           Koen Casteels has 0 shots.
3                  Koen Casteels has 0 expected goals.
4                 Koen Casteels has played 90 minutes.
..                                                 ...
364  Xavi Simons in the 91th minute: Missed a shot ...
365                        The home team is Wolfsburg.
366           The away team is RasenBallsport Leipzig.
367    Wolfsburg played against RasenBallsport Leipzig
368        The game took place on 2023-11-25 14:30:00.

[369 rows x 1 columns]


In [10]:
import tiktoken
from langchain.text_splitter import TokenTextSplitter
# Split text into chunks of 512 tokens, with 20% token overlap
text_splitter = TokenTextSplitter(chunk_size=512,chunk_overlap=103)

In [11]:
# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    if not string:
        return 0
    # Returns the number of tokens in a text string
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

new_list = []

# Create a new list by splitting up text into token sizes of around 512 tokens
for i in range(len(df.index)):
    text = df[0][i]
    token_len = num_tokens_from_string(text)
    if token_len <= 512:
        new_list.append([df[0][i]])
    else:
        #split text into chunks using text splitter
        split_text = text_splitter.split_text(text)
        for j in range(len(split_text)):
            new_list.append([split_text[j]])
            

In [12]:
df_new = pd.DataFrame(new_list, columns=['content'])
df_new.head()

Unnamed: 0,content
0,Koen Casteels has 0 goals and 0 assists.
1,Koen Casteels has 0 own goals.
2,Koen Casteels has 0 shots.
3,Koen Casteels has 0 expected goals.
4,Koen Casteels has played 90 minutes.


In [13]:
#load documents from Pandas dataframe for insertion into database
from langchain.document_loaders import DataFrameLoader

# page_content_column is the column name in the dataframe to create embeddings for
loader = DataFrameLoader(df_new, page_content_column = 'content')
docs = loader.load()

In [14]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
# Creating a FAISS vector store from documents using OpenAI embeddings
db = FAISS.from_documents(docs, OpenAIEmbeddings())
# Creating a retriever with search parameters for returning the top 3 results
retriever = db.as_retriever(
    search_kwargs={"k": 3}
    )

In [15]:
from langchain.chat_models import ChatOpenAI
# selecting the LLM from OpenAI
llm = ChatOpenAI(temperature = 0.0, model = 'gpt-3.5-turbo-16k')

In [16]:
from langchain.chains import RetrievalQA
# Creating RetrievalQA system with LangChain using llm, "stuff" chain type, retriever, and verbose mode
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever,
    verbose=True,
)

In [17]:
from IPython.display import Markdown, display
# Questions for the query into the RetrievalQA system
querys = ["Who has the highest expected goal value?", "Who is expected to score the most goals?", "What are the informations about Rogerio?", "When was the first action happening in the match?", "When was the first action happening in the game?", "What was the first event happening in the match?", "When was the last action in the game?", "What was the last event of the game?"]

for query in querys:
    response = qa_stuff.run(query)
    display(Markdown(response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


Yussuf Poulsen has the highest expected goal value with 1.311048150062561.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


Yussuf Poulsen is expected to score the most goals with an expected goals value of 1.311048150062561.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


Rogerio plays for Wolfsburg as a DML. He has scored 1 goal and has not provided any assists.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The first action in the match occurred in the 5th minute when Mattias Svanberg took a shot on target for Wolfsburg.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The first action in the game occurred in the 16th minute when Loïs Openda had a shot on target, but it was saved by RasenBallsport Leipzig.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The first event in the match was Loïs Openda's shot on target in the 16th minute, which was saved by RasenBallsport Leipzig.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The last action in the game was Rogerio scoring a goal for Wolfsburg in the 65th minute.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The last event of the game was Loïs Openda's shot being blocked for RasenBallsport Leipzig in the 78th minute.