In [1]:
import os
from typing import List, Dict

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## quick test of how to represent the data: load the raw Arsenal events CSV
arsenal_df = pd.read_csv("arsenal_events.csv")
arsenal_df.shape

(31528, 39)

## test su un match

In [3]:
# Keep only the events of one match so the pipeline can be prototyped on a
# small sample before running it on the whole dataset.
is_first_match = arsenal_df["match_id"] == "MATCH_1"
first_match = arsenal_df.loc[is_first_match]
# Only the last expression of a cell is displayed, so the shape is the single
# value shown here.
first_match.shape

(872, 39)

In [None]:
## obiettivo dell'utilizzo di LLM: 
## 1 ottenere informazioni difficilmente ottenibili dal codice
##  cosa è mancato in questa partita
##  in quali reparti la squadra è carente 
## che caratteristiche deve avere un nuovo attaccante ==> input per un nuovo agente con info storiche su calciatori (es tocchi per partita, distanza percorsa, precisione passaggi)
## con queste caratteristiche in che posizione potrebbe posizionarsi in campionato la squadra? 

## cambiare istouch da true a isTouch, 
## type value potrebbe essere un valore per il tipo di evento
## nuova colonna nome evento + type_value + successful/unsuccessful, es pass_35_successful

## righe simile alla struttura di un KG. 
## es Aubameyang [position] - pass_35_successful at minute 38 from position x,y, to position endx,endy. IsGoal/isShot/isPass/isOwnGoal. Match number

## system prompt:
## you are an expert football match analyst who helps find key insights from match events.
# Your inputs are strings representing match events in the form: player position - event, associated value, event outcome, minute, position on field. Event details. Match number

### pulizia del dataset

In [14]:
from openai import OpenAI

# SECURITY: the API key must never be committed inside the notebook. It is read
# from the environment instead; the key that used to be hard-coded in this cell
# should be considered leaked and revoked.
LLAMA_API_KEY = os.environ.get("LLAMA_API_KEY")
if not LLAMA_API_KEY:
    raise RuntimeError(
        "Set the LLAMA_API_KEY environment variable before running this cell."
    )

# OpenAI-compatible client pointed at the Llama API endpoint.
client = OpenAI(
    api_key=LLAMA_API_KEY,
    base_url="https://api.llama-api.com"
)

def api_llm_function(prompt):
    """Send ``prompt`` to the chat model and return the text of its reply.

    The request is made through the module-level ``client`` and always carries
    the same system message ahead of the user prompt.

    Parameters:
    - prompt: text placed in the user message of the chat request

    Returns the ``content`` of the first choice in the completion.
    """
    system_message = {
        "role": "system",
        "content": "You are a football match analysis expert. Help extracting insights from events of Arsenal matches.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="llama3.3-70b",
        messages=[system_message, user_message],
        timeout=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


## costruzione classe con tutte le funzioni

In [None]:
from faiss import write_index, read_index
import  os
class EventRAG:
    """Retrieval-augmented question answering over football match events.

    Workflow: ``clean_dataset_RAG`` stores a cleaned events frame,
    ``get_embeddings`` encodes one sentence per event, ``create_vector_db``
    builds a FAISS index (also written to the working directory) and
    ``generate_response`` retrieves the events closest to a question and hands
    them to an LLM callable as context.

    Supported similarity metrics are ``"Euclidean"`` (L2 distance) and
    ``"Cosine"`` (inner product over L2-normalised vectors).
    """

    # On-disk FAISS index file used for each supported similarity metric.
    _INDEX_FILES = {
        "Euclidean": "match_index_faiss.idx",
        "Cosine": "Cosine_match_index_faiss.idx",
    }

    # Columns removed by clean_dataset_RAG before the events are described.
    _DROPPED_COLUMNS = [
        "id", "eventId", "minute", "qualifiers", "blockedX", "relatedPlayerId",
        "satisfiedEventsTypes", "playerId", "period_value", "type_value",
        "goalMouthZ", "goalMouthY", "blockedY", "cardType_value",
        "cardType_displayName", "shirtNo", "isGoal_bin", "isOwnGoal_bin",
    ]

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2",similarity_metric="Euclidean"):
        """
        Parameters:
        - embedding_model: name passed to SentenceTransformer
        - similarity_metric: "Euclidean" or "Cosine"
        """
        self.encoder = SentenceTransformer(embedding_model)
        self.df = None
        self.embeddings = None
        self.index = None
        self.similarity_metric = similarity_metric

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _is_set(value) -> bool:
        """Return True only for a boolean True or a numeric 1 (NaN/None/False -> False)."""
        if isinstance(value, (bool, np.bool_)):
            return bool(value)
        if isinstance(value, (int, float, np.integer, np.floating)):
            return bool(value == 1)
        return False

    @classmethod
    def _flag_to_text(cls, flags, text):
        """Map set flags to ``text`` and every other value to NaN.

        Anything that is not a set flag becomes NaN so that the description
        builder never has to concatenate a non-string value.
        """
        return flags.map(lambda value: text if cls._is_set(value) else np.nan)

    def _index_path(self) -> str:
        """Return the index file for the current metric, or raise ValueError."""
        try:
            return self._INDEX_FILES[self.similarity_metric]
        except (KeyError, TypeError):
            raise ValueError(
                f"Unsupported similarity metric {self.similarity_metric!r}; "
                f"expected one of {sorted(self._INDEX_FILES)}"
            ) from None

    def _prepare_vectors(self, vectors):
        """Return ``vectors`` as a contiguous float32 matrix ready for FAISS.

        For the "Cosine" metric the rows are L2-normalised, because an inner
        product index only yields cosine similarity on unit-length vectors.
        The input array is not modified.
        """
        prepared = np.ascontiguousarray(vectors, dtype=np.float32)
        if self.similarity_metric == "Cosine":
            norms = np.linalg.norm(prepared, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # leave all-zero rows untouched
            prepared = np.ascontiguousarray(prepared / norms, dtype=np.float32)
        return prepared

    def _load_or_create_index(self, index_path):
        """Reuse the index stored at ``index_path`` when it fits the dataset, else rebuild.

        Only the number of stored vectors is compared with the number of
        events, so an index built from a different dataset of the same size is
        not detected; pass ``force_rebuid=True`` to the retrieval call then.
        """
        if os.path.exists(index_path):
            print("found existing index: ", index_path)
            candidate = faiss.read_index(index_path)
            if candidate.ntotal == len(self.df):
                self.index = candidate
                print("loaded existing index")
                return
            print("warning: existing index does not match the dataset size, rebuilding it")
        print("creating new index")
        self.create_vector_db()

    # --------------------------------------------------------------- public API
    def clean_dataset_RAG(self,data):
        """Clean a raw events frame and store the result in ``self.df``.

        Drops the unused columns and any "Unnamed" column, rewrites the
        ``isGoal`` / ``isOwnGoal`` flags as short sentences (NaN when not set)
        and removes the "Start" events. Embeddings and index computed for a
        previous dataset are discarded.

        Parameters:
        - data: DataFrame with the raw event columns (not modified)
        """
        # columns to drop
        data = data.drop(self._DROPPED_COLUMNS, axis=1)
        data = data.drop(columns=[col for col in data.columns if "Unnamed" in str(col)])
        data["isGoal"] = self._flag_to_text(data["isGoal"], "is a goal")
        # Derived from the isOwnGoal column itself: the previous code read the
        # already-rewritten isGoal column, so own goals were never reported.
        data["isOwnGoal"] = self._flag_to_text(data["isOwnGoal"], "is own goal")

        data = data.loc[data["type_displayName"] != "Start"]
        self.df = data
        self.embeddings = None
        self.index = None

    def create_event_context(self,row):
        """Build the sentence describing one event.

        Parameters:
        - row: Series (or dict) with the event fields used in the sentence

        An own goal takes precedence over a goal. The flags may be either the
        sentences produced by ``clean_dataset_RAG`` or raw True/1 values.
        """
        to_add = ""
        for column, default_text in (("isOwnGoal", "is own goal"), ("isGoal", "is a goal")):
            value = row.get(column)
            if isinstance(value, str):
                to_add = "This " + value
                break
            if self._is_set(value):
                to_add = "This " + default_text
                break

        string_model = f'in {row["match_id"]} at minute {row["expandedMinute"]}:{row["second"]} the player {row["name"]} in position {row["position"]} made {row["outcomeType_displayName"]} {row["type_displayName"]} from position x = {row["x"]},y = {row["y"]} to position x = {row["endX"]},y={row["endY"]}.{to_add} '
        return string_model

    def get_embeddings(self):
        """Encode every event of ``self.df`` and store the matrix in ``self.embeddings``.

        Raises ValueError when no dataset has been loaded.
        """
        if self.df is None:
            raise ValueError("No dataset loaded: call clean_dataset_RAG() first.")
        event_contexts = [self.create_event_context(row) for _, row in self.df.iterrows()]
        print("obtained textual context")
        self.embeddings = np.asarray(self.encoder.encode(event_contexts), dtype=np.float32)
        print("obtained embeddings")
        # Any index built from older embeddings is no longer valid.
        self.index = None

    def create_vector_db(self):
        """Build the FAISS index for the current metric and write it to disk.

        Embeddings are computed first when they are missing.
        Raises ValueError for an unsupported metric or when there is nothing
        to index.
        """
        index_path = self._index_path()  # also validates the metric
        if self.embeddings is None:
            self.get_embeddings()
        if self.embeddings.ndim != 2 or self.embeddings.shape[0] == 0:
            raise ValueError("Cannot build an index without embeddings.")

        vectors = self._prepare_vectors(self.embeddings)
        dimension = vectors.shape[1]  # Dimension of embeddings
        if self.similarity_metric == "Cosine":
            # Inner product over unit vectors == cosine similarity
            index = faiss.IndexFlatIP(dimension)
        else:
            # L2 distance (Euclidean)
            index = faiss.IndexFlatL2(dimension)
        index.add(vectors)
        self.index = index
        print("created vector_db")
        faiss.write_index(index, index_path)

    def retrieve_similar_events(self,user_query, top_k=5,force_rebuid=False):
        """Return the events whose descriptions are closest to ``user_query``.

        Parameters:
        - user_query: free-text question
        - top_k: maximum number of events to return (capped at the index size)
        - force_rebuid: rebuild the index instead of reusing the one on disk
          (the parameter name is kept as-is for backward compatibility)

        Returns a list of dicts with keys 'event_data' (the event as a dict)
        and 'event_context' (its sentence).
        Raises ValueError when no dataset is loaded or the metric is unsupported.
        """
        if self.df is None:
            raise ValueError("No dataset loaded: call clean_dataset_RAG() first.")
        index_path = self._index_path()

        # check whether a vector db already exists
        if force_rebuid:
            self.create_vector_db()
        else:
            self._load_or_create_index(index_path)

        query_embedding = self._prepare_vectors(
            self.encoder.encode([user_query], convert_to_numpy=True)
        )
        print("embedded query")
        if self.index.d != query_embedding.shape[1]:
            # The stored index was built with a different embedding size.
            print("creating new index")
            self.create_vector_db()

        # Never ask for more neighbours than the index holds: FAISS pads the
        # result with -1 labels, which would silently select the last row.
        k = min(int(top_k), self.index.ntotal)
        if k <= 0:
            return []

        # Search for closest matches in FAISS
        distances, indices = self.index.search(query_embedding, k)
        relevant_events = []
        for idx in indices[0]:
            if idx < 0:
                continue
            row = self.df.iloc[int(idx)]
            relevant_events.append({
                'event_data': row.to_dict(),
                'event_context': self.create_event_context(row)
            })

        return relevant_events

    def generate_response(self,query: str, llm_function,top_k=5, similarity_metric = "Euclidean",force_rebuild = False) -> str:
        """
        Generate a response using the LLM with retrieved context

        Parameters:
        - query: User query
        - llm_function: Function that takes a prompt and returns LLM response
        - top_k: Number of relevant events to retrieve
        - similarity_metric: when truthy it replaces the instance's metric for
          this and later calls; note that the default is "Euclidean", so pass
          None to keep the metric chosen at construction time
        - force_rebuild: rebuild the vector index before retrieving
        """
        # Retrieve relevant events
        if similarity_metric:
            self.similarity_metric = similarity_metric
        relevant_events = self.retrieve_similar_events(query, top_k,force_rebuid=force_rebuild)
        context_str = "".join(event['event_context'] + "\n" for event in relevant_events)
        # Create prompt with context
        prompt = f"""Considering these match events:
                    {context_str}

                    Question: {query}."""

        return llm_function(prompt)

    def add_new_event(self, event_data: Dict):
        """
        Add a new event to the dataset and update embeddings

        The in-memory index is discarded so that the next retrieval rebuilds it
        with the new event included.

        Parameters:
        - event_data: Dictionary containing event information matching CSV columns
        """
        # Add new event to dataframe
        new_row = pd.DataFrame([event_data])
        self.df = pd.concat([self.df, new_row], ignore_index=True)

        # Update embeddings
        new_context = self.create_event_context(pd.Series(event_data))
        new_embedding = np.asarray(self.encoder.encode([new_context]), dtype=np.float32)

        if self.embeddings is None:
            self.embeddings = new_embedding
        else:
            self.embeddings = np.vstack([self.embeddings, new_embedding])

        # The existing index does not contain the new vector.
        self.index = None

In [None]:
import faiss
# Build the RAG helper with its default embedding model and similarity metric.
rag = EventRAG()

# Clean the single-match frame (first_match) and store it on the helper.
rag.clean_dataset_RAG(first_match)

# Compute embeddings for all events
rag.get_embeddings()


obtained textual context
obtained embeddings


In [None]:
# Ask about the key player, using the default top_k and similarity metric.
response = rag.generate_response(
    query="Who was a key player in this match?",
    llm_function=api_llm_function,
)

created vector_db


In [None]:
# Show the answer returned by the LLM for the previous query.
print(response)

Based on the provided match events, it appears that Mohamed Elneny was a key player in this match. He made multiple successful passes throughout the game, showcasing his involvement and contribution to the team's performance.

Here are some insights that support this conclusion:

1. **Frequency of successful passes**: Mohamed Elneny made 5 successful passes in the match, which indicates that he was actively participating in the game and creating opportunities for his team.
2. **Variety of passing locations**: The passes were made from different locations on the field, such as x = 24.3, 46.9, 37.3, 62.2, and 32.3, which suggests that Elneny was moving around the field and adapting to different situations.
3. **Consistency of passing**: The successful passes were made at different times during the match, including in the 62nd, 68th, and 92nd minutes, which indicates that Elneny was consistent in his performance throughout the game.

Overall, based on the data, Mohamed Elneny appears to h

In [None]:
# Ask for a lineup suggestion with the default retrieval settings.
# The question wording matches the later lineup queries ("these performances").
response = rag.generate_response(
    "Based on these performances. What could be a possible lineup for the next match?",
    api_llm_function
)

found existing index:  ['match_index_faiss.idx']
loaded existing index


In [None]:
# Show the answer returned by the LLM for the previous query.
print(response)

Based on the provided match events, we can extract some insights about the players' performances:

1. **Alexandre Lacazette**:
	* He played as a Forward (FW) and was involved in multiple successful passes and clearances.
	* His passing range is quite diverse, with successful passes from different areas of the pitch (e.g., x = 87.8, y = 41.7 to x = 85.3, y = 46.6, and x = 33.6, y = 34.9 to x = 22.8, y = 46.0).
	* He also made a successful clearance from a relatively advanced position (x = 6.6, y = 43.7 to x = 20.2, y = 73.2), indicating his willingness to track back and defend.
2. **Ainsley Maitland-Niles**:
	* He played as a Defensive Midfielder/Left Midfielder (DML) and made a successful clearance from a relatively deep position (x = 18.7, y = 93.6 to x = 19.3, y = 96.0).
	* This suggests that he is comfortable playing in a deeper role and is capable of making defensive contributions.

Considering these insights, a possible lineup for the next match could be:

* **Formation:** 4-2-3-1

In [None]:
# Same lineup question, but with a wider context of 20 retrieved events.
# The truncated word in the question ("thperformances") is repaired.
response = rag.generate_response(
    "Based on these performances. What could be a possible lineup for the next match?",
    api_llm_function,
    top_k=20
)

found existing index:  ['match_index_faiss.idx']
loaded existing index


In [None]:
# Show the answer returned by the LLM for the previous query.
print(response)

Based on the performance of Alexandre Lacazette in MATCH_1, here are some insights that could inform the lineup for the next match:

1. **Lacazette's work rate**: Lacazette was involved in 20 events throughout the match, indicating a high level of activity and work rate. This suggests that he could be a key player in the next match.
2. **Passing ability**: Lacazette completed 12 successful passes out of 17 attempts, which is a passing accuracy of 70.6%. This is a respectable passing accuracy, especially considering that he played as a forward.
3. **Clearance ability**: Lacazette made 1 successful clearance, which suggests that he is capable of defending and winning the ball back for his team.
4. **Positioning**: Lacazette's events were spread across the pitch, but he tended to operate in the attacking third, with many of his passes and clearances originating from the right and left wings.

Considering these insights, here is a possible lineup for the next match:

* **Formation**: 4-2-3

In [None]:
# Ask what was lacking in the match, using 50 retrieved events as context.
# The typo in the question ("coul") is repaired.
response = rag.generate_response(
    "What has been lacking in this match? In which areas the team could be improved?",
    api_llm_function,
    top_k=50
)

found existing index:  ['match_index_faiss.idx']
loaded existing index


In [None]:
# Show the answer returned by the LLM for the previous query.
print(response)

Based on the provided match events, here are some insights and potential areas for improvement:

1. **Finishing and Goal Scoring**: Although Alexandre Lacazette scored a goal, there were several instances of unsuccessful shots (e.g., minute 79:34), missed opportunities, and unsuccessful passes in the final third. Improving finishing and goal-scoring abilities could be an area of focus.
2. **Passing Accuracy**: There were several instances of unsuccessful passes, especially in the attacking third (e.g., minutes 50:0, 54:35, 84:22). Working on improving passing accuracy, particularly in critical areas of the pitch, could help create more scoring opportunities.
3. **Aerial Duels**: The team struggled with aerial duels, as evidenced by unsuccessful aerials (e.g., minutes 53:56, 90:48). Improving aerial ability and winning headers could be an area for improvement.
4. **Defensive Solidity**: Although the team had some successful clearances and tackles, there were instances of unsuccessful ch

In [None]:
# Lineup question again, this time retrieving 50 events with the Cosine metric.
response = rag.generate_response(
    query="Based on these performances. What could be a possible lineup for the next match?",
    llm_function=api_llm_function,
    top_k=50,
    similarity_metric="Cosine",
)

creating new index
created vector_db
embedded query


In [None]:
# Show the answer returned by the LLM for the previous query.
print(response)

Based on the provided match events, here's a possible lineup for the next match:

**Formation:** 4-2-3-1

**Starting XI:**

1. **Goalkeeper:** Not specified in the provided events, but assuming the goalkeeper remains the same.
2. **Defenders:**
	* **Left Back:** Ainsley Maitland-Niles (DML) - Had a successful pass and clearance, showing his defensive capabilities.
	* **Center Backs:**
		+ Gabriel Magalhães (DC) - Made multiple successful clearances, demonstrating his defensive skills.
		+ (Assuming another center back, not specified in the events)
	* **Right Back:** Not specified in the provided events, but assuming a suitable player fills this position.
3. **Midfielders:**
	* **Defensive Midfielders:**
		+ Mohamed Elneny (MC) - Had an impressive game with multiple successful passes, showing his ability to control the tempo of the game.
		+ (Assuming another defensive midfielder, not specified in the events)
	* **Attacking Midfielders:**
		+ (Assuming suitable players fill these positi

In [None]:
# "What was lacking" question with 200 retrieved events and the Cosine metric.
response = rag.generate_response(
    query="What has been lacking in this match? In which areas the team could be improved?",
    llm_function=api_llm_function,
    top_k=200,
    similarity_metric="Cosine",
)

found existing index:  ['Cosine_match_index_faiss.idx']
loaded existing index
embedded query


In [None]:
# Show the answer returned by the LLM for the previous query.
print(response)

Based on the provided match events, here are some insights and potential areas for improvement:

1. **Finishing and Goal Scoring**: Although there are several successful passes, tackles, and ball recoveries, the team only scored two goals (one from Alexandre Lacazette and one from Gabriel Magalhães). This suggests that the team might be struggling with converting chances into goals. Improving finishing and goal-scoring abilities could be an area of focus.
2. **Passing Accuracy**: There are several instances of unsuccessful passes, which could indicate a lack of precision and accuracy in the team's passing game. Working on improving passing accuracy, especially in the final third, could help create more scoring opportunities.
3. **Defensive Vulnerabilities**: Although the team has made several successful clearances and tackles, there are also instances of unsuccessful tackles and passes that could have led to opposition counter-attacks. Strengthening the defense and reducing vulnerabili