### Load from vectorDb (Chroma)

In [1]:
from langchain_community.vectorstores import Chroma
from openai import OpenAI
from langchain_core.embeddings import Embeddings
import os, chromadb
from dotenv import load_dotenv

load_dotenv()  
alikey = os.getenv("DASHSCOPE_API_KEY")

# === Ali embedding ===
class TextV4Embeddings(Embeddings):
    def __init__(self, api_key, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"):
        self.client = OpenAI(api_key=alikey, base_url=base_url)
        self.model = "text-embedding-v4"
    def embed_documents(self, texts): 
        resp = self.client.embeddings.create(model=self.model, input=texts, dimensions=1024)
        return [d.embedding for d in resp.data]
    def embed_query(self, text):
        return self.embed_documents([text])[0]

# === initialize embedding model ===
emb = TextV4Embeddings(api_key=alikey)

CHROMA_DIR = "chroma_football_events"

# === list all collections ===
client = chromadb.PersistentClient(path=CHROMA_DIR)
collections = client.list_collections()

if not collections:
    raise RuntimeError(f"No collection in database，check if the path is right: {CHROMA_DIR}")

print(f"find {len(collections)} 个 collections：")
for c in collections:
    print(" -", c.name)

dbs = {}
for c in collections:
    name = c.name
    dbs[name] = Chroma(
        collection_name=name,
        persist_directory=CHROMA_DIR,
        embedding_function=emb,
    )
    print(f" load success: {name} | doc count: {dbs[name]._collection.count()}")

vectordb = dbs.get("football_events") or list(dbs.values())[0]
print(f"\n default collection: {vectordb._collection.name} | doc count: {vectordb._collection.count()}")


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
  dbs[name] = Chroma(


find 2 个 collections：
 - football_events
 - langchain


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


 load success: football_events | doc count: 1009801
 load success: langchain | doc count: 0

 default collection: football_events | doc count: 1009801


### Rank bm25

In [None]:
!pip install rank_bm25
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document
import re
import pandas as pd

# Load data from chromaDB
data = vectordb._collection.get(include=["documents", "metadatas"])

# Turn into DataFrame
pdf = pd.DataFrame({
    "text": data["documents"],
    **{k: [m.get(k, None) for m in data["metadatas"]] for k in data["metadatas"][0].keys()}
})

print(f"Loaded {len(pdf)} records from collection '{vectordb._collection.name}' for BM25 index.")


def tokenize(t): return re.findall(r"[A-Za-z0-9_]+", str(t).lower())

corpus_tokens = [tokenize(t) for t in pdf["text"].tolist()]
bm25 = BM25Okapi(corpus_tokens)
id_list = pdf["id_odsp"].tolist()  

def bm25_retrieve(query, k=50):
    scores = bm25.get_scores(tokenize(query))
    top_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    docs = []
    for i in top_idx:
        row = pdf.iloc[i]
        docs.append((
            Document(page_content=row["text"], metadata=row.to_dict()),
            float(scores[i])
        ))
    return docs

def rrf_fuse(results_a, results_b, k_base=60, top_n=12):
    ranks = {}
    for lst in [results_a, results_b]:
        for rank, (doc, _) in enumerate(lst, 1):
            key = doc.metadata.get("id_odsp", id(doc))
            ranks.setdefault(key, {"doc": doc, "rrf": 0.0})
            ranks[key]["rrf"] += 1.0 / (k_base + rank)
    fused = sorted(ranks.values(), key=lambda x: x["rrf"], reverse=True)[:top_n]
    return [(x["doc"], x["rrf"]) for x in fused]

def hybrid_retrieve(question, k_vec=12, k_bm25=50, top_n=12):
    vec = vectordb.similarity_search_with_score(question, k=k_vec)     
    bm  = bm25_retrieve(question, k=k_bm25)                           
    return rrf_fuse(vec, bm, k_base=60, top_n=top_n)



Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


Loaded 1009801 records from collection 'football_events' for BM25 index.


### Q&A

In [3]:
import os, json, time
from typing import Dict, List, Tuple
from langchain_openai import ChatOpenAI
from langchain.schema import BaseMessage

load_dotenv()  
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
DEEPSEEK_BASE_URL = "https://api.deepseek.com"  

llm = ChatOpenAI(
    model="deepseek-chat",
    temperature=0,
    api_key=DEEPSEEK_API_KEY,
    base_url=DEEPSEEK_BASE_URL,
)
print("Using DeepSeek model via OpenAI-compatible API")

Using DeepSeek model via OpenAI-compatible API


In [4]:
def answer_with_llm(
    question: str,
    k: int = 12,
) -> tuple[str, list[tuple[str, dict]]]:
    
    if 'vectordb' not in globals():
        raise RuntimeError("vectordb undefined! Please load from Chroma first!")
    if 'llm' not in globals():
        raise RuntimeError("llm undefined! Please initialize your LLM client first!")

    # results = vectordb.similarity_search_with_score(question, k=k)
    results = hybrid_retrieve(question, k_vec=k, k_bm25=max(30, 3*k), top_n=k)
    if not results:
        return "No matching context found for the given filters.", []

    context = "\n\n".join([d.page_content for d, _ in results])

    prompt = f"""You are a football data analyst. 

Instruction:
- Write a very detailed summary, explain with data.
- The answer should be well structured.
- If information is missing, state it briefly at the end.

Question:
{question}

Context:
{context}
"""

    resp = llm.invoke(prompt)
    answer = getattr(resp, "content", resp)

    seen, src = set(), []
    for d, _ in results:
        oid = d.metadata.get("id_odsp", "")
        if oid and oid not in seen:
            seen.add(oid)
            src.append((oid, dict(d.metadata)))
        if len(src) >= 5:
            break

    return answer, src


In [7]:
ans, sources = answer_with_llm(
    "Analyze the playing style and on-pitch role of Cheikhou Kouyate for West Ham during the 2016 season.",
    k=20
)

print("\n=== Answer ===\n")
print(ans)

print("\n=== Sources (Up to 5) ===")
for sid, meta in sources:
    print(f"- id_odsp={sid} | team={meta.get('event_team')} | "
          f"season={meta.get('season')} | outcome={meta.get('shot_outcome')} | "
          f"loc={meta.get('location')}")



=== Answer ===

Of course. Here is a detailed analysis of Cheikhou Kouyaté's playing style and on-pitch role for West Ham during the 2016 season, based on the provided data.

### **Analysis of Cheikhou Kouyaté's Role & Playing Style (West Ham, 2016 Season)**

Based on the dataset provided, which details a series of on-pitch events involving Cheikhou Kouyaté, we can draw several key conclusions about his function within the team. The data paints a picture of a specific and highly valuable player profile.

**Executive Summary:**
Cheikhou Kouyaté operated as a **dynamic, box-to-box midfielder with a significant offensive threat**, particularly from set-pieces. His role was characterized by powerful runs into the opposition penalty area, resulting in a high volume of on-target shots, while also fulfilling defensive duties that led to a notable number of fouls committed.

---

### **1. Primary On-Pitch Role: Offensive Threat from Midfield**

The most striking aspect of the data is Kouyaté'


=== Answer ===

Of course. Here is a detailed analysis of Cheikhou Kouyaté's playing style and on-pitch role for West Ham during the 2016 season, based on the provided data.

### **Analysis of Cheikhou Kouyaté's Role & Playing Style (West Ham, 2016 Season)**

Based on the dataset provided, which details a series of on-pitch events involving Cheikhou Kouyaté, we can draw several key conclusions about his function within the team. The data paints a picture of a specific and highly valuable player profile.

**Executive Summary:**
Cheikhou Kouyaté operated as a **dynamic, box-to-box midfielder with a significant offensive threat**, particularly from set-pieces. His role was characterized by powerful runs into the opposition penalty area, resulting in a high volume of on-target shots, while also fulfilling defensive duties that led to a notable number of fouls committed.

---

### **1. Primary On-Pitch Role: Offensive Threat from Midfield**

The most striking aspect of the data is Kouyaté's clear and consistent offensive contribution.

*   **Goal-Scoring Threat:** The dataset contains **12 shot attempts** from the 2016 season that were recorded as "On target." This indicates a player who was not only getting into scoring positions but was also striking the ball with accuracy and purpose.
*   **Positioning in the Box:** The location of these shots is highly revealing:
    *   **8 shots** originated from the **"Centre of the box."**
    *   **1 shot** came from the **"Right side of the six-yard box."**
    *   **3 shots** were taken from **"Outside the box."**

**Interpretation:** This shot map is typical of a midfielder who makes late, driving runs into the penalty area. His presence in central, high-value scoring positions suggests he was a primary target for crosses, cut-backs, and flick-ons, likely from set-pieces or open-play deliveries from the wings. This was a coached tactical ploy to utilize his physicality and aerial ability.

### **2. Playing Style Characteristics**

The data allows us to infer several key stylistic traits:

*   **Aerial Dominance & Physicality:** The concentration of shots from the central area, including the six-yard box, strongly points to a player who relied on his strength and leaping ability. He was likely a significant threat from corners and free-kicks, acting as a secondary striker in aerial duels.
*   **Timing and Off-the-Ball Movement:** Consistently arriving in the "centre of the box" to get shots on target requires intelligent timing and off-the-ball movement. This suggests a player with a high level of tactical understanding, knowing when to break from midfield to join the attack.
*   **Aggressive Defensive Contribution:** The dataset records **8 fouls committed** by Kouyaté in the 2016 season. While this data point alone doesn't define his defensive game, it points to an **aggressive, combative playing style**. He was actively involved in duels, tackles, and defensive transitions, often enough to infringe on the rules. This complements the profile of a box-to-box midfielder who covers a lot of ground.

### **3. Data-Driven Role Synthesis: The "Box-to-Box Goal Threat"**

Combining these data points, Kouyaté's role can be synthesized as follows:

He was not a traditional holding midfielder nor a creative playmaker. Instead, he was a **powerful, athletic conduit between defense and attack**. His responsibilities likely included:
1.  **Defensive Phase:** Breaking up opposition play through tackles and interceptions (evidenced by the foul count).
2.  **Transition Phase:** Using his power and stamina to drive the team forward from defense to attack.
3.  **Offensive Phase:** Making penetrative runs into the opposition's penalty area to convert chances, effectively acting as an auxiliary forward during attacking phases.

### **Limitations and Missing Information**

While the provided data is insightful, it offers a incomplete picture. A full analysis would require the following missing information:

*   **Defensive Metrics:** The number of fouls is a minor indicator. Crucial data like **tackles won, interceptions, clearances, and blocks** are absent and would fully quantify his defensive contribution.
*   **Passing and Possession Data:** Information on **pass completion rate, key passes, assists, and dribbles** is missing. This would reveal his involvement in build-up play and creativity.
*   **Aerial Duel Statistics:** Given his apparent role in the box, data on **aerial duels won** would powerfully support the analysis of his physical dominance.
*   **Overall Minutes and Match Context:** Without knowing his total minutes played, the volume of shots and fouls cannot be contextualized into a per-90-minute rate, which is standard for comparison. The specific matches and game states (winning/losing) are also unknown.
*   **Goals Scored:** The data shows "Attempts - On target" but does not specify which of these resulted in **goals**. This is the most critical missing offensive metric.

### **Conclusion**

In the 2016 season, Cheikhou Kouyaté was a vital and unique component of the West Ham midfield. The data unequivocally shows he was deployed as a major offensive weapon from a midfield position, specializing in scoring opportunities from central locations inside the box. Coupled with an aggressive defensive style that resulted in frequent fouls, his profile is that of a complete, modern box-to-box midfielder whose primary value-added was his goal threat from open play and set-pieces, rather than creative passing or purely destructive defending.

=== Sources (Up to 5) ===
- id_odsp=K4mR7IQ3/ | team=West Ham | season=2016 | outcome=On target | loc=Centre of the box
- id_odsp=pYcbNWZa/ | team=West Ham | season=2016 | outcome=NA | loc=NA
- id_odsp=rev0elys/ | team=West Ham | season=2016 | outcome=On target | loc=Centre of the box
- id_odsp=UH1pZlj9/ | team=West Ham | season=2016 | outcome=NA | loc=NA
- id_odsp=M3deiCbD/ | team=West Ham | season=2016 | outcome=On target | loc=Centre of the box


In [9]:
ans, sources = answer_with_llm(
    "Compare the disciplinary records and defensive aggression of Aston Villa and West Ham in the 2016 season, identifying key players involved in these actions.",
    k=20
)

print("\n=== Answer ===\n")
print(ans)


=== Answer ===

Of course. Here is a detailed summary and analysis based on the provided data.

### **Comparison of Disciplinary Records and Defensive Aggression: Aston Villa vs. West Ham (2016 Season)**

Based on the dataset provided, a direct comparison between Aston Villa and West Ham is significantly limited due to a severe lack of data for West Ham. The dataset contains numerous entries for Aston Villa but only a single, non-defensive data point for West Ham.

Therefore, this analysis will focus primarily on a detailed breakdown of Aston Villa's record, with the clear caveat that West Ham's data is insufficient for a meaningful comparison.

---

### **1. Aston Villa: Disciplinary Record and Key Players**

The data paints a clear picture of a team with significant disciplinary issues, characterized by a high volume of yellow cards and at least one red card.

**A. Quantitative Summary:**

*   **Total Yellow Cards:** **8**
*   **Total Red Cards:** **1**
*   **Key Players Involved:**


=== Answer ===

Of course. As a football data analyst, I will provide a detailed comparison of the disciplinary records and defensive aggression of Aston Villa and West Ham for the 2016 season based on the provided dataset.

### **Executive Summary**

Based on the available data for the 2016 season, **Aston Villa demonstrated a significantly higher level of disciplinary issues and defensive aggression compared to West Ham.** Aston Villa's record is characterized by a high volume of yellow cards, multiple red cards, and a key player repeatedly committing fouls. In contrast, West Ham's data shows a remarkably clean disciplinary record but offers very limited insight into their defensive actions, making a full comparison of defensive aggression difficult.

---

### **1. Detailed Disciplinary Record Analysis**

The disciplinary record is measured by the number of yellow and red cards received.

#### **Aston Villa**

Aston Villa's disciplinary record in the 2016 season, as per this dataset, is poor.

*   **Total Cards:** 20
*   **Yellow Cards:** 18
*   **Red Cards:** 2

**Key Players Involved:**
*   **Ashley Westwood:** The most disciplined player in the dataset with **5 yellow cards**. This indicates a persistent issue with committing bookable offenses throughout the season.
*   **Alan Hutton:** A significant contributor to the disciplinary count with **3 yellow cards**, highlighting an aggressive or often-overmatched full-back.
*   **Jordan Ayew:** Received **2 yellow cards** and **1 red card**. The red card is a critical data point, indicating a moment of serious ill-discipline that directly harmed the team.
*   **Aly Cissokho:** Received **1 yellow card** and **1 red card**, another player whose aggression crossed into a sending-off offense.
*   **Other players** with a single yellow card include Gabriel Agbonlahor, Ciaran Clark (2), Micah Richards, Leandro Bacuna, Jordan Veretout, Jack Grealish, and Brad Guzan (2). This shows the disciplinary problems were widespread across the team.

#### **West Ham**

West Ham's disciplinary record, as captured in this dataset, is exceptionally clean.

*   **Total Cards:** 2
*   **Yellow Cards:** 2
*   **Red Cards:** 0

**Key Players Involved:**
*   **Angelo Ogbonna:** The only West Ham player shown a card in this dataset, receiving **2 yellow cards**.

**Comparison Insight:** Aston Villa's card count (20) is **ten times higher** than West Ham's (2) in this dataset. Villa also suffered from two red cards, which have a more severe impact on a team's chances in a match than yellow cards.

---

### **2. Analysis of Defensive Aggression**

Defensive aggression can be inferred from the number of fouls committed, as fouls are often a direct result of aggressive defensive actions. However, it's important to note that not all aggression leads to a foul, and not all fouls are recorded in this specific dataset.

#### **Aston Villa**

The data provides a clear indicator of one player's aggressive style.
*   **Total Fouls Committed:** 5
*   **Key Player Involved:**
    *   **Gabriel Agbonlahor:** He is the only player explicitly recorded for committing fouls, with a total of **5**. This suggests he was a high-intensity, aggressive forward who frequently engaged in physical duels, often illegally. The fact that he has no yellow cards for these specific fouls implies they were not deemed serious enough for a booking, but the frequency is notable.

#### **West Ham**

*   **Total Fouls Committed:** 0
*   **Key Players Involved:** None identified from the foul data.

**Comparison Insight:** The data on fouls is extremely limited. While it shows Aston Villa's Agbonlahor as an aggressive player, the complete absence of foul data for West Ham makes it impossible to draw a meaningful conclusion about their overall defensive aggression. West Ham may have been a more disciplined team that committed fewer fouls, or this dataset may simply not contain their foul events.

---

### **3. Identification of Key Players**

**Aston Villa:**
*   **Most Disciplined Player:** **Ashley Westwood** (5 yellow cards). His role in central midfield likely involved numerous tactical fouls.
*   **Player with Most Serious Discipline:** **Jordan Ayew** and **Aly Cissokho** (1 red card each). Their actions led to the most severe disciplinary consequences.
*   **Most Aggressive Player (by fouls):** **Gabriel Agbonlahor** (5 fouls). He was the primary instigator of aggressive actions that resulted in fouls.

**West Ham:**
*   **Only Disciplined Player:** **Angelo Ogbonna** (2 yellow cards). As a central defender, this is not an unusually high number and suggests a relatively standard level of aggression for his position.
*   **Key Aggressive Players:** Cannot be identified from the available data.

---

### **4. Limitations and Missing Information**

To provide a complete and accurate analysis, the following information is crucially missing from the provided dataset:

1.  **Total Number of Games:** Without knowing how many matches this data spans, we cannot calculate rates (e.g., fouls per game, cards per game), which is essential for a fair comparison.
2.  **Comprehensive Foul Data:** The dataset shows only 5 fouls for Aston Villa and 0 for West Ham. This is statistically improbable for an entire season and indicates the dataset is a sample, not a complete record. A full foul count is needed to properly assess defensive aggression.
3.  **Team Context:** The 2016 season was a disastrous one for Aston Villa, who were relegated. High disciplinary counts are often correlated with struggling teams that are frequently out of possession and forced into desperate defensive actions. This context is important but not quantified here.
4.  **Tackling Data:** A complete analysis of defensive aggression would include successful and unsuccessful tackles, interceptions, and challenges, which are absent.
5.  **West Ham Defensive Actions:** The dataset is particularly sparse for West Ham's defensive metrics, making it difficult to profile their defensive style beyond their clean disciplinary record.

### **Conclusion**

Based on the disciplinary data provided, **Aston Villa were a far more ill-disciplined team than West Ham in the 2016 season.** Key players like Ashley Westwood, Alan Hutton, and Jordan Ayew were frequently booked, with Ayew and Aly Cissokho also receiving red cards. The aggression of forward Gabriel Agbonlahor, as seen in his foul count, is a notable characteristic of Villa's play.

Conversely, **West Ham appear to have been a much more disciplined side**, with only Angelo Ogbonna registering a minimal number of yellow cards. However, the severe lack of data on West Ham's fouls and other defensive actions prevents a full assessment of their defensive aggression. The available data suggests a less aggressive, more positionally sound defensive unit compared to a frantic and often reckless Aston Villa.
