In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-4.0.1.tar.gz (434.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.2/434.2 MB[0m [31m15.3 MB/s[0m  [33m0:00:27[0m:00:01[0m00:01[0m
  Installing build dependencies ... [?25done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting py4j==0.10.9.9 (from pyspark)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Building wheels for collected packages: pyspark
done
[?25h  Created wheel for pyspark: filename=pyspark-4.0.1-py2.py3-none-any.whl size=434813860 sha256=94933dfd31595098d4c1cf91235cf0faefc4651c0d5586d9266e0c05f8098f56
  Stored in directory: /Users/zhangman/Library/Caches/pip/wheels/00/e3/92/8594f4cee2c9fd4ad82fe85e4bf2559ab8ea84ef19b1dd3d15
Successfully built pyspark
Installing collected packages: py4j, pyspark
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Before running the cells below, install pyspark first.

In [None]:
# Use Spark to clean/transform/join large datasets, then embed in parallel
# (careful with model size).
#
# This cell produces two driver-side lists consumed by the RAG cell:
#   texts - one self-contained document string per event row
#   metas - one metadata dict per event row (same order as `texts`)
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, concat_ws, lit, when
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Upper bound on the number of rows collected to the driver for embedding.
MAX_DOCS = 50000

spark = SparkSession.builder.appName("test").getOrCreate()
spark.range(5).show()  # smoke test: confirms the session can run a job

# --- 1) Load the raw event files ---------------------------------------------
# Production data is read from Hadoop, but a local CSV can be used to test first.
# The result is bound to its own name: rebinding `spark` here would replace the
# SparkSession with a DataFrame and break every later `spark.read` call.
events_hdfs = spark.read.csv("hdfs://namenode:8020/path/to/file.csv")

csv_opts = {
    "header": True,
    "inferSchema": True,
    "mode": "DROPMALFORMED",
    "nullValue": "NA",  # treat "NA" as null
}

df1 = spark.read.options(**csv_opts).csv("/path/to/file1.csv")
df2 = spark.read.options(**csv_opts).csv("/path/to/file2.csv")
df3 = spark.read.options(**csv_opts).csv("/path/to/file3.csv")

# Stack the three files; unionByName matches columns by name, not position.
events = df1.unionByName(df2).unionByName(df3)

# NOTE(review): an earlier version of this cell joined `df_events` with
# `df_matches` and `df_players`, none of which are defined in this notebook
# (NameError), and the joined result was never used. Once match/player frames
# are loaded, enrich `events` with left joins on the real keys, e.g. on
# "id_event" and "player" (adjust keys to your real ones).

# --- 2) Load the code dictionaries (TXT) and map codes -> labels -------------
# Example dictionary content:
#   0   Announcement
#   1   Attempt
#   2   Corner
#   ...
dict_schema = StructType([
    StructField("code", IntegerType(), True),
    StructField("label", StringType(), True),
])

# Read as tab-separated; if the file is space-separated, use sep=" "
# (or read it as text and split).
event_type_dict = (
    spark.read.csv("/path/to/event_type.txt", sep="\t", schema=dict_schema)
    .withColumnRenamed("label", "event_type_name")
)

# Attach the human-readable label to every event. A left join keeps events
# whose code is missing from the dictionary (their label is NULL).
# Assumes `events.event_type` holds the integer code - TODO confirm.
events_labeled = events.join(
    event_type_dict,
    events["event_type"] == event_type_dict["code"],
    "left",
).drop(event_type_dict["code"])

# --- 3) Select columns and normalise types -----------------------------------
events_clean = (
    events_labeled.select(
        "id_event", "time", "text", "event_type", "event_type_name",
        "event_team", "opponent", "player", "player2",
        "shot_place", "shot_outcome", "is_goal", "location",
        "bodypart", "assist_method", "situation", "fast_break",
    )
    # is_goal becomes a strict boolean: 1 -> True, anything else (incl. NULL) -> False.
    .withColumn("is_goal", when(col("is_goal") == 1, lit(True)).otherwise(lit(False)))
)


# --- 4) Build a readable document string + metadata for RAG ------------------
# RAG works best if each row is turned into a self-contained chunk, so the
# `doc` field mixes the free text with the important structured fields.
def _tag(prefix, column_name):
    """Return a Column holding `prefix + value + "]"`, or NULL when the value is NULL.

    Leaving the tag NULL (instead of an empty string) lets the outer
    `concat_ws` skip it, so missing fields do not leave stray spaces in `doc`.
    """
    value = col(column_name)
    return when(
        value.isNotNull(),
        concat_ws("", lit(prefix), value.cast("string"), lit("]")),
    )


doc = concat_ws(
    " ",
    _tag("[Team: ", "event_team"),
    _tag("[Opp: ", "opponent"),
    _tag("[Type: ", "event_type_name"),
    when(col("is_goal"), lit("[Goal]")),
    _tag("[T=", "time"),
    col("text"),
)

# TODO: draw graphs here to showcase the dataset, and show some generated text.
events_docs = events_clean.withColumn("doc", doc)

# Pull a bounded sample to the driver; embedding happens outside Spark.
rows = (
    events_docs
    .select("id_event", "doc", "event_team", "opponent", "event_type_name")
    .limit(MAX_DOCS)
    .collect()
)
texts = [r["doc"] for r in rows]
metas = [{"id_event": r["id_event"],
          "event_team": r["event_team"],
          "opponent": r["opponent"],
          "event_type": r["event_type_name"]} for r in rows]

In [None]:
pip install langchain langchain-community chromadb sentence-transformers tiktoken
# Optional: OpenAI client if you’ll call an LLM API
pip install openai

In [1]:
#RAG
"""
 RAG (common pattern)

Pipeline overview:
	1.	Ingest: Load raw data (PDFs, MD, CSVs, Parquet, DBs).
	2.	Split: Chunk text (e.g., 500–1000 tokens with overlap).
	3.	Embed: Turn chunks into vectors.
	4.	Index: Store in a vector DB (Chroma/FAISS/Pinecone/Weaviate/Qdrant/Milvus).
	5.	Retrieve: At query time, embed the question → nearest-neighbor search.
	6.	Generate: Feed top-k chunks + question into an LLM with a grounding prompt.
	7.	Evaluate/iterate: Adjust chunk size, rerankers, prompts; add metadata filters.

Popular libraries:
	•	LangChain (chains, retrievers, integrations)
"""
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split into chunks (good for longer doc fields)
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
texts_split, metas_split = [], []
for t, m in zip(texts, metas):
    for chunk in splitter.split_text(t):
        texts_split.append(chunk)
        metas_split.append(m)

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectordb = Chroma.from_texts(
    texts=texts_split,
    embedding=emb,
    metadatas=metas_split,
    collection_name="sports-events"
)

retriever = vectordb.as_retriever(search_kwargs={"k": 5})

q = "Show me attempts by Hamburg that were missed."
docs = retriever.get_relevant_documents(q)
for d in docs[:3]:
    print(d.page_content, d.metadata)

#Using LangChain with RAG and <30B LLMs
# install runtime if needed
!pip install ollama
# run a local model (separate terminal)
ollama run mistral
# Example with a local model via Ollama
from langchain_community.chat_models import ChatOllama
from langchain.chains import RetrievalQA

llm = ChatOllama(model="mistral")  # or "llama3" etc.
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")

print(qa.run("List events where Hamburg had a left-footed shot that missed."))

_IncompleteInputError: incomplete input (199116483.py, line 2)