In [30]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from collections import Counter
from langchain.vectorstores import FAISS
from tqdm import tqdm
import time

In [31]:
import os

from langchain.embeddings import OpenAIEmbeddings

# SECURITY: never hardcode API keys in a notebook. The key is read from the
# environment instead. Any key that was previously committed in this cell
# must be treated as compromised and revoked in the OpenAI dashboard.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this notebook."
    )

# Embedding client shared by the batch-embedding and FAISS cells below.
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [32]:
# Load the player texts (make sure this is the final, cleaned CSV).
df = pd.read_csv("cleaned_player_texts.csv")

# Character-based splitter configuration. Both sizes are passed to
# RecursiveCharacterTextSplitter as-is.
# NOTE(review): the token count per chunk depends on the tokenizer; with the
# common ~4 characters/token rule of thumb for English, 2000 characters is
# closer to ~500 tokens than 250 — confirm with a tokenizer if it matters.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 250  # overlap keeps continuity across neighbouring chunks

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)


In [33]:
# Build one LangChain Document per text chunk, tagged with the player's name.
documents = []
skipped_players = []  # players whose text could not be chunked

# Iterating the two columns directly avoids the per-row Series that
# DataFrame.iterrows() builds.
for player_name, full_text in zip(df["player"], df["combined_text"]):
    # pandas loads empty CSV cells as NaN (a float), which the splitter
    # cannot process; skip such rows instead of crashing mid-loop.
    if not isinstance(full_text, str):
        skipped_players.append(player_name)
        continue

    # Split the full text into overlapping chunks and wrap each one.
    for chunk in splitter.split_text(full_text):
        documents.append(
            Document(page_content=chunk, metadata={"player": player_name})
        )

# Summary of the chunking run.
print(f"✅ Chunking complete. Total chunks: {len(documents)}")
if skipped_players:
    print(f"Skipped {len(skipped_players)} row(s) with missing text: {skipped_players[:10]}")

# Only show an example when there is one; documents[0] would raise
# IndexError on an empty result.
if documents:
    print("Example chunk:")
    print(documents[0].page_content)
    print("Metadata:", documents[0].metadata)

✅ Chunking complete. Total chunks: 18553
Example chunk:
Alaa Abdelnaby (Arabic: علاء عبد النبي; born June 24, 1968) is an Egyptian-American former professional basketball player. He played college basketball for the Duke Blue Devils followed by a five-year National Basketball Association (NBA) career, and then stints in various other leagues. Abdelnaby is one of two Egyptian-born players in the history of the NBA, along with Abdel Nader. Abdelnaby works as a basketball broadcaster and analyst for NBCS Philadelphia, CBS Sports Network, and Westwood One Radio. Abdelnaby was born in Alexandria, Egypt, and moved to the United States with his family in 1971 at the age of two. His father was an engineer and his mother was a computer analyst who had moved to find better jobs. His family became American citizens. Abelnaby was raised in Nutley and Bloomfield, New Jersey, and played on the Bloomfield High School basketball team. Abdelnaby was selected as a standout American high school athlete a

In [34]:
# Tally how many chunks each player contributed.
counter = Counter(doc.metadata["player"] for doc in documents)

# Show the five players with the most chunks.
top_players = counter.most_common(5)
print(top_players)

[('LeBron James', 67), ('Kobe Bryant', 59), ('Stephen Curry', 54), ('Michael Jackson', 54), ("Shaquille O'Neal", 49)]


## Embedding and Vector Store Creation

In [35]:
# Flatten the documents into parallel lists of chunk text and metadata.
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]

# Results of successfully embedded batches. The three lists stay
# index-aligned because they are only ever extended together.
all_embeddings = []
all_texts = []
all_metadata = []
failed_batches = []  # start offsets of batches that never succeeded

batch_size = 50          # Safe starting point
max_attempts = 5         # tries per batch before giving up on it
retry_wait_seconds = 60  # backoff for rate limiting

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    batch_meta = metadatas[i:i+batch_size]

    # Retry the SAME batch after backing off. Previously a failed batch was
    # only logged and then skipped, so its chunks were silently missing from
    # the vector store.
    for attempt in range(1, max_attempts + 1):
        try:
            batch_embeddings = embedding.embed_documents(batch_texts)
        except Exception as e:
            print(f"Batch {i} failed (attempt {attempt}/{max_attempts}): {e}")
            if attempt < max_attempts:
                time.sleep(retry_wait_seconds)
        else:
            all_embeddings.extend(batch_embeddings)
            all_texts.extend(batch_texts)
            all_metadata.extend(batch_meta)
            break
    else:
        # Every attempt failed: stay best-effort, but record the gap so it
        # is visible rather than silent.
        failed_batches.append(i)

if failed_batches:
    print(
        f"⚠️ {len(failed_batches)} batch(es) could not be embedded and are "
        f"missing from the results. Start offsets: {failed_batches}"
    )



100%|██████████| 372/372 [07:28<00:00,  1.21s/it]


In [36]:
# Pair each chunk's text with its precomputed embedding vector so FAISS can
# build the index without calling the embedding API again.
text_embedding_pairs = [
    (chunk_text, chunk_vector)
    for chunk_text, chunk_vector in zip(all_texts, all_embeddings)
]

# The embedding client is still passed in alongside the precomputed pairs.
vectorstore = FAISS.from_embeddings(
    text_embedding_pairs,
    embedding=embedding,
    metadatas=all_metadata,
)

In [37]:
# Persist the FAISS index to a local folder so it can be reloaded later
# without re-embedding.
index_dir = "nba_vector_db"
vectorstore.save_local(index_dir)

print("✅ Vector store created and saved successfully.")

✅ Vector store created and saved successfully.
