## Question 3: Natural Language Processing

In [11]:
# Transcript CSV files to load, one per lecture, listed in course order.
corpuses = [
    "nlp/01-introduction.csv",
    "nlp/02-data-exploration.csv",
    "nlp/03-decision-trees.csv",
    "nlp/04-regression.csv",
    "nlp/05-support-vector-machines.csv",
    "nlp/06-neural-networks-1.csv",
    "nlp/07-neural-networks-2.csv",
    "nlp/08-evaluation.csv",
    "nlp/09-clustering.csv",
    "nlp/10-frequent-itemsets.csv",
]

In [None]:
# Load the dataset
import pandas as pd
from pathlib import Path

# Read every transcript CSV and tag its rows with the lecture they belong to.
dfs = []
for csv_path in corpuses:
    candidate = Path(csv_path)
    # Fallback for running the notebook from the repository root,
    # where the data folder sits under files/.
    csv_file = candidate if candidate.exists() else Path("files") / candidate
    # The file name without extension is the lecture id (e.g., 01-introduction).
    dfs.append(pd.read_csv(csv_file).assign(lecture=csv_file.stem))

# Stack all lectures into one frame and keep only the columns used later on.
df = pd.concat(dfs, ignore_index=True)[["lecture", "start", "end", "text"]]

print("Full dataframe shape (before tokenization):", df.shape)
df.head()

### a)

In [13]:
from collections import Counter
import plotly.express as px

# a) 25 most frequent words (whitespace split, no preprocessing)
# Collect every whitespace-separated word of every segment, exactly as written.
all_words = []
for segment_text in df["text"].astype(str):
    all_words.extend(segment_text.split())
word_counts = Counter(all_words)

# Small (word, count) table of the 25 most common raw words, used for the plot.
top25_words = word_counts.most_common(25)
top25_df = pd.DataFrame(top25_words, columns=["word", "count"])

fig = px.bar(
    top25_df,
    x="word",
    y="count",
    title="Top 25 most frequent words (whitespace split, no preprocessing)",
)
fig.update_layout(xaxis_tickangle=-45)  # slanted labels so the words stay readable
fig.show()

# Written answer to the task, printed one note per line below the figure.
problem_notes = (
    "Two problems with this approach (visible in the histogram):",
    "1) Stopwords/filler words dominate (e.g., 'the', 'and'), making the plot less informative. "
    "Fix: remove stopwords (and optionally filler words).",
    "2) Casing and punctuation create duplicates (e.g., 'Data' vs 'data', 'data,' vs 'data'). "
    "Fix: lowercase text and remove punctuation before tokenization.",
)
for note in problem_notes:
    print(note)


### b)

In [None]:
import nltk
from nltk.corpus import stopwords
import string
import re
from collections import Counter
import plotly.express as px

# Make sure the NLTK data used below is available (punkt_tab is required by the task).
# Each resource is looked up locally first and only downloaded when the lookup fails.
required_nltk_data = {
    "punkt_tab": "tokenizers/punkt_tab",
    "punkt": "tokenizers/punkt",
    "stopwords": "corpora/stopwords",
}
for resource, path in required_nltk_data.items():
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource, quiet=True)

# English stopwords as a set, for fast membership tests.
stop_words = set(stopwords.words("english"))
# Translation table mapping every character of string.punctuation to a space.
punct_table = str.maketrans(dict.fromkeys(string.punctuation, " "))

def preprocess_and_tokenize(text: str) -> list[str]:
    """Lowercase, remove punctuation, and tokenize using NLTK (punkt_tab/punkt).

    Punctuation characters are replaced by spaces via ``punct_table`` and
    whitespace runs are collapsed before ``nltk.word_tokenize`` is applied.

    Args:
        text: Raw text. Non-string values are converted with ``str()``.

    Returns:
        The non-empty tokens in order of appearance; an empty list when the
        input is missing (``None`` or a float NaN).
    """
    # pandas marks missing CSV cells as float NaN rather than None. NaN is the
    # only float that is not equal to itself; without this check str(nan)
    # would be tokenized into the bogus token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text).lower()
    text = text.translate(punct_table)
    text = re.sub(r"\s+", " ", text).strip()
    return [t for t in nltk.word_tokenize(text) if t]

from IPython.display import display

# Add tokenized text column
df["tokenized_text"] = df["text"].apply(preprocess_and_tokenize)
print("Full dataframe shape (after tokenization):", df.shape)
# A bare `df.head()` only renders when it is the last expression of a cell;
# in the middle of a cell it has to be displayed explicitly.
display(df.head())

# 25 most frequent tokens in 01-introduction (excluding stopwords)
intro_tokens = [
    t
    for toks in df.loc[df["lecture"] == "01-introduction", "tokenized_text"]
    for t in toks
    if t not in stop_words
]
intro_counts = Counter(intro_tokens)
top25_intro = intro_counts.most_common(25)
top25_intro_df = pd.DataFrame(top25_intro, columns=["token", "count"])

fig = px.bar(
    top25_intro_df,
    x="token",
    y="count",
    title="Top 25 most frequent tokens in 01-introduction (no stopwords)",
)
fig.update_layout(xaxis_tickangle=-45)  # slanted labels so the tokens stay readable
fig.show()


### c)

In [None]:
from IPython.display import display

# c) Stacked histogram for selected tokens, grouped by lecture
selected_tokens = [
    "data",
    "decision",
    "predict",
    "derivative",
    "network",
    "easy",
    "database",
]

# Count all tokens once per lecture, then read off the selected ones
# (a Counter returns 0 for tokens that never occur in a lecture).
rows = []
for lecture, group in df.groupby("lecture"):
    c = Counter()
    for toks in group["tokenized_text"]:
        c.update(toks)
    rows.extend(
        {"lecture": lecture, "token": tok, "count": c[tok]} for tok in selected_tokens
    )

freq_df = pd.DataFrame(rows)

fig = px.bar(
    freq_df,
    x="token",
    y="count",
    color="lecture",
    barmode="stack",
    title="Token frequencies across lectures (stacked by lecture)",
)
fig.show()

# Helper: for each token, show the lecture where it occurs most
best_row_per_token = freq_df.groupby("token")["count"].idxmax()
top_lectures_per_token = (
    freq_df.loc[best_row_per_token, ["token", "lecture", "count"]]
    .sort_values("token")
    .reset_index(drop=True)
)
display(top_lectures_per_token)

print(
    "Observation: 'data' appears in almost all lectures, while more topic-specific tokens are concentrated in the corresponding lectures "
    "(e.g., 'decision' in decision trees, 'network' in neural networks). "
    "Some terms like 'predict' occur across multiple modeling lectures."
)


### d)

In [15]:
from nltk.util import ngrams
from nltk.probability import ConditionalFreqDist
import random

# d) N-gram language model

def build_ngram_model(n: int, token_sequences) -> ConditionalFreqDist:
    """Builds a ConditionalFreqDist mapping (n-1)-token contexts to next-token counts.

    Every sequence is padded with ``n - 1`` ``"<s>"`` markers at the front and
    a single ``"</s>"`` marker at the end before its n-grams are counted.

    Args:
        n: Order of the model (1 = unigram, whose only context is the empty tuple).
        token_sequences: Iterable of token lists, one per text segment.

    Returns:
        A ConditionalFreqDist keyed by context tuples of length ``n - 1``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # An order below 1 has no meaningful context/next-token split; reject it
    # instead of silently returning a nonsensical model.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    cfd = ConditionalFreqDist()
    start_padding = ["<s>"] * (n - 1)
    for tokens in token_sequences:
        padded = start_padding + list(tokens) + ["</s>"]
        # The last item of each n-gram is the predicted token, the rest its context.
        for *context, nxt in ngrams(padded, n):
            cfd[tuple(context)][nxt] += 1
    return cfd


def predict_next_word(model: ConditionalFreqDist, context: tuple[str, ...]) -> str | None:
    """Draws the next word for ``context``, weighted by the observed counts.

    Returns None when the context is not in the model or has no recorded
    continuations.
    """
    if context in model:
        dist = model[context]
        if len(dist) > 0:
            # Sort the candidates so the draw does not depend on insertion order.
            words = sorted(dist.keys())
            counts = [dist[word] for word in words]

            # Required: seed before each random choice for reproducibility
            random.seed(32133)
            return random.choices(words, weights=counts, k=1)[0]

    return None


def generate_text(
    seed_text: str,
    n: int,
    model: ConditionalFreqDist,
    max_new_tokens: int = 30,
) -> list[str]:
    """Extends the seed text by at most ``max_new_tokens`` predicted tokens.

    The seed is tokenized with ``preprocess_and_tokenize``. Generation stops
    early when the current context has no prediction or the end marker
    ``"</s>"`` is drawn. Returns the seed tokens followed by the new tokens.
    """
    seed_tokens = preprocess_and_tokenize(seed_text)
    context_size = n - 1

    # A seed shorter than the context window is left-padded with start markers.
    missing = max(0, context_size - len(seed_tokens))
    history = ["<s>"] * missing + list(seed_tokens)

    generated = []
    while len(generated) < max_new_tokens:
        # Unigram models (n == 1) condition on the empty context.
        context = tuple(history[-context_size:]) if n > 1 else tuple()
        next_word = predict_next_word(model, context)
        if next_word is None or next_word == "</s>":
            break
        generated.append(next_word)
        history.append(next_word)

    return seed_tokens + generated


# Generate a continuation of the same seed with models of increasing order.
seed = "introduction to data"
for n in [3, 4, 5, 24]:
    model = build_ngram_model(n, df["tokenized_text"])
    generated_tokens = generate_text(seed, n, model, max_new_tokens=30)
    print(f"n={n}:", " ".join(generated_tokens))

# Written answer to the task (the en dash in "3–5" was previously mis-encoded).
print(
    "\nComment: For smaller n (3–5), the model has many seen contexts, so it tends to generate longer (but sometimes less coherent) text. "
    "For large n (e.g., 24), contexts become very specific and often unseen, so generation frequently stops early due to sparsity. "
    "For n > 5 we generally expect more sparsity (fewer observed contexts), leading to shorter generations or verbatim continuation of very frequent phrases."
)


### e)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer
from IPython.display import display

# e) Hierarchical TF-IDF timestamp retrieval

# English Snowball stemmer used by the TF-IDF analyzer below.
stemmer = SnowballStemmer("english")

def tfidf_analyzer(doc: str) -> list[str]:
    """Splits a space-separated, pre-tokenized doc, drops stopwords, and stems the rest."""
    stems = []
    for token in doc.split():
        if not token or token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems


def cosine_scores(doc_matrix, query_vector):
    """Returns one score per row of ``doc_matrix``: its dot product with the query vector.

    With l2-normalized TF-IDF vectors this dot product equals the cosine
    similarity. The result is a flat 1-D array.
    """
    similarities = doc_matrix.dot(query_vector.T)
    return similarities.toarray().ravel()


def lecture_documents(dataframe: pd.DataFrame) -> pd.Series:
    """Builds one space-joined document per lecture from its segments' token lists.

    Returns a Series indexed by lecture whose values are the joined tokens of
    all rows of that lecture, in row order.
    """

    def join_segments(token_lists) -> str:
        # Flatten the per-segment token lists before joining them.
        flat_tokens = []
        for toks in token_lists:
            flat_tokens.extend(toks)
        return " ".join(flat_tokens)

    tokens_by_lecture = dataframe.groupby("lecture")["tokenized_text"]
    return tokens_by_lecture.apply(join_segments)


def retrieve_timestamps(query: str, k: int = 2, m: int = 2):
    """Two-level TF-IDF retrieval: best lectures first, then best segments in each.

    Args:
        query: Free-text query; preprocessed with ``preprocess_and_tokenize``.
        k: Number of top-scoring lectures to return.
        m: Number of top-scoring segments to return per lecture.

    Returns:
        A list with one dict per retrieved lecture (highest score first), with
        keys ``"lecture"``, ``"score"`` and ``"timestamps"``. Each timestamp
        is a dict with ``"start"``, ``"end"``, ``"score"`` and ``"text"``.
    """
    query_doc = " ".join(preprocess_and_tokenize(query))

    def score_against(documents):
        # Fit a fresh TF-IDF model on the given documents and score the query against it.
        vectorizer = TfidfVectorizer(analyzer=tfidf_analyzer)
        doc_matrix = vectorizer.fit_transform(documents)
        return cosine_scores(doc_matrix, vectorizer.transform([query_doc]))

    # Level 1: retrieve lectures
    lec_docs = lecture_documents(df)
    lec_scores = score_against(lec_docs.values)

    results = []
    for idx in lec_scores.argsort()[::-1][:k]:
        lecture_name = lec_docs.index[idx]

        # Level 2: retrieve segments within this lecture.
        # The index is reset so score positions line up with row labels.
        segs = df.loc[df["lecture"] == lecture_name].reset_index(drop=True)
        seg_scores = score_against([" ".join(toks) for toks in segs["tokenized_text"]])

        timestamps = [
            {
                "start": float(segs.loc[j, "start"]),
                "end": float(segs.loc[j, "end"]),
                "score": float(seg_scores[j]),
                "text": segs.loc[j, "text"],
            }
            for j in seg_scores.argsort()[::-1][:m]
        ]

        results.append(
            {
                "lecture": lecture_name,
                "score": float(lec_scores[idx]),
                "timestamps": timestamps,
            }
        )

    return results


# Expected outcome per query, printed after its results as the written comment.
query_comments = {
    "gradient descent approach": (
        "Expect hits in optimization/backprop sections; top segments should mention gradients, step sizes, or loss minimization."
    ),
    "beer and diapers": (
        "Classic market-basket example; expect the frequent itemsets/association rules lecture, with segments referencing support/confidence."
    ),
}

queries = ["gradient descent approach", "beer and diapers"]
MAX_SNIPPET_CHARS = 140  # longest transcript excerpt printed per hit

for q in queries:
    res = retrieve_timestamps(q, k=2, m=2)
    print(f"Query: {q}")
    for r in res:
        print(f"- {r['lecture']} (lecture_score={r['score']:.4f})")
        for ts in r["timestamps"]:
            # str() keeps slicing safe if a transcript cell is not a string.
            text = str(ts["text"])
            # Only mark the excerpt as truncated when something was actually cut off.
            snippet = text if len(text) <= MAX_SNIPPET_CHARS else text[:MAX_SNIPPET_CHARS] + "..."
            print(
                f"  - [{ts['start']:.2f}, {ts['end']:.2f}] seg_score={ts['score']:.4f}: {snippet}"
            )

    print("Comment:", query_comments.get(q, "Segments should reflect the query terms."))
