# Mis notas para entender qué sucedía

In [None]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Bound the wait so a stalled connection cannot hang the notebook cell forever.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on HTTP errors (404, 5xx) instead of on a confusing JSON decode error below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [None]:
documents_raw[0]

In [None]:
print(type(documents_raw))

In [None]:
print(type(documents_raw[0]))

In [None]:
import json
print(json.dumps(documents_raw, indent=4))

In [None]:
documents_raw[0]['course']

In [None]:
documents_raw[0]['documents'][0]

In [None]:
course_name_to_find = "machine-learning-zoomcamp"
# Alternative, matching the example in the screenshot:
# course_name_to_find = "mlops-zoomcamp" # to reach the question 'Is it going to be live? When?'

# Step 1: take the first course entry whose name matches (None when there is no match).
found_course_data = next(
    (entry for entry in documents_raw if entry["course"] == course_name_to_find),
    None,
)

# Step 2: print the first question of that course, or say why there is none.
if not found_course_data:
    print(f"El curso '{course_name_to_find}' no fue encontrado.")
elif not found_course_data["documents"]:
    print(f"El curso '{course_name_to_find}' no tiene documentos.")
else:
    first_question = found_course_data["documents"][0]["question"]
    print(f"La primera pregunta para '{course_name_to_find}' es: {first_question}")

In [None]:
# Names of all courses, in the order the dataset lists them.
courses = [entry['course'] for entry in documents_raw]

courses

In [None]:
documents_raw[0]['documents'][0]

In [None]:
# Flatten the per-course structure into a single list of records.
documents = []

for course_entry in documents_raw:
    for record in course_entry['documents']:
        # Tag each record (in place) with the course it belongs to.
        record['course'] = course_entry['course']
    documents.extend(course_entry['documents'])

In [None]:
documents

# Original Code

## Preprocessing

In [None]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Bound the wait so a stalled connection cannot hang the notebook cell forever.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on HTTP errors (404, 5xx) instead of on a confusing JSON decode error below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of records,
# tagging each record with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

In [None]:
df.tail()

In [None]:
df[df.course == 'data-engineering-zoomcamp'].head()

In [None]:
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]

## Count Vectorizer

### 🧠 CountVectorizer & Bag of Words (BoW)

`CountVectorizer` from `sklearn.feature_extraction.text` transforms text into a matrix of token counts. It is a practical implementation of the **Bag of Words (BoW)** technique.

#### 📦 What is Bag of Words?

BoW is a text representation method where each document is converted into a vector of word frequencies, ignoring grammar and word order but keeping word occurrence.

- Each row represents a document.
- Each column represents a unique word (token).
- Values indicate how many times each word appears in that document.

#### ⚙️ Example:

```python
from sklearn.feature_extraction.text import CountVectorizer

docs = ["I love data", "I love AI"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
# ['ai' 'data' 'love']

print(X.toarray())
# [[0 1 1]
#  [1 0 1]]
```

#### 🧠 Note:

This technique does not capture the meaning or order of words, but it’s simple and often effective for traditional machine learning models.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer()

In [None]:
cv.fit(docs_example)

In [None]:
names = cv.get_feature_names_out()
names

In [None]:
X = cv.transform(docs_example)

In [None]:
X.toarray()

In [None]:
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

In [None]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

## TfidfVectorizer

### 📊 TfidfVectorizer & TF-IDF

`TfidfVectorizer` from `sklearn.feature_extraction.text` transforms text into a matrix of **TF-IDF scores** (Term Frequency–Inverse Document Frequency). It's a refinement of the **Bag of Words** model that reduces the impact of common words and emphasizes more informative ones.

#### 🧠 What is TF-IDF?

TF-IDF measures how important a word is to a document in a collection (corpus). It balances:
- **Term Frequency (TF):** how often a word appears in a document.
- **Inverse Document Frequency (IDF):** how rare the word is across all documents.

**High TF + Low DF = High importance.**

#### ⚙️ Example:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["I love data", "I love AI"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
# ['ai' 'data' 'love']

print(X.toarray())
# Returns approximately:
# [[0.    0.815 0.58 ]
#  [0.815 0.    0.58 ]]
# ('love' appears in both documents, so it gets a lower weight than 'data' / 'ai')
````

#### ℹ️ Interpretation:

* Values near **1** → terms that are unique or highly relevant in that document.
* Values near **0** → common words across the corpus or not present.

#### 🧠 Note:

TF-IDF improves over simple count models by **reducing the weight of frequent but less informative words** (like "the", "is", etc.), often making models more accurate.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)


> ### 🔁 When to use `fit_transform()` vs `transform()`

- **`fit_transform()`**  
  Use this on your **training data**. It:
  1. Learns the vocabulary or statistical parameters (fit).
  2. Applies the transformation to the same data (transform).

  ✅ Applies to the first dataset (e.g., training set).

- **`transform()`**  
  Use this on **new or test data**. It:
  1. Applies the previously learned vocabulary or parameters.

  ✅ Ensures consistency between train and test sets.


#### ⚠️ Example:

```python
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train_docs)  # Learn & transform
X_test = vectorizer.transform(test_docs)        # Only transform
````

**Never call `fit()` or `fit_transform()` on test data**, or you'll cause data leakage.


## Do the search with dot product

In [None]:
query = "Do I need to know python to sign up for the January course?"

In [None]:
q = cv.transform([query])
q.toarray()

In [None]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

In [None]:
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

In [None]:
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

In [None]:
df_qd

In [None]:
(df_qd['query'] * df_qd['doc']).sum()

In [None]:
X.shape

In [None]:
X.dot(q.T).toarray()

In [None]:
X.dot(q.T).todense()
# Same expression as the `.toarray()` cell above, so the five document-vs-query scores are
# identical; only the container type returned by the sparse matrix differs.

### 🔍 Dot Product for Document Ranking

In vector-based search (e.g., with TF-IDF), each document and query are represented as vectors. By computing the **dot product between the query vector and each document vector**, we obtain a relevance score for each document.

- **Higher score → more relevant document**
- This method is efficient and works well with sparse vector representations.
- If vectors are normalized, the dot product equals **cosine similarity**.

The top results are obtained by sorting documents in descending order of their scores.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cosine_similarity(X, q)

In [None]:
cosine_similarity(X, q).flatten()

In [None]:
import numpy as np

In [None]:
score = cosine_similarity(X, q).flatten()
np.argsort(score)

In [None]:
query

In [None]:
# `score` was computed against the toy `docs_example` corpus, so the best match must be
# looked up there; rows of `df` are unrelated to these scores.
# np.argsort sorts ascending, so the last index is the highest-scoring document.
docs_example[np.argsort(score)[-1]]

In [None]:
df.columns

In [None]:
# Build one TF-IDF vectorizer and one document-term matrix per text column of `df`.
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    # min_df=3 ignores terms that occur in fewer than three documents.
    cv = transformers[field] = TfidfVectorizer(stop_words='english', min_df=3)
    X = matrices[field] = cv.fit_transform(df[field])

In [None]:
transformers

In [None]:
matrices

In [None]:
transformers['text'].get_feature_names_out()

In [None]:
matrices['text']

In [None]:
# Spelling matters here: a misspelled token is not in the fitted TF-IDF vocabulary,
# so it would contribute nothing to the similarity score.
query = "I just signed up. Is it too late to join the course?"

In [None]:
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [None]:
mask = (df.course == 'data-engineering-zoomcamp').values
score = score * mask
score[:10]

In [None]:
import numpy as np

In [None]:
idx = np.argsort(-score)[:10]
idx

In [None]:
score[idx]

In [None]:
df.iloc[idx].text

In [None]:
fields

In [None]:
query = "I just signed up. Is it too late to join the course?"

In [None]:
boost = {'question': 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

In [None]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).values
    score = score * mask

In [None]:
idx = np.argsort(-score)[:10]
results = df.iloc[idx]
results.to_dict(orient='records')

## Everything in one class

In [None]:
class TextSearch:
    """In-memory TF-IDF keyword search over a collection of records.

    One `TfidfVectorizer` is fitted per text field. A query is scored against
    every field with cosine similarity and the (optionally boosted) per-field
    scores are summed into a single relevance score per record.
    """

    def __init__(self, text_fields):
        """Store which record fields are searched as text.

        Args:
            text_fields: names of the record fields to vectorize and score.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index `records` by fitting one TF-IDF vectorizer per text field.

        Args:
            records: data accepted by `pd.DataFrame(...)`; it must provide a
                column for every name in `text_fields`.
            vectorizer_params: keyword arguments forwarded to each
                `TfidfVectorizer`. Defaults to no extra arguments.
        """
        # None instead of a shared `{}` default avoids the mutable-default pitfall.
        if vectorizer_params is None:
            vectorizer_params = {}

        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            self.matrices[f] = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the best-matching records for `query`.

        Args:
            query: free-text query string.
            n_results: maximum number of records to return.
            boost: mapping of field name -> multiplier applied to that field's
                similarity score; fields not listed use 1.0.
            filters: mapping of field name -> required value; only records
                whose field equals the value for every entry are returned.

        Returns:
            A list of record dicts ordered by descending score. It is shorter
            than `n_results` when fewer records satisfy `filters`.
        """
        boost = {} if boost is None else boost
        filters = {} if filters is None else filters

        score = np.zeros(len(self.df))

        for f in self.text_fields:
            q = self.vectorizers[f].transform([query])
            field_score = cosine_similarity(self.matrices[f], q).flatten()
            score = score + boost.get(f, 1.0) * field_score

        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep = keep & (self.df[field] == value).values

        # Rank only the rows that pass every filter. Zeroing the score of the
        # other rows is not enough: they would still fill the result list
        # whenever fewer than `n_results` rows match.
        candidates = np.flatnonzero(keep)
        order = np.argsort(-score[candidates])[:n_results]
        idx = candidates[order]

        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [None]:
fields

In [None]:
index = TextSearch(text_fields=['section', 'question', 'text'])

In [None]:
index.fit(documents)

In [None]:
query

In [None]:
index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

Up to this point, the search has been based on exact word matches (keyword overlap). This implementation was released as the `minsearch` library, which is not recommended for large-scale projects because it keeps all the matrices in memory.

# Singular Value Decomposition

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
X = matrices['text']
cv = transformers['text']

In [None]:
X

In [None]:
X.shape

In [None]:
svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

In [None]:
X_emb

In [None]:
X_emb.shape

In [None]:
X_emb[0]

In [None]:
query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)

In [None]:
Q_emb[0]

In [None]:
np.dot(X_emb[0], Q_emb[0])

In [None]:
score = cosine_similarity(X_emb, Q_emb).flatten()

In [None]:
idx = np.argsort(-score)[:10]

In [None]:
# `idx` holds positions produced by np.argsort, so rows are selected by position (iloc).
# No course filter is applied here: this cell only shows how search works with embeddings.
df.iloc[idx]

In [None]:
# `idx` holds positions produced by np.argsort, so rows are selected by position (iloc).
list(df.iloc[idx].text)

### 🧮 Singular Value Decomposition (SVD)

**Singular Value Decomposition (SVD)** is a linear algebra technique that factorizes a matrix `A` into three components:

```

A = U Σ Vᵗ

```

- `U`: matrix of left singular vectors (documents)
- `Σ`: diagonal matrix of singular values (importance of concepts)
- `Vᵗ`: matrix of right singular vectors (terms)

SVD is used to **reduce dimensionality** and uncover **latent semantic structures** in high-dimensional data, especially in **text analysis**.

---

#### 📚 Application in Information Retrieval

In IR, SVD is the foundation of **Latent Semantic Indexing (LSI)**, where it helps:

- Identify concepts behind terms
- Capture synonymy and polysemy
- Improve retrieval results via concept-based similarity

---

#### 📄 Historical Origin

SVD itself dates back to early linear algebra work, but its application to IR was popularized by:

> **“Indexing by Latent Semantic Analysis”**  
> *Scott Deerwester, Susan T. Dumais, George W. Furnas, Thomas K. Landauer, Richard Harshman*  
> Journal of the American Society for Information Science, 1990.

This seminal paper introduced **Latent Semantic Indexing (LSI)** using SVD for document retrieval.

---

#### 🧠 Note

Although SVD/LSI was widely used, it's been largely superseded in practice by **neural embeddings** (e.g., Word2Vec, BERT). However, SVD remains a fundamental technique for **understanding dimensionality reduction** and **concept modeling** in text.

In [None]:
from sklearn.decomposition import NMF

In [None]:
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

In [None]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

In [None]:
# Rank all documents by cosine similarity to the query in the NMF embedding space.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# `idx` holds positions produced by np.argsort, so rows are selected by position (iloc).
list(df.iloc[idx].text)

In [None]:
# `idx` holds positions produced by np.argsort, so rows are selected by position (iloc).
df.iloc[idx]

### 🔢 Non-Negative Matrix Factorization (NMF)

**Non-Negative Matrix Factorization (NMF)** is a dimensionality reduction technique that factorizes a non-negative matrix `A` into two lower-rank non-negative matrices:

```

A ≈ W × H

```

- `W`: document-topic matrix
- `H`: topic-term matrix

Unlike SVD, NMF enforces **non-negativity**, leading to **additive and parts-based representations**. This makes the factors more interpretable, especially in **topic modeling** and **text mining**.

---

#### 📚 Use in Information Retrieval and NLP

In text analysis, NMF is commonly applied to **TF-IDF matrices** to extract **latent topics** from documents. Each topic is a weighted combination of terms, and each document is represented as a mixture of topics.

---

#### 📄 Historical Origin

NMF was introduced in its modern form by:

> **“Learning the Parts of Objects by Non-Negative Matrix Factorization”**  
> *Daniel D. Lee and H. Sebastian Seung*  
> *Nature*, 1999

Their work demonstrated how NMF could learn meaningful, part-based features from image and text data.

---

#### 🧠 Note

NMF remains valuable for interpretable topic modeling. However, it has largely been supplanted by **probabilistic models** (like LDA) and **deep learning approaches** (like BERTopic or neural topic models) that capture richer semantic structures.

It's still a great tool when:
- You need **interpretable, non-probabilistic topics**
- You're working with **TF-IDF** representations
- You want a simple and fast alternative to LDA



# BERT

The problem with the previous two approaches is that they don't take into account the word order. They just treat all the words separately (that's why it's called "Bag-of-Words")

BERT and other transformer models don't have this problem.

Let's create embeddings with BERT. We will use the Hugging Face library for that

In [None]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

In [None]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')


In [None]:
encoded_input

In [None]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [None]:
hidden_states

In [None]:
hidden_states.shape

In [None]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

In [None]:
sentence_embeddings.numpy()

# Note: when running on a GPU, move the tensor to the CPU before converting it to NumPy:
# sentence_embeddings_cpu = sentence_embeddings.cpu()

In [None]:
def make_batches(seq, n):
    """Split `seq` into consecutive slices of at most `n` items each.

    The last slice is shorter when `len(seq)` is not a multiple of `n`.
    """
    return [seq[start:start + n] for start in range(0, len(seq), n)]

In [None]:
from tqdm.auto import tqdm

In [None]:
def compute_embeddings(texts, batch_size=8):
    """Embed each text by averaging the model's last hidden state over tokens.

    Relies on the module-level `tokenizer`, `model`, `make_batches` and `tqdm`.

    Args:
        texts: list of strings to embed.
        batch_size: number of texts sent through the model per forward pass.

    Returns:
        A NumPy array with one row per input text, in input order.

    Raises:
        ValueError: from `np.vstack` when `texts` is empty (nothing to stack).
    """
    # Batch with the caller-supplied size so memory use per forward pass can be tuned.
    text_batches = make_batches(texts, batch_size)

    all_embeddings = []

    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state

            # NOTE(review): the mean runs over every token position, which presumably
            # includes padding added by `padding=True` - confirm whether an
            # attention-mask-weighted mean is wanted instead.
            batch_embeddings = hidden_states.mean(dim=1)
            all_embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(all_embeddings)

In [None]:
embeddings = {}

In [None]:
# fields = ['section', 'question', 'text']

for f in fields:
    print(f'computing embeddings for {f}...')
    embeddings[f] = compute_embeddings(df[f].tolist())

In [None]:
import pickle

In [None]:
with open('embeddings.bin', 'wb') as f_out:
    pickle.dump(embeddings, f_out)