In [1]:
import pandas as pd
import requests
import json

In [2]:
url = "https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/refs/heads/main/notebooks/documents.json"

In [3]:
docs_response = requests.get(url=url).json()
documents = []

In [4]:
for course in docs_response:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
data = pd.DataFrame(data=documents)
data.head()

Unnamed: 0,text,section,question,course
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp
2,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp
3,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp


Vector spaces
- turn the docs into vectors
- term-document matrix:
  - rows: documents
  - columns: words/tokens
- bag of words
    - where does the word exist in the sentence is lost 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
cv = CountVectorizer(min_df=3, stop_words='english')
cv.fit(data.text)

In [8]:
processed_data = cv.transform(data.text)

In [9]:
df_processed = pd.DataFrame(data=processed_data.todense(), columns=cv.get_feature_names_out())

In [10]:
df_processed.transpose().sample(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
register,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
workarounds,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fails,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
come,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
choco,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
randomly,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dmytro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
isn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
window,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
given,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(stop_words='english', min_df=3)

In [13]:
tfidf.fit(data.text)

In [14]:
processed_data = tfidf.transform(data.text)

In [15]:
df_processed = pd.DataFrame(data=processed_data.todense(), columns=cv.get_feature_names_out())

In [16]:
df_processed.head()

Unnamed: 0,001,01,02,03,04,05,06,06_spark_sql,09,10,...,yesipov,yml,youtube,yyyy,zero,zip,zone,zones,zoom,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.340486
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
query = "Do i need to know python to sign up for a January course?"

processed_query = tfidf.transform([query])

In [18]:
processed_query.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 2118))

In [19]:
q_dict = {i:j.item() for i,j in zip(tfidf.get_feature_names_out(), processed_query.toarray()[0])}

In [20]:
q_dict['python']

0.31441356049301333

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
scores = cosine_similarity(X=processed_data, Y=processed_query).flatten()

In [23]:
import numpy as np

In [24]:
np.argsort(scores)[-5:]

array([764, 398, 806, 577, 445])

In [25]:
for i in np.argsort(scores)[-5:]:
    print(data.text[i]+"\n")

If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.
(optional) David Odimegwu

You need to redefine the python environment variable to that of your user account

Technically, yes. Advisable? Not really. Reasons:
Some homework(s) asks for specific python library versions.
Answers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)
And as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?
You can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.
tx[source]

You can find the intercept between these two curves using numpy

In [26]:
text_columns = [cols for cols in data if 'course' not in cols]
matrices = {}
vectorisors = {}
for fields in text_columns:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(data[fields])
    matrices[fields] = X
    vectorisors[fields] = cv

In [27]:
filters = {
    'course' : 'data-engineering-zoomcamp'
}

boost = {
    'question' : 3,
    'text' : 2
}

In [28]:
score = np.zeros(data.shape[0])
query = "Do i need to know python to sign up for a January course?"
for field in text_columns:
    processed_query = vectorisors[field].transform([query])
    f_score = cosine_similarity(matrices[field], processed_query).flatten()
    for f in filters:
        mask = (data[f] == filters[f]).astype(int)
    importance = boost.get(field, 1.0)
    score = (score + f_score*importance)*mask

In [29]:
score

0      2.006265
1      1.812764
2      1.466764
3      1.609134
4      1.911422
         ...   
943    0.000000
944    0.000000
945    0.000000
946    0.000000
947    0.000000
Name: course, Length: 948, dtype: float64

In [30]:
top_content = np.argsort(score)[-5:]

In [31]:
top_content

943     5
944     4
945     0
946     7
947    18
Name: course, dtype: int64

In [32]:
for i in top_content:
    print(data.iloc[i].question)
    print()
    print(data.iloc[i].text)
    print()
    print()

Course - how many Zoomcamps in a year?

There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:
Data-Engineering (Jan - Apr)
MLOps (May - Aug)
Machine Learning (Sep - Jan)
There's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.
They follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.


Course - What can I do before the course starts?

You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terraform
Git
Look over the prerequisites and syllabus to see if you are comfortable with these subjects.


Course - When will the course start?

The purpose of this document is to capture fr

**Building the Search Service class that represents the Elastic Search**

In [33]:
class SearchService:
    def __init__(self, text_fields: list[str]):
        self.data = pd.DataFrame()
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorisors = {}

    def fit(self, records: dict, vectorisor_params: dict={}):
        self.data = pd.DataFrame(records)
        for fields in self.text_fields:
            cv = TfidfVectorizer(**vectorisor_params)
            X = cv.fit_transform(self.data[fields])
            self.matrices[fields] = X
            self.vectorisors[fields] = cv

    def search_top_results(self,
                           query: str,
                           filters: dict={},
                           boost: dict={},
                           num_results: int=5):

        score = np.zeros(self.data.shape[0])
        for fields in self.text_fields:
            processed_query = self.vectorisors[fields].transform([query])
            f_score = cosine_similarity(self.matrices[fields], processed_query).flatten()
            
            importance = boost.get(fields, 1.0)
            score = score + f_score*importance
        for f in filters:
            mask = (self.data[f] == filters[f]).astype(int)
            score = score*mask

        top_indexes = np.argsort(score)[-num_results:]
        results = self.data.iloc[top_indexes]
        return results.to_dict(orient='records') 

In [34]:
ss = SearchService(text_fields=text_columns)

In [35]:
ss.fit(
    data.to_dict(orient='records'),
    vectorisor_params={"stop_words": "english", "min_df": 5}
)

In [36]:
ss.search_top_results(
    query='I just signed up, is it too late to start the course?',
    num_results=5,
    boost={'question': 3},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
  'section': 'General course-related questions',
  'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course a

**Now we will test with embeddings but just with text data**

In [37]:
from sklearn.decomposition import TruncatedSVD, NMF

In [38]:
cv = ss.vectorisors['text']
X = ss.matrices['text']

In [39]:
svd = TruncatedSVD(random_state=42, n_components=16)
X_emb = svd.fit_transform(X)

In [40]:
query = "I just signed up, is it too late to start the course?"
query_processed = cv.transform([query])
query_emb = svd.transform(query_processed)
score = cosine_similarity(X_emb, query_emb).flatten()
top_results = np.argsort(score)[-5:]
data.iloc[top_results].to_dict(orient='records')

[{'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'section': 'General course-related questions',
  'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
  'section': 'General course-related questions',
  'question': 'Can I submit the homework after the due date?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlde

In [41]:
class SearchServiceWithEmbeddings:
    def __init__(self, text_fields: list[str]):
        self.data = pd.DataFrame()
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorisors = {}
        self.embeddors = {}

    def fit(self, records: dict, vectorisor_params: dict={}, max_embeddings: int=16):
        self.data = pd.DataFrame(records)
        for fields in self.text_fields:
            cv = TfidfVectorizer(**vectorisor_params)
            X = cv.fit_transform(self.data[fields])
            max_words = max([len(sentence.split(" ")) for sentence in self.data[field]])
            if max_words > 3*max_embeddings:
                svd = TruncatedSVD(random_state=42, n_components=max_embeddings)
                X = svd.fit_transform(X)
                self.embeddors[fields] = svd
            self.matrices[fields] = X
            self.vectorisors[fields] = cv

    def search_top_results(self,
                           query: str,
                           filters: dict={},
                           boost: dict={},
                           num_results: int=5):

        score = np.zeros(self.data.shape[0])
        for fields in self.text_fields:
            processed_query = self.vectorisors[fields].transform([query])
            emb_query = self.embeddors[fields].transform(processed_query) if self.embeddors.get(fields) else processed_query.copy()
            f_score = cosine_similarity(self.matrices[fields], emb_query).flatten()
            
            importance = boost.get(fields, 1.0)
            score = score + f_score*importance
        for f in filters:
            mask = (self.data[f] == filters[f]).astype(int)
            score = score*mask

        top_indexes = np.argsort(score)[-num_results:]
        results = self.data.iloc[top_indexes]
        return results.to_dict(orient='records') 

In [42]:
sse = SearchServiceWithEmbeddings(text_fields=text_columns)
sse.fit(
    data.to_dict(orient='records'),
    vectorisor_params={"stop_words": "english", "min_df": 5}
)

In [43]:
sse.search_top_results(query='I just signed up, is it too late to start the course?',
    num_results=5,
    boost={'question': 2},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
  'section': 'General course-related questions',
  'question': 'Course - Can I get support if I take the course in the self-paced mode?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "No, you can only get a certificat

**Lets try with NMF**

In [44]:
class SearchServiceWithEmbeddings:
    def __init__(self, text_fields: list[str]):
        self.data = pd.DataFrame()
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorisors = {}
        self.embeddors = {}

    def fit(self, records: dict, vectorisor_params: dict={}, max_embeddings: int=16):
        self.data = pd.DataFrame(records)
        for fields in self.text_fields:
            cv = TfidfVectorizer(**vectorisor_params)
            X = cv.fit_transform(self.data[fields])
            max_words = max([len(sentence.split(" ")) for sentence in self.data[field]])
            if max_words > 3*max_embeddings:
                svd = NMF(random_state=42, n_components=max_embeddings, max_iter=1000)
                X = svd.fit_transform(X)
                self.embeddors[fields] = svd
            self.matrices[fields] = X
            self.vectorisors[fields] = cv

    def search_top_results(self,
                           query: str,
                           filters: dict={},
                           boost: dict={},
                           num_results: int=5):

        score = np.zeros(self.data.shape[0])
        for fields in self.text_fields:
            processed_query = self.vectorisors[fields].transform([query])
            emb_query = self.embeddors[fields].transform(processed_query) if self.embeddors.get(fields) else processed_query.copy()
            f_score = cosine_similarity(self.matrices[fields], emb_query).flatten()
            
            importance = boost.get(fields, 1.0)
            score = score + f_score*importance
        for f in filters:
            mask = (self.data[f] == filters[f]).astype(int)
            score = score*mask

        top_indexes = np.argsort(score)[-num_results:]
        results = self.data.iloc[top_indexes]
        return results.to_dict(orient='records') 

In [45]:
sse = SearchServiceWithEmbeddings(text_fields=text_columns)
sse.fit(
    data.to_dict(orient='records'),
    vectorisor_params={"stop_words": "english", "min_df": 5}
)

In [46]:
sse.search_top_results(query='I just signed up, is it too late to start the course?',
    num_results=5,
    boost={'question': 2},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're stil

**Now we will try bert to get some semantics going**

In [47]:
import torch
from transformers import BertModel, BertTokenizer
from tqdm.auto import tqdm

In [48]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [49]:
my_list = [1,2,3,4,5,3,2,3,4,5,1]
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

# Testing my make batches

In [50]:
make_batches(my_list, 3)

[[1, 2, 3], [4, 5, 3], [2, 3, 4], [5, 1]]

In [51]:
def make_batches(seq, batch_size: int):
    result = []
    for i in range(0, len(seq), batch_size):
        batch = seq[i:i+batch_size]
        result.append(batch)
    return result

In [52]:
data = ["Andes wakes in the morning and calls his dad, saying `ba` `ba` `ga`",
        "Dad is always running late for the morning shift with the baby",
        "Andes the baby is ready to play as soon he wakes up",
        "Andes the baby doesnt enjoy the morning routine around changing nappy, brushing his teeth, and putting on clothes",
        "Mom wants Andes to eat something like a porridge, or some food, but Andes most days reject and just want mum's milk",
        "Andes loves to chase the cats and play with the ball"]

df = pd.DataFrame(data={"text":data})

In [53]:
df

Unnamed: 0,text
0,"Andes wakes in the morning and calls his dad, ..."
1,Dad is always running late for the morning shi...
2,Andes the baby is ready to play as soon he wak...
3,Andes the baby doesnt enjoy the morning routin...
4,Mom wants Andes to eat something like a porrid...
5,Andes loves to chase the cats and play with th...


In [54]:
def compute_embeddings(texts, batch_size=8):
    
    #Creating and running  in batches
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):

        # Encoding the text to send to bert model
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            # Running the torch without gradient descent
            
            # Getting the output by calling the model
            outputs = model(**encoded_input)

            # Extracting the hidden state embeddings
            hidden_states = outputs.last_hidden_state

            # Getting the mean of the embeddings
            batch_embeddings = hidden_states.mean(dim=1)

            # Getting them in an array
            batch_embeddings_np = batch_embeddings.cpu().numpy()

            # Appending the embeddings
            all_embeddings.append(batch_embeddings_np)

    # Stacking all the batched embeddings, as a singular array
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [55]:
X_text = compute_embeddings(df.text.to_list(), batch_size=2)

  0%|          | 0/1 [00:00<?, ?it/s]

In [56]:
query='What are morning tasks for dad?'
query_emb = compute_embeddings([query])

  0%|          | 0/1 [00:00<?, ?it/s]

In [57]:
score = cosine_similarity(X_text, query_emb).flatten()
top_results = np.argsort(-score)[:2]
df.iloc[top_results].to_dict(orient='records')

[{'text': 'Andes the baby doesnt enjoy the morning routine around changing nappy, brushing his teeth, and putting on clothes'},
 {'text': 'Dad is always running late for the morning shift with the baby'}]