In [1]:
import pandas as pd
import requests
import json

In [2]:
# Location of the course documents JSON (served as raw content from GitHub).
url = (
    "https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop"
    "/refs/heads/main/notebooks/documents.json"
)

In [3]:
# Download the documents JSON. The timeout prevents the cell from hanging
# forever on a stalled connection, and raise_for_status() fails fast on an
# HTTP error instead of trying to parse an error page as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
docs_response = response.json()
documents = []

In [4]:
# Flatten the per-course payload into a single list of records, tagging each
# record with the course it belongs to.
# The list is rebuilt from scratch (rather than appended to a list created in
# another cell) so that re-running this cell does not duplicate documents, and
# each record is copied so the downloaded payload itself is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in docs_response
    for doc in course['documents']
]

In [5]:
# One row per document; columns come from the keys of each record.
data = pd.DataFrame(documents)
data.head(n=5)

Unnamed: 0,text,section,question,course
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp
2,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp
3,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp


Vector spaces
- turn the docs into vectors
- term-document matrix:
  - rows: documents
  - columns: words/tokens
- bag of words
    - word order is lost: the representation no longer records where a word appears in the sentence

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Bag-of-words vocabulary learned from the document texts: English stop words
# are removed and terms must occur in at least 3 documents (min_df=3).
cv = CountVectorizer(
    stop_words='english',
    min_df=3,
)
cv.fit(data['text'])

In [12]:
# Sparse term-count matrix: one row per document, one column per vocabulary term.
processed_data = cv.transform(data['text'])

In [13]:
# Densify the count matrix and label each column with its vocabulary term.
df_processed = pd.DataFrame(
    data=processed_data.todense(),
    columns=cv.get_feature_names_out(),
)

In [14]:
# Peek at a random subset of terms (rows) against documents (columns).
# A fixed seed keeps the displayed sample stable across Restart & Run All.
df_processed.T.sample(n=50, random_state=42)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
ls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
low_memory,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
programming,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cover,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fail,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
encountering,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
normally,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
numerical,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
icon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# TF-IDF vectorizer configured with the same vocabulary filters as the
# CountVectorizer above (English stop words removed, min_df=3).
tfidf = TfidfVectorizer(
    min_df=3,
    stop_words='english',
)

In [17]:
# Learn the vocabulary and IDF weights from the document texts.
tfidf.fit(data['text'])

In [18]:
# Transform the document texts, not the DataFrame itself: iterating a
# DataFrame yields its column labels, so passing `data` would vectorize the
# header strings instead of the documents.
processed_data = tfidf.transform(data.text)

In [None]:
# Label the columns with the vocabulary of the vectorizer that actually
# produced this matrix (tfidf), not the earlier CountVectorizer (cv).
df_processed = pd.DataFrame(
    data=processed_data.todense(),
    columns=tfidf.get_feature_names_out(),
)