In [25]:
import numpy as np
import pandas as pd
import requests

In [26]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.5.1
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /Users/jagathkumarreddyk/miniconda3/envs/dsprojects/lib/python3.11/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: 


## Importing FAQs from github and transforiming it into dataframe

In [27]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)


In [28]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [29]:
df = pd.DataFrame(documents, columns = ['course', 'question', 'section', 'text'])

In [30]:
df.sample(3)

Unnamed: 0,course,question,section,text
159,data-engineering-zoomcamp,Error: error starting userland proxy: listen t...,Module 1: Docker and Terraform,Resolution: You need to stop the services whic...
80,data-engineering-zoomcamp,Docker-Compose - Data retention (could not tr...,Module 1: Docker and Terraform,After executing `docker-compose up` - if you l...
7,data-engineering-zoomcamp,Course - Can I follow the course after it fini...,General course-related questions,"Yes, we will keep all the materials after the ..."


In [31]:
df_de = df[df['course']=='data-engineering-zoomcamp'].copy()

In [32]:
df_de.sample(3)

Unnamed: 0,course,question,section,text
121,data-engineering-zoomcamp,Python - Pandas can read *.csv.gzip,Module 1: Docker and Terraform,"When a CSV file is compressed using Gzip, it i..."
281,data-engineering-zoomcamp,How to automatically infer the column data typ...,Module 4: analytics engineering with dbt,"Problem: when injecting data to bigquery, you ..."
136,data-engineering-zoomcamp,GCP VM - mkdir: cannot create directory ‘.ssh’...,Module 1: Docker and Terraform,I am trying to create a directory but it won't...


In [33]:
len(df_de), len(df)

(435, 948)

## Text searching - vector spaces
- turn the docs into vectors
- term-document matrix:
  - rows : documents
  - columns: words/tokens
- Bag of words:
   - the order of the words is lost
   - semantic meaning is comprised 

In [34]:
##Now dispecting the questions into words/tokens for easier search
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
cv = CountVectorizer(stop_words = 'english',min_df = 5)

In [36]:
cv.fit(df['question'])

In [37]:
cv.get_feature_names_out(), len(cv.get_feature_names_out())

(array(['10', '11', '2022', '403', '5432', 'access', 'account', 'accuracy',
        'address', 'airflow', 'alternative', 'api', 'argument',
        'attribute', 'attributeerror', 'available', 'aws', 'bad',
        'bigquery', 'bq', 'browser', 'bucket', 'build', 'building',
        'built', 'change', 'check', 'class', 'cli', 'cloud', 'code',
        'codespaces', 'colab', 'column', 'columns', 'com', 'command',
        'commands', 'commit', 'compose', 'compute', 'conda', 'connect',
        'connecting', 'connection', 'contain', 'container', 'copy',
        'course', 'create', 'created', 'creating', 'credentials', 'csv',
        'daemon', 'data', 'dataframe', 'dataset', 'datasets', 'date',
        'dbt', 'denied', 'dependencies', 'deploy', 'dictvectorizer', 'did',
        'difference', 'different', 'directly', 'directory', 'docker',
        'does', 'doesn', 'environment', 'error', 'errors', 'executing',
        'exist', 'external', 'fact_trips', 'failed', 'fails', 'fatal',
        'featur

In [38]:
t = cv.transform(df['text'])
t.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [39]:
pd.DataFrame(t.todense(), columns = cv.get_feature_names_out())

Unnamed: 0,10,11,2022,403,5432,access,account,accuracy,address,airflow,...,way,week,wget,windows,work,working,write,wsl,xgboost,yaml
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
944,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
946,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Term-Frequecy Inverse Document-Frequency 
- this enables to understand the importance of a word in a text

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
tfv = TfidfVectorizer(min_df = 5,stop_words = 'english')
tfv.fit(df['text'])

In [42]:
tfv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [43]:
g = tfv.transform(df['text'])
g.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.42896052],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.14842753]])

In [44]:
df_tf = pd.DataFrame(g.todense(), columns = tfv.get_feature_names_out())
df_tf.round(2)

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,y_val,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.43
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.28,0.00,0.0,0.0,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.11,0.0,0.0,0.00
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.17,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00


In [45]:
query = 'Do i have to be a master in Python to be successful in this course?'
q_tfv = tfv.transform([query])
q_tfv.todense()


matrix([[0., 0., 0., ..., 0., 0., 0.]])

### Query and cosine similarity
- the query is tfidf vectorized and compared to all the documents
- the comparision is done using cosine similarity or simply dot product
- the document with highest score is the most revelant

In [56]:
from sklearn.metrics.pairwise import cosine_similarity


In [57]:
## Simple dot product
g.dot(q_tfv.T)
#print(f'The cosine similarity array is {g.dot(q_tfv.T).todense()}')
## index of max of cosine similarity arry/matrix
ans_ind = g.dot(q_tfv.T).argmax()
ans_ind

27

In [58]:
score = cosine_similarity(g,q_tfv).flatten()
score

array([0.29838175, 0.        , 0.        , 0.        , 0.07561936,
       0.        , 0.        , 0.20660497, 0.        , 0.        ,
       0.        , 0.18675813, 0.        , 0.        , 0.        ,
       0.11392012, 0.        , 0.        , 0.08859977, 0.11103847,
       0.        , 0.07931957, 0.25100041, 0.1238019 , 0.        ,
       0.        , 0.        , 0.32298781, 0.08332405, 0.        ,
       0.        , 0.        , 0.        , 0.06076033, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.08400764, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.03786343, 0.04142573, 0.0921833 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.06390916, 0.        , 0.        , 0.        , 0.01154208,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [59]:
np.argsort(score)[-5:]

array([596,   0, 764, 353,  27])

In [60]:
df.loc[27,['question','text']]

question    Environment - The GCP and other cloud provider...
text        You can do most of the course without a cloud....
Name: 27, dtype: object

In [61]:
## Lets get cosine score for text, question, section/course and get the sum of the scores for finding a most relevant 
fields  = {'section', 'question', 'text', 'course'}
matrices = {}
vectorizers = {}

for f in fields:
    tf_f = TfidfVectorizer(min_df = 5, stop_words = 'english')
    tf_f.fit(df[f])
    matrices[f] = tf_f.transform(df[f])
    vectorizers[f] = tf_f


In [62]:
vectorizers

{'course': TfidfVectorizer(min_df=5, stop_words='english'),
 'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english')}

In [63]:
matrices

{'course': <948x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 2706 stored elements in Compressed Sparse Row format>,
 'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>}

In [112]:
## writing a function to report sum of cosine similarity of all fields
def combined_CS_scores(df,query,vectorizers,matrices,fields):
    n = np.zeros(len(df))
    for f in fields:
        v = vectorizers[f]
        m = matrices[f]
        q_v = v.transform([query])
        n += cosine_similarity(m,q_v).flatten()
    return n, np.argsort(n)[-5:]

query = 'Do i have to be a master in Python to be successful in this course?'
vectorizers = vectorizers
matrices = matrices
df = df
fields = fields
score, indexes = combined_CS_scores(df,query,vectorizers,matrices,fields)
indexes

array([  1, 452,   4,   0,   7])

In [111]:
print(f'{query} \n')
for i in indexes:
    print(f'for {i} ---> \n question = {df.loc[i,"question"]} \n {df.loc[i,"text"]}')

Do i have to be a master in Python to be successful in this course? 

for 1 ---> 
 question = Course - What are the prerequisites for this course? 
 GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites
for 452 ---> 
 question = I just joined. What should I do next? How can I access course materials? 
 Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.
Click on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.
Or you can just use this link: http://mlzoomcamp.com/#syllabus
for 4 ---> 
 question = Course - What can I do before the course starts? 
 You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installe

In [65]:
for i in indexes:


SyntaxError: incomplete input (4216323989.py, line 1)

### Example excerise

In [66]:
##Example excerise
raw_doc = {"""
I was supposed to be sent away
But they forgot to come and get me
I was a functioning alcoholic
'Til nobody noticed my new aesthetic
All of this to say I hope you're okay
But you're the reason
And no one here's to blame
But what about your quiet treason? """,


"""And for a fortnight there, we were forever
Run into you sometimes, ask about the weather
Now you're in my backyard, turned into good neighbors
Your wife waters flowers, I wanna kill her""",

"""All my mornings are Mondays stuck in an endless February
I took the miracle move-on drug, the effects were temporary
And I love you, it's ruining my life
I love you, it's ruining my life
I touched you for only a fortnight
I touched you, but I touched you""",

"""And for a fortnight there, we were forever
Run into you sometimes, ask about the weather
Now you're in my backyard, turned into good neighbors
Your wife waters flowers, I wanna kill her
And for a fortnight there, we were together
Run into you sometimes, comment on my sweater
Now you're at the mailbox, turned into good neighbors
My husband is cheating, I wanna kill him' """}


In [67]:
cv_1 = CountVectorizer()
cv_1.fit(raw_doc)

In [68]:
allwords = cv_1.get_feature_names_out()
cv_1.get_feature_names_out() , len(cv_1.get_feature_names_out())

(array(['about', 'aesthetic', 'alcoholic', 'all', 'an', 'and', 'are',
        'ask', 'at', 'away', 'backyard', 'be', 'blame', 'but', 'cheating',
        'come', 'comment', 'drug', 'effects', 'endless', 'february',
        'flowers', 'for', 'forever', 'forgot', 'fortnight', 'functioning',
        'get', 'good', 'her', 'here', 'him', 'hope', 'husband', 'in',
        'into', 'is', 'it', 'kill', 'life', 'love', 'mailbox', 'me',
        'miracle', 'mondays', 'mornings', 'move', 'my', 'neighbors', 'new',
        'no', 'nobody', 'noticed', 'now', 'of', 'okay', 'on', 'one',
        'only', 'quiet', 're', 'reason', 'ruining', 'run', 'say', 'sent',
        'sometimes', 'stuck', 'supposed', 'sweater', 'temporary', 'the',
        'there', 'they', 'this', 'til', 'to', 'together', 'took',
        'touched', 'treason', 'turned', 'wanna', 'was', 'waters', 'we',
        'weather', 'were', 'what', 'wife', 'you', 'your'], dtype=object),
 92)

In [69]:
z = cv_1.transform(raw_doc)
z.todense()

matrix([[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
         1, 1, 1, 1, 0, 1, 2, 1],
        [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
         0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 2, 2, 0,
         0, 1, 1, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2,
         0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 5, 0],
        [1, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 3, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 2, 1, 0,
         0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 4, 0, 0, 0, 1, 0, 0, 2,
         0, 0, 0, 0, 1, 0, 2, 1],
        [1, 0, 0, 0, 0, 2, 0, 1, 1

In [70]:
pd.DataFrame(z.todense(), columns = cv_1.get_feature_names_out())

Unnamed: 0,about,aesthetic,alcoholic,all,an,and,are,ask,at,away,...,wanna,was,waters,we,weather,were,what,wife,you,your
0,1,0,0,0,0,1,0,1,0,0,...,1,0,1,1,1,1,0,1,2,1
1,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,5,0
2,1,1,1,1,0,2,0,0,0,1,...,0,2,0,0,0,0,1,0,2,1
3,1,0,0,0,0,2,0,1,1,0,...,2,0,1,2,1,2,0,1,4,1


In [71]:
cv_2 = CountVectorizer(stop_words = 'english')

In [72]:
cv_2.fit(raw_doc)

In [73]:
cv_2.get_feature_names_out(), len(cv_2.get_feature_names_out())

(array(['aesthetic', 'alcoholic', 'ask', 'away', 'backyard', 'blame',
        'cheating', 'come', 'comment', 'drug', 'effects', 'endless',
        'february', 'flowers', 'forever', 'forgot', 'fortnight',
        'functioning', 'good', 'hope', 'husband', 'kill', 'life', 'love',
        'mailbox', 'miracle', 'mondays', 'mornings', 'neighbors', 'new',
        'noticed', 'okay', 'quiet', 'reason', 'ruining', 'run', 'say',
        'sent', 'stuck', 'supposed', 'sweater', 'temporary', 'til', 'took',
        'touched', 'treason', 'turned', 'wanna', 'waters', 'weather',
        'wife'], dtype=object),
 51)

In [74]:
z_1 = cv_2.transform(raw_doc)
z_1.todense()

matrix([[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 2, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1,
         0, 1, 3, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
         1, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 2, 0, 2, 0, 1,
         2, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 2, 2, 1, 1, 1]])

In [75]:
pd.DataFrame(z_1.todense(), columns = cv_2.get_feature_names_out()).T

Unnamed: 0,0,1,2,3
aesthetic,0,0,1,0
alcoholic,0,0,1,0
ask,1,0,0,1
away,0,0,1,0
backyard,1,0,0,1
blame,0,0,1,0
cheating,0,0,0,1
come,0,0,1,0
comment,0,0,0,1
drug,0,1,0,0
