# Text-based lexical (keyword) search engine:

# Install

In [None]:
!pip install requests pandas scikit-learn numpy

# 1.1 Load JSON from GitHub

In [54]:
import requests
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Raw GitHub URL of the JSON file that holds the courses and their documents.
docs_url = 'https://raw.githubusercontent.com/Mamdouh-Muhammad/llm/refs/heads/main/rk20.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error (e.g. 404) instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# 1.2 Flatten the JSON into a list of documents

In [55]:
# Flatten the nested course -> documents structure into one flat list of records,
# tagging every record with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Append a shallow copy rather than writing into `doc`, so that
        # `documents_raw` keeps exactly what was downloaded.
        documents.append({**doc, 'course': course_name})

# 1.3 Create a Pandas DataFrame

In [56]:
# Tabulate the flattened records; `columns` fixes which fields are kept and in what order.
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)

# (Optional) Implementing text search using Scikit-Learn
# (Bag-of-Words ignores grammar, word order, and syntax; only word frequency matters)

In [None]:
# Toy corpus for the vectorizer demos below: one sentence per "document".
example_text = [
    # Course overview
    "The lecture provides an introduction to the fundamentals of computer networking",
    "It follows a top-down approach through the five key layers of the Internet protocol stack",
    # One sentence per protocol layer
    "Application Layer: Covers protocols such as HTTP, DNS, and SMTP",
    "Transport Layer: Focuses on TCP, UDP, flow control, and congestion control",
    "Network Layer: Includes IP addressing, routing algorithms, and packet forwarding",
    "Data Link Layer: Discusses access methods, error detection, and correction techniques",
    "Physical Layer: Introduces transmission media, signals, and modulation techniques",
    # Cross-cutting topics and course goals
    "Security is addressed as a cross-cutting concern across all layers",
    "Analytical models are used to understand performance metrics and network design",
    "The exercise sessions include both practical programming tasks and theoretical problem sets",
    "The goal is to develop a deep understanding of the structure and operation of modern communication networks",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: count term occurrences, dropping scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words='english')
# Learn the vocabulary from example_text and build the document-term count matrix in one step.
X = cv.fit_transform(example_text)

In [None]:
# Vocabulary terms learned by the vectorizer currently bound to `cv`;
# the next cell uses them as labels for the columns of X.
names = cv.get_feature_names_out()
names

In [None]:
# Dense view of the count matrix, flipped so each row is a term and each column a document.
df_docs = pd.DataFrame(X.toarray(), columns=names).transpose()
df_docs

# Implementing text search using TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same pipeline as the bag-of-words demo, but producing TF-IDF weights instead of raw counts.
# NOTE: this rebinds `cv` and `X`, replacing the CountVectorizer objects from the cells above.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(example_text)

In [None]:
# Refresh `names` so it matches the vectorizer that `cv` refers to now.
names = cv.get_feature_names_out()
names

In [None]:
# One row per example sentence, one column per vocabulary term.
df_docs = pd.DataFrame(data=X.toarray(), columns=names)
# Rounding only affects the displayed copy; df_docs keeps the full-precision weights.
df_docs.round(decimals=2)

# Query-Document Similarity

In [None]:
query = "What is application layer?"

# transform (not fit_transform): the query is encoded with the vocabulary the
# vectorizer has already learned, so it gets one column per entry of `names`.
q = cv.transform([query])
# Show the sparse query vector as a dense array.
q.toarray()

In [None]:
# Pair every vocabulary term with its weight in the query vector.
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}

In [None]:
# Dot product of every document row in X with the query vector: one score per document.
(X @ q.T).toarray()

# 1.4 Setup for TF-IDF Vectorization

In [57]:
# DataFrame columns that each get their own TF-IDF model.
fields = ['section', 'question', 'text']
# Per-field registries, keyed by field name: fitted vectorizers and their matrices.
transformers, matrices = {}, {}

# 1.5 TF-IDF Transformation Loop

In [58]:
# Fit one independent TF-IDF model per text field and register it under the field name.
# Loop-specific names are used on purpose: rebinding `cv` / `X` here would overwrite
# the vectorizer and matrix of the example_text demo and make its query cells
# silently use the wrong model when re-run.
for field in fields:
    # min_df=2 keeps only terms that appear in at least two rows of this field.
    vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
    # Missing values become empty strings; any non-string value is stringified.
    field_matrix = vectorizer.fit_transform(df[field].fillna('').astype(str))
    transformers[field] = vectorizer
    matrices[field] = field_matrix

# 1.6 Access Features and Matrix

In [72]:
# A cell only echoes its last expression, so the vocabulary is shown explicitly;
# otherwise the feature names would be computed and thrown away.
display(transformers['question'].get_feature_names_out())
matrices['question']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 29 stored elements and shape (19, 11)>

In [99]:
query = "What social events are planned near semester end?"
top_k = 2  # maximum number of results to keep

# Encode the query with the vectorizer fitted on the 'question' column and
# compare it against every question; flatten() turns the (n_docs, 1) result into 1-D.
query_vector = transformers['question'].transform([query])
score = cosine_similarity(matrices['question'], query_vector).flatten()

# Highest score first. A stable sort resolves ties by original row order, so
# equally scored documents come back in the same order on every run.
indices = np.argsort(-score, kind='stable')[:top_k]
# A zero score means the document shares no vocabulary term with the query;
# such rows are not matches and are dropped (the result may have < top_k rows).
indices = indices[score[indices] > 0]

results = df.iloc[indices].copy()
results['similarity_score'] = score[indices]

In [100]:
# Lift the column-width limit so long answer texts are printed in full.
pd.set_option('display.max_colwidth', None)

shown_columns = ['text', 'similarity_score']
print("\nTop 2 Matching Results:")
print(results[shown_columns])


Top 2 Matching Results:
                                                                                                                                                                                                                                                                                      text  \
1  [Check the expected number of students that will enroll., Plan the number of tutors and time slots based on expected student count., Contact students to join as tutors, starting with former tutors., Email previous year students based on RK grades and notify Erika for paperwork.]   
8                                                                                                                                                                                                                                    [Invite tutors to the yearly Lehrstuhlgrillen event.]   

   similarity_score  
1               1.0  
8               1.0  
