# Embedding the Questrom course webpages

In [1]:
from google.cloud import aiplatform

aiplatform.init(project="bu-chatbot-388316")

from vertexai.preview.language_models import TextEmbeddingModel
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import os
import html2text


import numpy as np  # Ensure numpy is imported

# Initialize Google AI Platform
aiplatform.init(project="bu-chatbot-388316", location="us-central1")

# Directory containing HTML files
directory = "/workspaces/BU_Chatbot/Questrom_Course_Info"

# Get list of all HTML files
html_files = [f for f in os.listdir(directory) if f.endswith('.html')]

# Dictionary to store each file's content
html_contents = {}

for html_file in html_files:
    with open(f'{directory}/{html_file}', 'r') as f:
        html_content = f.read()
        h1 = html2text.HTML2Text()
        h2 = h1.handle(html_content)
        html_contents[html_file] = h2

# Load model once
model = TextEmbeddingModel.from_pretrained("textembedding-gecko")

# Counter to track number of API requests
api_request_counter = 0


def text_embedding(text_string):
    """Text embedding with a Large Language Model."""
    print(f"API Request Count: {api_request_counter}")
    embeddings = model.get_embeddings([text_string])
    for embedding in embeddings:
        vector = np.array(embedding.values)  # Ensure embedding is a numpy array
        return vector

# Get embeddings for each html file
text1 = list(html_contents.keys())

text2 = []

for content in html_contents.values():
    # if api_request_counter < 17:  # Model's API rate limit (17 is a placeholder, adjust to your actual limit)
    #     text2.append(text_embedding(content))
    #     api_request_counter += 1
    # else:
    #     # Create a zero array of the appropriate embedding size if limit is exceeded
    #     text2.append(np.zeros(768))  # Ensure embedding size matches your model's

    text2.append(text_embedding(content))
    api_request_counter += 1
    
# Creating the DataFrame
df = pd.DataFrame(text1, columns=["text1"])
df["embeddings"] = text2

df.head(50)



API Request Count: 0
API Request Count: 1
API Request Count: 2
API Request Count: 3
API Request Count: 4
API Request Count: 5
API Request Count: 6
API Request Count: 7
API Request Count: 8
API Request Count: 9
API Request Count: 10
API Request Count: 11
API Request Count: 12
API Request Count: 13
API Request Count: 14
API Request Count: 15
API Request Count: 16
API Request Count: 17
API Request Count: 18
API Request Count: 19
API Request Count: 20
API Request Count: 21
API Request Count: 22
API Request Count: 23
API Request Count: 24
API Request Count: 25
API Request Count: 26
API Request Count: 27
API Request Count: 28
API Request Count: 29
API Request Count: 30
API Request Count: 31
API Request Count: 32
API Request Count: 33
API Request Count: 34
API Request Count: 35
API Request Count: 36
API Request Count: 37
API Request Count: 38
API Request Count: 39
API Request Count: 40
API Request Count: 41
API Request Count: 42
API Request Count: 43
API Request Count: 44
API Request Count: 4

Unnamed: 0,text1,embeddings
0,bu.edu-academics-questrom-courses-qst-mk-867-....,"[-0.004325422924011946, -0.04752451553940773, ..."
1,bu.edu-academics-questrom-courses-qst-is-498-....,"[-0.004595464561134577, -0.042916011065244675,..."
2,bu.edu-academics-questrom-courses-qst-mo-430-....,"[-0.00987178459763527, -0.050861671566963196, ..."
3,bu.edu-academics-questrom-courses-qst-es-735-....,"[-0.015086178667843342, -0.06140953674912453, ..."
4,bu.edu-academics-questrom-courses-qst-si-498-....,"[-0.009621339850127697, -0.04652352258563042, ..."
5,bu.edu-academics-questrom-courses-qst-ac-998-....,"[-0.007730515208095312, -0.055893201380968094,..."
6,bu.edu-academics-questrom-courses-qst-qm-899-....,"[-0.0034074829891324043, -0.05145184323191643,..."
7,bu.edu-academics-questrom-courses-qst-mf-990-....,"[-0.001197927980683744, -0.048734068870544434,..."
8,bu.edu-academics-questrom-courses-qst-is-479-....,"[-0.0015936787240207195, -0.04541222006082535,..."
9,bu.edu-academics-questrom-courses-qst-fe-460-....,"[-0.007622821722179651, -0.05097649618983269, ..."


# Calculate the similarity score for the target string

In [4]:
# Embedding for the target string
target_string = "sm 132"
target_embedding = text_embedding(target_string)

# Calculate similarity score
similarity_scores = cosine_similarity(np.vstack(df['embeddings'].values), [target_embedding])  # Ensure cosine_similarity receives numpy arrays
df['similarity_score'] = similarity_scores

# Sorting the dataframe by similarity scores in descending order and getting the top 3
df = df.sort_values(by='similarity_score', ascending=False)

df.head(50)

API Request Count: 433


Unnamed: 0,text1,embeddings,similarity_score
349,bu.edu-academics-questrom-courses-qst-sm-275-....,"[-0.01035457942634821, -0.05419669672846794, 0...",0.60972
94,bu.edu-academics-questrom-courses-qst-sm-132-....,"[0.000735165667720139, -0.034375086426734924, ...",0.608682
316,bu.edu-academics-questrom-courses-qst-fe-712-....,"[-0.002177775837481022, -0.053912509232759476,...",0.607697
429,bu.edu-academics-questrom-courses-qst-mk-469-....,"[-0.005400892347097397, -0.051528409123420715,...",0.602867
143,bu.edu-academics-questrom-courses-qst-mf-728-....,"[0.0017214161343872547, -0.05224233493208885, ...",0.602591
55,bu.edu-academics-questrom-courses-qst-mo-856-....,"[-0.0010086417896673083, -0.05264900624752045,...",0.602403
33,bu.edu-academics-questrom-courses-qst-pl-855-....,"[-0.013534631580114365, -0.0349092073738575, -...",0.600646
412,bu.edu-academics-questrom-courses-qst-es-740-....,"[-0.019551245495676994, -0.06512408703565598, ...",0.600101
263,bu.edu-academics-questrom-courses-qst-mk-442-....,"[-0.005877649411559105, -0.05679074674844742, ...",0.599893
241,bu.edu-academics-questrom-courses-qst-pl-851-....,"[-0.01635211892426014, -0.04593505710363388, -...",0.598709


## Helper function

In [5]:
import re

def remove_non_letters(text):
    """Removes non-letter characters from a string."""
    return re.sub(r'[^a-zA-Z]', ' ', text)

# Retrieve the top three most similar files and clean the HTML text

In [6]:
from vertexai.preview.language_models import (ChatModel, InputOutputTextPair)

chat_model = ChatModel.from_pretrained("chat-bison@001")

import html2text


# top1 = [top_3_courses[2]]
df_top_3 = df.head(3)
df_top_3.head(3)

top_3_courses = df_top_3['text1'].tolist()


print(f"Top 3 courses similar to {target_string}: {top_3_courses}")


text = ""
for html_file in top_3_courses:
    with open(f'{directory}/{html_file}', 'r') as f:
        html_content = f.read()
        h1 = html2text.HTML2Text()
        h1.ignore_links = True
        h2 = h1.handle(html_content)
        h3 = h2[:2000]
        text = text + h3 + "\n"
        # print(h2)


# text = text.replace("  ", " ")
# text = text[:4000]

# with open('Questrom_Course_Info/bu.edu-academics-questrom-courses-qst-sm-132-.html', 'r') as f:
#     html_content = f.read()
#     h1 = html2text.HTML2Text()
#     h1.ignore_links = True
#     h2 = h1.handle(html_content)
#     text = h2

# print(text)


Top 3 courses similar to sm 132: ['bu.edu-academics-questrom-courses-qst-sm-275-.html', 'bu.edu-academics-questrom-courses-qst-sm-132-.html', 'bu.edu-academics-questrom-courses-qst-fe-712-.html']


# Input the prompt and context into the chat model: chat-bison

In [7]:
chat = chat_model.start_chat(
    context=f"Here is the text from BU course webpages {text}",
    examples=[
        InputOutputTextPair(
            input_text="Give me a description of the course.",
            output_text="Introduces the basic principles, methods, and challenges of modern managerial accounting. Covers traditional topics such as job-order costing, cost-volume-profit analysis, budgeting and variance analysis, profitability analysis, relevant costs for decision making, and cost-plus pricing, as well as emerging topics such as Activity-Based Cost (ABC) accounting. ",
        ),
        InputOutputTextPair(
            input_text="What are the prerequesits for the course?",
            output_text="QST SM131; CAS MA120, MA121, or MA123 previous or concurrent; sophomore standing.",
        ),
    ],
    temperature=0.1,
    max_output_tokens=100,
    top_p=0.001,
    top_k=0.01,
)

prompt = "describe the course Measuring Financial Value"

print(chat.send_message(f"{prompt}")) 

I'm not able to help with that, as I'm only a language model. If you believe this is an error, please send us your feedback.
