# Preparing FAQ Vectorstore

In [2]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

In [3]:
# Load settings from a local .env file so the Pinecone variables read in the
# next cell are available via os.getenv.
load_dotenv()

True

In [4]:
# Make sure the Pinecone settings are present before anything tries to use them.
# os.getenv returns None for a missing variable, and assigning None into
# os.environ fails with an unhelpful TypeError, so check explicitly and raise a
# message that names the missing variable.
for _var_name in ('PINECONE_API_KEY', 'PINECONE_INDEX_NAME'):
    _var_value = os.getenv(_var_name)
    if _var_value is None:
        raise RuntimeError(
            f"Environment variable {_var_name} is not set; "
            "define it in the .env file or in the shell environment."
        )
    os.environ[_var_name] = _var_value

In [4]:
# Load the raw course dataset (path is relative to the notebook's working directory).
df=pd.read_csv("./Dataset/appendix.csv")

1. Advanced/Beginner separation -> %enrolled based (modify column)
2. Learn at own pace -> always true
3. what kind of assignment do you give -> static answer
4. Prerequisites -> No prerequisites (add column)
5. Course duration -> total course hours
6. Course Cost -> add column with random values
7. Doubt Support -> static answer
8. Placement Opportunities -> add random column
9. How is course different from others -> static answer
10. Money-back guarantee -> static answer
11. Course relevancy -> use Launch date column to answer
12. Is good DSA a prerequisite -> leave to model
13. How much I can make -> generate random column for range
14. is EMI Allowed -> if (paid) -> random
15. Financial Aid -> Add column (random)
16. Certification -> certification column
17. Where to get testimonials -> static answer(in course reviews)
18. Mentor -> Instructor
19. Online or Offline -> Add random column

In [5]:
# List the columns available in the raw dataset.
df.columns

Index(['Institution', 'Course Number', 'Launch Date', 'Course Title',
       'Instructors', 'Course Subject', 'Year', 'Honor Code Certificates',
       'Participants (Course Content Accessed)',
       'Audited (> 50% Course Content Accessed)', 'Certified', '% Audited',
       '% Certified', '% Certified of > 50% Course Content Accessed',
       '% Played Video', '% Posted in Forum', '% Grade Higher Than Zero',
       'Total Course Hours (Thousands)', 'Median Hours for Certification',
       'Median Age', '% Male', '% Female', '% Bachelor's Degree or Higher'],
      dtype='object')

In [6]:
# Keep only the columns the FAQ features are derived from.
# `.copy()` makes the result an independent DataFrame: the following cells add
# new columns to `df`, and assigning into a bare column subset makes pandas
# emit SettingWithCopyWarning.
df = df[[
    'Launch Date',
    'Course Title',
    'Instructors',
    'Course Subject',
    'Honor Code Certificates',
    'Total Course Hours (Thousands)',
    "% Bachelor's Degree or Higher",
]].copy()

In [7]:
# Confirm that only the selected columns remain.
df.columns

Index(['Launch Date', 'Course Title', 'Instructors', 'Course Subject',
       'Honor Code Certificates', 'Total Course Hours (Thousands)',
       '% Bachelor's Degree or Higher'],
      dtype='object')

In [8]:
# Preview the first row to sanity-check the selected columns.
df.head(1)

Unnamed: 0,Launch Date,Course Title,Instructors,Course Subject,Honor Code Certificates,Total Course Hours (Thousands),% Bachelor's Degree or Higher
0,09/05/2012,Circuits and Electronics,Khurram Afridi,"Science, Technology, Engineering, and Mathematics",1,418.94,60.68


Condition 1

In [9]:
# Condition 1: a course is labelled 'Advanced' when more than 50% of its
# participants hold a bachelor's degree or higher, otherwise 'Beginner'.
bachelor_share_above_half = df["% Bachelor's Degree or Higher"] > 50
df['Course_Level'] = np.where(bachelor_share_above_half, 'Advanced', 'Beginner')

Condition 4

In [10]:
# Condition 4: every course gets the same constant value.
df['Pre_Requirement']='No Prerequisite'

Condition 6

In [11]:
# Condition 6: synthetic course prices.
# Half of the courses (rounded down) are chosen at random to be free; the rest
# get a random price between 0 and 1000 rounded to the nearest multiple of 25.
# The global NumPy RNG is seeded here so the generated columns are reproducible
# when the cells are run in order.
np.random.seed(0)
num_courses = len(df)
half_num_courses = num_courses // 2
free_courses_indices = np.random.choice(num_courses, half_num_courses, replace=False)
course_prices = np.zeros(num_courses)

# Boolean mask of the non-free courses. This replaces a per-index
# `i not in free_courses_indices` test, which scans the whole array for every
# course; the resulting indices are the same and still in ascending order.
is_paid = np.ones(num_courses, dtype=bool)
is_paid[free_courses_indices] = False
remaining_indices = np.flatnonzero(is_paid)

# NOTE(review): the lower bound is 0, so draws below 12.5 round down to a price
# of 0 and such "paid" courses end up indistinguishable from free ones.
remaining_prices = np.random.randint(0, 1001, len(remaining_indices))
remaining_prices = np.round(remaining_prices / 25) * 25
course_prices[remaining_indices] = remaining_prices

In [12]:
# Attach the generated prices as a new column; a price of 0 is rendered as
# "Free" by create_string.
df['Course_Price'] = course_prices

Condition 8

In [13]:
# Condition 8: synthetic Yes/No flag, one random draw per course.
# Uses NumPy's global RNG, so the values depend on the seed set in the pricing
# cell and on the order in which the cells are executed.
df['Placement_Opportunity'] = np.random.choice(['Yes', 'No'], size=len(df))

Condition 11

In [14]:
# Condition 11: keep only the launch year. Dates are parsed as month/day/year;
# create_string renders the value as "Based on <year> Market Demand".
df['Course_Relevancy'] = pd.to_datetime(df['Launch Date'], format='%m/%d/%Y').dt.year

Condition 13

In [15]:
# Condition 13: synthetic lower bound of the expected package, a random integer
# from 3 to 25 (the upper bound 26 is exclusive). create_string renders it as
# "<value> to <value + 1> LPA".
df['Expected_Package'] = np.random.randint(3, 26, size=len(df))

Condition 14

In [16]:
# Condition 14: EMI applies only to paid courses. Courses with a price of 0 are
# always 'No'; the others get a random Yes/No. A random value is drawn for every
# row and np.where discards the draws belonging to free courses.
df['EMI_Available'] = np.where(df['Course_Price'] > 0, np.random.choice(['Yes', 'No'], size=len(df)), 'No')

Condition 15

In [17]:
# Condition 15: synthetic Yes/No flag, one random draw per course.
df['Financial_Aid'] = np.random.choice(['Yes', 'No'], size=len(df))

Condition 16

In [20]:
# Condition 16: 'Yes' only where 'Honor Code Certificates' equals 1, otherwise 'No'.
df['Offer_Certificate']=np.where(df['Honor Code Certificates'] == 1, 'Yes', 'No')

In [31]:
# Inspect the columns after feature engineering.
# NOTE(review): the recorded output lacks 'Launch Date', 'Honor Code Certificates'
# and "% Bachelor's Degree or Higher", yet no cell in this notebook drops them.
# Presumably a since-deleted cell did; a Restart & Run All would list them here.
df.columns

Index(['Course Title', 'Instructors', 'Course Subject',
       'Total Course Hours (Thousands)', 'Course_Level', 'Pre_Requirement',
       'Course_Price', 'Placement_Opportunity', 'Course_Relevancy',
       'Expected_Package', 'EMI_Available', 'Financial_Aid',
       'Offer_Certificate'],
      dtype='object')

In [45]:
def create_string(row):
    """Render one course row as the text that gets embedded.

    The result is a comma-separated sequence of ``"Label": value`` pairs in a
    fixed order. A ``Course_Price`` equal to 0 is shown as ``Free``; any other
    price is shown as-is. The expected package is shown as a one-unit range
    starting at ``Expected_Package``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the course
            columns read below.

    Returns:
        str: The formatted description of the course.
    """
    price = "Free" if row['Course_Price'] == 0 else row['Course_Price']
    package_floor = row['Expected_Package']

    labelled_values = [
        ("Title", row['Course Title']),
        ("Level", row['Course_Level']),
        ("Price", price),
        ("Duration", row['Total Course Hours (Thousands)']),
        ("Pre_Requirement", row['Pre_Requirement']),
        ("Placement_Opportunity", row['Placement_Opportunity']),
        ("Course_Relevancy", f"Based on {row['Course_Relevancy']} Market Demand"),
        ("Expected_Package", f"{package_floor} to {package_floor + 1} LPA"),
        ("EMI_Available", row['EMI_Available']),
        ("Financial_Aid", row['Financial_Aid']),
        ("Offer_Certificate", row['Offer_Certificate']),
        ("Metadata", row['Course Subject']),
    ]
    return ", ".join(f'"{label}": {value}' for label, value in labelled_values)

In [46]:
# Build the text to embed: one formatted string per course row.
df['soup'] = df.apply(create_string, axis=1)

In [47]:
# Show the generated text for the row labelled 0 as a spot check.
df.loc[0, 'soup']

'"Title": Circuits and Electronics, "Level": Advanced, "Price": 350.0, "Duration": 418.94, "Pre_Requirement": No Prerequisite, "Placement_Opportunity": Yes, "Course_Relevancy": Based on 2012 Market Demand, "Expected_Package": 8 to 9 LPA, "EMI_Available": Yes, "Financial_Aid": No, "Offer_Certificate": Yes, "Metadata": Science, Technology, Engineering, and Mathematics'

In [48]:
# Persist the enriched dataset (without the index column) next to the raw file.
df.to_csv('./Dataset/updated_appendix.csv', index=False)

In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [6]:
# Embedding model with the library's default settings (no model name is passed).
# NOTE(review): the default model fixes the embedding dimension, which presumably
# has to match the dimension of the Pinecone index — confirm.
embeddings = HuggingFaceEmbeddings()

In [7]:
from langchain_pinecone import PineconeVectorStore

# Name of the Pinecone index that stores both the course texts and the FAQ entries.
# NOTE(review): PINECONE_INDEX_NAME is loaded from the environment earlier in the
# notebook but is not used here; the index name is hard-coded instead. Confirm
# which of the two is meant to be authoritative.
index_name = "edu-website-chatbot"

# Handle to the index; texts passed to it are embedded with `embeddings`.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

In [52]:
# Embed and upload all course texts with a single add_texts call instead of one
# call per row, so the vector store can batch the embedding and upsert work.
# Converting the column to a list also avoids label-based `df['soup'][i]`
# lookups, which fail if the DataFrame index is not 0..n-1.
# NOTE(review): no ids are passed, so re-running this cell presumably inserts
# the same texts again as new vectors — confirm before re-executing.
vectorstore.add_texts(df['soup'].tolist())

In [12]:
def retrieve_query(query,k=3):
    """Return the ``k`` vector-store entries most similar to ``query``.

    Args:
        query: Text to search for.
        k: Number of results to request (3 by default).

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for the query.
    """
    return vectorstore.similarity_search(query, k=k)

In [None]:
# # Retrieve more documents with higher diversity
# # Useful if your dataset has many similar documents
# docsearch.as_retriever(
#     search_type="mmr",
#     search_kwargs={'k': 6, 'lambda_mult': 0.25}
# )
# # Fetch more documents for the MMR algorithm to consider
# # But only return the top 5
# docsearch.as_retriever(
#     search_type="mmr",
#     search_kwargs={'k': 5, 'fetch_k': 50}
# )
# # Only retrieve documents that have a relevance score
# # Above a certain threshold
# docsearch.as_retriever(
#     search_type="similarity_score_threshold",
#     search_kwargs={'score_threshold': 0.8}
# )
# # Only get the single most similar document from the dataset
# docsearch.as_retriever(search_kwargs={'k': 1})

In [13]:
def retrieve_answers(query):
    """Return the documents retrieved for ``query``.

    This is currently a pass-through to ``retrieve_query`` with its default
    ``k``; no answer is generated from the retrieved documents.
    """
    # The generation step is disabled; if it is re-enabled, the retrieved
    # documents were meant to be passed on like this:
    # response = chain.run(input_documents=<retrieved documents>,question=query)
    # return response
    return retrieve_query(query)

In [14]:
# Smoke test: retrieve the closest entries for a sample FAQ question and print
# the text of each hit.
# NOTE(review): the recorded output shows the static FAQ entries, which are only
# added to the vector store in a later cell, so this cell was evidently run out
# of order. Re-check the output after a Restart & Run All.
query="do you provide doubt support?"
results=retrieve_answers(query=query)
for answer in results:
    print(answer.page_content)

"Question": is there any doubt support? ; "Answer": Our courses offer comprehensive doubt support for enhanced learning, where the course mentors answers the doubts themselves 
"Question": Where to get testimonials? ; "Answer": Testimonials are available in our course reviews section for authentic feedback.
"Question": is there a money back gurantee? ; "Answer": We offer a money-back guarantee to ensure customer satisfaction.


Adding answers to the static questions of the FAQ database

1. Learn at own pace -> always true
3. what kind of assignment do you give -> static answer
7. Doubt Support -> static answer
9. How is course different from others -> static answer
10. Money-back guarantee -> static answer
17. Where to get testimonials -> static answer(in course reviews)

In [8]:
# Static FAQ entries for the questions whose answer does not depend on the
# course. Each string is a complete "Question ; Answer" text that is stored
# as-is, so the wording here is exactly what retrieval returns.
arr=['"Question": Can student learn at their own pace? ; "Answer": yes, all the courses will help the students learn at their own pace by providing recorded video lectures',
     '"Question": What kind of assignments are available? ; "Answer": assignments are provided by the course mentors based on the course material, time to time as the course proceeds',
     '"Question": is there any doubt support? ; "Answer": Our courses offer comprehensive doubt support for enhanced learning, where the course mentors answers the doubts themselves ',
     '"Question": How is course different from others? ; "Answer":Our courses stand out through personalized learning experiences tailored to individual needs.',
     '"Question": is there a money back gurantee? ; "Answer": We offer a money-back guarantee to ensure customer satisfaction.',
     '"Question": Where to get testimonials? ; "Answer": Testimonials are available in our course reviews section for authentic feedback.']

In [10]:
# Embed and upload all static FAQ entries with a single add_texts call instead
# of one call per string, so the vector store can batch the work.
# NOTE(review): no ids are passed, so re-running this cell presumably inserts
# the same entries again as new vectors — confirm before re-executing.
vectorstore.add_texts(arr)