In [1]:
#%pip install -Uq accelerate multiprocess

In [2]:
#%pip install --q langchain langchain_community openai faiss-cpu pandas tiktoken sentence-transformers

In [3]:
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [4]:
# Initialize Sentence Transformer embeddings
model_name = "all-MiniLM-L6-v2" 
embeddings = HuggingFaceEmbeddings(model_name=model_name)

  from tqdm.autonotebook import tqdm, trange
2024-12-10 16:44:15.489170: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
df = pd.read_csv("../courses_csv/combined_dataset.csv")
df.head(2)

Unnamed: 0,title,sub_info,subject,rating,level,institution,about,what_you_will_learn,syllabus,course_url,skills_you_will_gain
0,HarvardX: The Architectural Imagination,Learn fundamental principles of architecture —...,Architecture,,Introductory,HarvardX,Architecture engages a culture’s deepest socia...,"How to read, analyze, and understand different...",Part I: Form and History Part II: The Technolo...,https://www.edx.org/learn/architecture/harvard...,
1,MITx: Sustainable Building Design,"Learn and explore key scientific principles, t...",Architecture,,Intermediate,MITx,"Meeting growing global energy demand, while mi...",Understand the scientific principles underlyin...,Week 1 - Energy Use in Buildings Week 2 - Unde...,https://www.edx.org/learn/sustainable-developm...,


In [9]:
# Function to prepare data for embedding
def prepare_data_for_embedding(row):
    combined_info = f"""
    Title: {row['title']}
    Subject: {row['subject']}
    Description: {row['sub_info'] if pd.notna(row['sub_info']) else 'no data'}
    Level: {row['level'] if pd.notna(row['level']) else 'no data'}
    Institution: {row['institution'] if pd.notna(row['institution']) else 'no data'}
    About: {row['about'] if pd.notna(row['about']) else 'no data'}
    What You Will Learn: {row['what_you_will_learn'] if pd.notna(row['what_you_will_learn']) else 'no data'}
    Syllabus: {row['syllabus'] if pd.notna(row['syllabus']) else 'no data'}
    Skills You Will Gain: {row['skills_you_will_gain'] if pd.notna(row['skills_you_will_gain']) else 'no data'}
    Rating: {row['rating'] if pd.notna(row['rating']) else 'no data'}
    Course URL: {row['course_url']}
    """
    
    return combined_info.strip()

In [10]:
# Prepare data for embedding
df['combined_info'] = df.apply(prepare_data_for_embedding, axis=1)

In [11]:
df['combined_info'][0]

"Title: HarvardX: The Architectural Imagination\n    Subject: Architecture\n    Description: Learn fundamental principles of architecture — as an academic subject or a professional career — by studying some of history’s most important buildings.\n    Level: Introductory\n    Institution: HarvardX\n    About: Architecture engages a culture’s deepest social values and expresses them in material, aesthetic form. This course will teach you how to understand architecture as both cultural expression and technical achievement. Vivid analyses of exemplary buildings, and hands-on exercises in drawing and modeling, will bring you closer to the work of architects and historians. The first part of the course introduces the idea of the architectural imagination. Perspective drawing and architectural typology are explored and you will be introduced to some of the challenges in writing architectural history. Then we address technology as a component of architecture. You will discover ways that innova

In [13]:
#Create a vector store using FAISS
texts = df['combined_info'].tolist()

metadatas = df[['title', 'sub_info', 'rating', 'subject', 'level', 'institution', 'course_url']].to_dict('records')

#Create the vector store
vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

In [None]:
# Save the FAISS index
vectorstore.save_local("../faiss_index")

print("FAISS index saved to faiss_index directory")