In [None]:
import os
from typing import List
from langchain_community.document_loaders import UnstructuredLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import GPT4AllEmbeddings
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import json
import time
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

In [2]:
import os
os.environ['USER_AGENT'] = 'scrappy'

In [3]:
"""
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, DirectoryLoader, PyMuPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaLLM
"""

In [None]:
### Indexing

In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import time
import re
import random

def get_professor_ids(url, max_retries=10):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    professor_ids = set()
    retry_count = 0

    while retry_count < max_retries:
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)  # Set page load timeout

        try:
            driver.get(url)
            while True:
                # Wait for the professor cards to load
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "TeacherCard__StyledTeacherCard-syjs0d-0"))
                )
                
                # Extract professor IDs from the current page
                cards = driver.find_elements(By.CLASS_NAME, "TeacherCard__StyledTeacherCard-syjs0d-0")
                for card in cards:
                    href = card.get_attribute('href')
                    if href:
                        match = re.search(r'/professor/(\d+)', href)
                        if match:
                            professor_ids.add(match.group(1))
                
                print(f"Scraped {len(professor_ids)} unique professor IDs so far...")
                
                try:
                    # Try to click the "Show More" button
                    show_more_button = WebDriverWait(driver, 20).until(
                        EC.element_to_be_clickable((By.CLASS_NAME, "Buttons__Button-sc-19xdot-1"))
                    )
                    driver.execute_script("arguments[0].click();", show_more_button)
                    time.sleep(random.uniform(2, 5))  # Random delay between 2 and 5 seconds
                except TimeoutException:
                    print("No more 'Show More' button found. Finished scraping.")
                    break

            # If we've made it here without exceptions, we're done
            break

        except TimeoutException as e:
            print(f"A timeout occurred: {e}")
            retry_count += 1
            sleep_time = random.uniform(300, 600)  # Sleep for 5-10 minutes (300-600 seconds)
            print(f"Sleeping for {sleep_time:.2f} seconds before retrying... (Attempt {retry_count} of {max_retries})")
            time.sleep(sleep_time)

        except WebDriverException as e:
            print(f"A WebDriver error occurred: {e}")
            retry_count += 1
            print(f"Retrying... (Attempt {retry_count} of {max_retries})")
            time.sleep(random.uniform(10, 20))  # Random delay between retries

        finally:
            driver.quit()

    if retry_count == max_retries:
        print("Max retries reached. Some data may be missing.")

    return list(professor_ids)

# Usage
url = "https://www.ratemyprofessors.com/search/professors/481?q=*"
professor_ids = get_professor_ids(url)

print(f"\nTotal unique professor IDs scraped: {len(professor_ids)}")
print("Sample of professor IDs:")
print(professor_ids[:10])

# Generate full URLs
base_url = "https://www.ratemyprofessors.com/professor/"
professor_urls = [f"{base_url}{id}" for id in professor_ids]

print("\nSample of professor URLs:")
print(professor_urls[:5])

Scraped 8 unique professor IDs so far...
Scraped 16 unique professor IDs so far...
Scraped 24 unique professor IDs so far...
Scraped 31 unique professor IDs so far...
Scraped 39 unique professor IDs so far...
Scraped 47 unique professor IDs so far...
Scraped 55 unique professor IDs so far...
Scraped 63 unique professor IDs so far...
Scraped 71 unique professor IDs so far...
Scraped 79 unique professor IDs so far...
Scraped 87 unique professor IDs so far...
Scraped 95 unique professor IDs so far...
Scraped 103 unique professor IDs so far...
Scraped 110 unique professor IDs so far...
Scraped 118 unique professor IDs so far...
Scraped 126 unique professor IDs so far...
Scraped 133 unique professor IDs so far...
Scraped 141 unique professor IDs so far...
Scraped 149 unique professor IDs so far...
Scraped 157 unique professor IDs so far...
Scraped 165 unique professor IDs so far...
Scraped 173 unique professor IDs so far...
Scraped 180 unique professor IDs so far...
Scraped 188 unique profe

In [23]:
def save_urls_to_file(urls, filename):
    """
    Save a list of URLs to a text file, with each URL on a new line.
    
    Args:
    urls (list): List of URLs to save
    filename (str): Name of the file to save the URLs to
    """
    with open(filename, 'w') as file:
        for url in urls:
            file.write(f"{url}\n")
    print(f"Saved {len(urls)} URLs to {filename}")

# Usage
filename = "professor_urls.txt"
save_urls_to_file(professor_urls, filename)

Saved 639 URLs to professor_urls.txt


In [26]:
def extract_professor_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract data from the JSON-LD script tag
    script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
    data = json.loads(script_tag.string)
    teacher_data = data['props']['pageProps']['teacherRatings']['node']
    
    # Extract relevant information
    name = f"{teacher_data['firstName']} {teacher_data['lastName']}"
    department = teacher_data['department']
    school = teacher_data['school']['name']
    overall_quality = teacher_data['avgRating']
    num_ratings = teacher_data['numRatings']
    would_take_again = teacher_data['wouldTakeAgainPercent']
    level_of_difficulty = teacher_data['avgDifficulty']
    
    # Extract top tags
    top_tags = [tag['tagName'] for tag in teacher_data['teacherRatingTags']]
    
    # Extract individual ratings (limited to 5 for brevity)
    ratings = []
    for rating in teacher_data['ratings']['edges'][:5]:
        rating_data = rating['node']
        ratings.append({
            'class': rating_data['class'],
            'date': rating_data['date'],
            'quality': rating_data['helpfulRating'],
            'difficulty': rating_data['difficultyRating'],
            'comment': rating_data['comment']
        })
    
    # Create structured document
    document = {
        'name': name,
        'department': department,
        'school': school,
        'overall_quality': overall_quality,
        'num_ratings': num_ratings,
        'would_take_again': would_take_again,
        'level_of_difficulty': level_of_difficulty,
        'top_tags': top_tags,
        'sample_ratings': ratings
    }
    
    return document

# Example usage
url = "https://www.ratemyprofessors.com/professor/1512959"
professor_document = extract_professor_info(url)
print(json.dumps(professor_document, indent=2))

AttributeError: 'NoneType' object has no attribute 'string'

In [24]:
def extract_professor_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
    data = json.loads(script_tag.string)
    teacher_data = data['props']['pageProps']['teacherRatings']['node']
    
    name = f"{teacher_data['firstName']} {teacher_data['lastName']}"
    department = teacher_data['department']
    school = teacher_data['school']['name']
    overall_quality = teacher_data['avgRating']
    num_ratings = teacher_data['numRatings']
    would_take_again = teacher_data['wouldTakeAgainPercent']
    level_of_difficulty = teacher_data['avgDifficulty']
    
    top_tags = [tag['tagName'] for tag in teacher_data['teacherRatingTags']]
    
    ratings = []
    for rating in teacher_data['ratings']['edges'][:10]:
        rating_data = rating['node']
        ratings.append({
            'class': rating_data['class'],
            'date': rating_data['date'],
            'quality': rating_data['helpfulRating'],
            'difficulty': rating_data['difficultyRating'],
            'comment': rating_data['comment']
        })
    
    document = {
        'name': name,
        'department': department,
        'school': school,
        'overall_quality': overall_quality,
        'num_ratings': num_ratings,
        'would_take_again': would_take_again,
        'level_of_difficulty': level_of_difficulty,
        'top_tags': top_tags,
        'sample_ratings': ratings,
        'url': url
    }
    
    return document

def document_to_text(doc):
    text = f"{doc['name']} is a professor in the {doc['department']} department at {doc['school']}. "
    text += f"Overall quality: {doc['overall_quality']}/5 based on {doc['num_ratings']} ratings. "
    text += f"{doc['would_take_again']}% would take again. Level of difficulty: {doc['level_of_difficulty']}/5. "
    text += f"Top tags: {', '.join(doc['top_tags'])}. "
    for rating in doc['sample_ratings']:
        text += f"Sample rating: {rating['quality']}/5, {rating['difficulty']}/5 difficulty, {rating['comment']} "
    return text

def load_professor_urls(filename):
    with open(filename, 'r') as file:
        return [line.strip() for line in file]

def index_professors(urls, embeddings, vectorstore):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    
    for url in tqdm(urls, desc="Indexing professors"):
        try:
            # Extract professor info
            prof_doc = extract_professor_info(url)
            
            # Convert to text
            prof_text = document_to_text(prof_doc)
            
            # Split text into chunks
            chunks = text_splitter.split_text(prof_text)
            
            # Create Langchain documents
            documents = [
                Document(
                    page_content=chunk,
                    metadata={
                        "name": prof_doc['name'],
                        "department": prof_doc['department'],
                        "school": prof_doc['school'],
                        "overall_quality": str(prof_doc['overall_quality']),
                        "num_ratings": str(prof_doc['num_ratings']),
                        "would_take_again": str(prof_doc['would_take_again']),
                        "level_of_difficulty": str(prof_doc['level_of_difficulty']),
                        "url": url
                    }
                ) for chunk in chunks
            ]
            
            # Add to vectorstore
            vectorstore.add_documents(documents)
            
            # Sleep to avoid rate limiting
            time.sleep(1)
        
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

# Usage
# Load professor URLs
professor_urls = load_professor_urls("professor_urls.txt")
    
# Initialize the embedding model
embeddings = GPT4AllEmbeddings()
    
# Initialize Chroma vectorstore
vectorstore = Chroma(embedding_function=embeddings, persist_directory="./chroma_db")
    
# Run indexing process
index_professors(professor_urls, embeddings, vectorstore)
    
# Persist the vectorstore
vectorstore.persist()
    
# Verify indexing
print(f"Total documents indexed: {vectorstore._collection.count()}")

NameError: name 'GPT4AllEmbeddings' is not defined

In [None]:
import os
from typing import List, Dict
from langchain_community.document_loaders import UnstructuredLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from tqdm import tqdm

def load_pdf(file_path: str) -> List[Document]:
    loader = UnstructuredLoader(
        file_path=file_path,
        strategy="hi_res",
        coordinates=True,
    )
    return list(loader.lazy_load())

def process_pdfs(directory: str) -> List[Document]:
    documents = []
    for filename in tqdm(os.listdir(directory), desc="Processing PDFs"):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory, filename)
            docs = load_pdf(file_path)
            # Add filename to metadata for better structure
            for doc in docs:
                doc.metadata["source_file"] = filename
            documents.extend(docs)
    return documents

def split_documents(documents: List[Document]) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    return text_splitter.split_documents(documents)

def load_course_info(directory: str) -> Dict[str, Dict[str, str]]:
    course_info = {}
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as f:
                lines = f.readlines()
                for line in lines:
                    parts = line.strip().split(',')
                    if len(parts) == 3:
                        course_number, course_name, description = parts
                        course_info[course_number] = {
                            "name": course_name,
                            "description": description
                        }
    return course_info

def enrich_documents(documents: List[Document], course_info: Dict[str, Dict[str, str]]) -> List[Document]:
    enriched_docs = []
    for doc in documents:
        # Check if the document contains a course number
        for course_number in course_info.keys():
            if course_number in doc.page_content:
                doc.metadata["course_number"] = course_number
                doc.metadata["course_name"] = course_info[course_number]["name"]
                doc.metadata["course_description"] = course_info[course_number]["description"]
                break
        enriched_docs.append(doc)
    return enriched_docs

def index_documents(documents: List[Document]):
    embeddings = GPT4AllEmbeddings()
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory="./ksu_catalog_db"
    )
    vectorstore.persist()
    return vectorstore

def main():
ksu_data_dir = "./ksu_data"
pdf_directory = os.path.join(ksu_data_dir, "undergraduate_catalogs")
course_info_directory = os.path.join(ksu_data_dir, "undergraduate_categories")
    
print("Loading course information...")
course_info = load_course_info(course_info_directory)
    
print("Loading and processing PDFs...")
documents = process_pdfs(pdf_directory)
    
print("Splitting documents...")
split_docs = split_documents(documents)
    
print("Enriching documents with course information...")
enriched_docs = enrich_documents(split_docs, course_info)
    
print("Indexing documents...")
vectorstore = index_documents(enriched_docs)
    
print(f"Indexed {len(enriched_docs)} document chunks.")
print(f"Vector store persisted at ./ksu_catalog_db")