In [20]:
import base64
import json, re

import requests
from bs4 import BeautifulSoup

# Numeric RateMyProfessors school IDs, as used in the site's search URLs
# (https://www.ratemyprofessors.com/search/professors/<id>). This is the single
# source of truth for the IDs; the GraphQL IDs below are derived from it.
school_link_ids={"UCLA":"1075","UCB":"1072","UCI":"1074","UCSD":"1079","UCSB":"1077","UCD":"1073","UCSC":"1078","UCR":"1076"}

# IDs in the form the GraphQL API expects for `schoolID`: the base64 encoding
# of "School-<numeric id>" (e.g. "School-1075" -> "U2Nob29sLTEwNzU=").
# Deriving them avoids keeping two hand-maintained tables in sync.
school_ids={
    name: base64.b64encode(f"School-{link_id}".encode("ascii")).decode("ascii")
    for name, link_id in school_link_ids.items()
}

# Campuses to scrape, in the order they are requested.
schools=["UCLA","UCB","UCI","UCSD","UCR","UCSC","UCSB","UCD"]

In [21]:
def get_professors(schools, school_ids,school_link_ids):
    """Download the professor listing of each school from RateMyProfessors.

    For every school the GraphQL ``TeacherSearch`` query is paged through
    (20 results per request) with an empty search text until the API reports
    that there is no further page.

    Parameters
    ----------
    schools : iterable of str
        School keys to scrape, processed in the given order.
    school_ids : mapping of str -> str
        School key -> ID sent as the ``schoolID`` GraphQL variable.
    school_link_ids : mapping of str -> str
        School key -> numeric ID used to build the ``Referer`` search URL.

    Returns
    -------
    list of dict
        One dict per professor with the queried fields (``legacyId``,
        ``firstName``, ``lastName``, ``department``, ``avgRating``,
        ``numRatings``, ``wouldTakeAgainPercent``, ``avgDifficulty``) and
        ``school`` flattened from ``{"name": ...}`` to the plain name
        (``""`` when missing).

    Raises
    ------
    KeyError
        If a school key is missing from ``school_ids`` or ``school_link_ids``.
    RuntimeError
        If a response carries no teacher data (e.g. a GraphQL ``errors``
        payload), or if the API claims another page exists but does not
        advance the cursor (which would otherwise loop forever).

    HTTP error statuses are raised by ``Response.raise_for_status()``.
    """
    # Everything below is identical for all schools and pages, so build it once.
    endpoint = "https://www.ratemyprofessors.com/graphql"
    base_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36",
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Origin": "https://www.ratemyprofessors.com",
        # Found in the page HTML: REACT_APP_GRAPHQL_AUTH = "dGVzdDp0ZXN0"
        "Authorization": "Basic dGVzdDp0ZXN0",
    }
    query = """
    query TeacherSearch($schoolID: ID!, $text: String!, $cursor: String) {
      newSearch {
        teachers(
          after: $cursor
          first: 20
          query: { text: $text, schoolID: $schoolID, fallback: true }
        ) {
          edges {
            node {
              legacyId
              firstName
              lastName
              department
              avgRating
              numRatings
              wouldTakeAgainPercent
              avgDifficulty
              school {
                name
              }
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
          resultCount
        }
      }
    }
    """
    text = ""  # empty search text: list every professor of the school

    all_teachers = []
    # One session for the whole crawl so the many page requests reuse the
    # same connection instead of opening a new one each time.
    with requests.Session() as session:
        for school in schools:
            school_id = school_ids[school]
            reference_link = "https://www.ratemyprofessors.com/search/professors/"+school_link_ids[school]+"?q=*"
            headers = dict(base_headers, Referer=reference_link)
            cursor = None  # None requests the first page

            while True:
                payload = {
                    "query": query,
                    "variables": {"schoolID": school_id, "text": text, "cursor": cursor},
                }
                r = session.post(endpoint, json=payload, headers=headers, timeout=20)
                r.raise_for_status()
                data = r.json()

                # GraphQL reports failures with HTTP 200 and an "errors" list,
                # often with "data": null; surface that explicitly.
                teachers = ((data.get("data") or {}).get("newSearch") or {}).get("teachers")
                if teachers is None:
                    raise RuntimeError(
                        f"No teacher data returned for {school}: {data.get('errors')!r}"
                    )

                for edge in teachers["edges"]:
                    node = edge["node"]
                    school_obj = node.get("school")
                    # Flatten {"name": ...} so each record is a flat row.
                    if isinstance(school_obj, dict):
                        node["school"] = school_obj.get("name", "")
                    else:
                        node["school"] = school_obj or ""
                    all_teachers.append(node)

                page_info = teachers["pageInfo"]
                if not page_info["hasNextPage"]:
                    break
                next_cursor = page_info["endCursor"]
                # Re-sending the same cursor would fetch the same page forever.
                if not next_cursor or next_cursor == cursor:
                    raise RuntimeError(
                        f"Pagination did not advance for {school} (cursor={cursor!r})"
                    )
                cursor = next_cursor
            print(f"done with {school}")
    return all_teachers


In [24]:
import pandas as pd
# Scrape every configured campus. This issues many paginated HTTP requests
# and prints one "done with <school>" line per campus.
all_teachers = get_professors(
    schools=schools,
    school_ids=school_ids,
    school_link_ids=school_link_ids,
)




done with UCLA
done with UCB
done with UCI
done with UCSD
done with UCR
done with UCSC
done with UCSB
done with UCD


In [25]:
# Fields kept from each scraped record, in the column order of the final table.
cols = [
    "legacyId",
    "firstName",
    "lastName",
    "school",
    "department",
    "avgRating",
    "numRatings",
    "wouldTakeAgainPercent",
    "avgDifficulty",
]
# Passing `columns=` selects and orders the fields at construction time and
# still produces a correctly-headed (empty) frame when nothing was scraped,
# whereas building the frame first and indexing with `df[cols]` raises KeyError
# on an empty result.
df = pd.DataFrame(all_teachers, columns=cols)

df


Unnamed: 0,legacyId,firstName,lastName,school,department,avgRating,numRatings,wouldTakeAgainPercent,avgDifficulty
0,3145448,Sihao,Ma,University of California Los Angeles (UCLA),Mathematics,1.5,28,17.8571,4.8
1,675576,Robert,Fink,University of California Los Angeles (UCLA),Music,3.4,4,50.0000,3.5
2,417227,Arthur,Little Jr.,University of California Los Angeles (UCLA),English,4.2,19,33.3333,3.8
3,2626439,Saba,Aliyari,University of California Los Angeles (UCLA),Biology,2.1,12,33.3333,4.3
4,2873085,Megan,Stephan,University of California Los Angeles (UCLA),English,3.7,3,66.6667,3.3
...,...,...,...,...,...,...,...,...,...
28958,247345,Craig,McDonald,University of California Davis,Medicine,0.0,0,-1.0000,0.0
28959,228844,Scott,Schonfeldt-Aultman,University of California Davis,Ethnic Studies,0.0,0,-1.0000,0.0
28960,214672,Te,Williams,University of California Davis,Agriculture,0.0,0,-1.0000,0.0
28961,425544,Wen,Zhong,University of California Davis,Design,0.0,0,-1.0000,0.0


In [27]:
# Summary of the school column: row count, number of distinct schools,
# and the most frequent school with its frequency.
df.loc[:, "school"].describe()

count                              28963
unique                                 8
top       University of California Davis
freq                                5321
Name: school, dtype: object

In [26]:
# Persist the table to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="all_professors_rmp.csv", index=False)
