In [1]:
import pandas as pd
import requests

# Load the professor table from the CSV export into a DataFrame.
df = pd.read_csv("all_professors_rmp.csv")

In [2]:
# Total number of elements in the frame (rows x columns), not the row count.
df.size

260667

In [3]:
# Collect the "legacyId" column as a plain Python list.
legacy_id_column = df["legacyId"]
teachers_id = legacy_id_column.tolist()

In [4]:
import time
import requests

def post_with_retry(url, payload, headers, max_retries=5):
    """POST a JSON payload, retrying with exponential backoff on transient failures.

    An attempt is retried when the server answers HTTP 429 or when the request
    fails with a connection error or a timeout. The wait between attempts
    starts at 1 second and doubles after every retry.

    Args:
        url: Endpoint to POST to.
        payload: Object sent as the JSON request body.
        headers: HTTP headers sent with the request.
        max_retries: Maximum number of attempts (must be at least 1).

    Returns:
        The ``requests.Response`` of the first attempt that completed with a
        non-error status.

    Raises:
        requests.exceptions.HTTPError: The response had an error status other
            than 429, or the final attempt was still answered with 429.
        requests.exceptions.ConnectionError, requests.exceptions.Timeout:
            The final attempt failed at the network level.
        ValueError: ``max_retries`` is smaller than 1, so no attempt was made.
    """
    delay = 1
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            r = requests.post(url, json=payload, headers=headers, timeout=20)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # DNS/network issue or a stalled server: retry unless out of attempts.
            if is_last_attempt:
                raise
        else:
            if r.status_code != 429:
                r.raise_for_status()
                return r
            if is_last_attempt:
                # Still rate-limited: surface the 429 as an HTTPError instead of
                # silently returning None to the caller.
                r.raise_for_status()
        # Only reached when another attempt follows, so no time is wasted
        # sleeping right before giving up.
        time.sleep(delay)
        delay *= 2
    raise ValueError("max_retries must be at least 1")


def legacy_to_relay(legacy_id):
    """Return the base64 text of ``Teacher-<legacy_id>`` for the given legacy id."""
    from base64 import b64encode

    raw_id = f"Teacher-{legacy_id}".encode()
    encoded = b64encode(raw_id)
    return encoded.decode()

def load_data(teacher_id):
    """Fetch every rating of one professor from the ratemyprofessors.com GraphQL API.

    The ``ratings`` connection is requested 20 entries at a time and followed
    through its ``pageInfo`` cursor until the API reports no further page.

    Args:
        teacher_id: Legacy professor id. It is used in the ``Referer`` URL and
            converted with ``legacy_to_relay`` into the GraphQL node id.

    Returns:
        A list with one dict per rating, holding the fields named in the query
        (``clarityRating``, ``class``, ``comment``, ``date``,
        ``difficultyRating``, ``legacyId``, ``ratingTags``). ``comment`` is
        normalised to a single-line string. The list is empty when the API
        returns no teacher node (or no ratings connection) for the id.

    Raises:
        RuntimeError: The response carried no ``data`` payload.
        Exception: Whatever ``post_with_retry`` raises for a failed request.
    """
    import re

    def clean_comment(text):
        """Turn a raw comment into one whitespace-normalised line ("" for None)."""
        if text is None:
            return ""
        text = str(text)
        # Round-trip through UTF-8 to drop anything that cannot be encoded.
        text = text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
        # Control characters become spaces, then whitespace runs are collapsed.
        text = re.sub(r"[\x00-\x1f\x7f-\x9f]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    referer = f"https://www.ratemyprofessors.com/professor/{teacher_id}"
    ENDPOINT = "https://www.ratemyprofessors.com/graphql"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36",
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Origin": "https://www.ratemyprofessors.com",
        "Referer": referer,
        # Base64 of "test:test".
        "Authorization": "Basic dGVzdDp0ZXN0",
    }

    query = """
    query RatingsList($id: ID!, $cursor: String) {
      node(id: $id) {
        ... on Teacher {
          ratings(first: 20, after: $cursor) {
            edges {
              node {
                clarityRating
                class
                comment
                date
                difficultyRating
                legacyId
                ratingTags
              }
            }
            pageInfo {
              hasNextPage
              endCursor
            }
          }
        }
      }
    }
    """

    teacher_global_id = legacy_to_relay(teacher_id)

    all_reviews = []
    cursor = None

    while True:
        payload = {
            "query": query,
            "variables": {"id": teacher_global_id, "cursor": cursor},
        }
        r = post_with_retry(ENDPOINT, payload, headers)
        body = r.json()

        data = body.get("data")
        if data is None:
            errors = body.get("errors")
            raise RuntimeError(
                f"GraphQL request for teacher {teacher_id} returned no data: {errors}"
            )

        # An unknown id yields a null node; a node that is not a Teacher yields
        # no "ratings" key. Both mean there is nothing (more) to collect.
        teacher_node = data.get("node") or {}
        reviews = teacher_node.get("ratings")
        if not reviews:
            break

        for edge in reviews["edges"]:
            rating = edge["node"]
            rating["comment"] = clean_comment(rating.get("comment"))
            all_reviews.append(rating)

        page_info = reviews["pageInfo"]
        next_cursor = page_info["endCursor"]
        # Stop on the last page, and also when the cursor is missing or did not
        # advance, which would otherwise request the same page forever.
        if not page_info["hasNextPage"] or not next_cursor or next_cursor == cursor:
            break
        cursor = next_cursor

    return all_reviews


In [5]:
# Scrape the ratings of every professor and flatten them into one row per
# rating, tagging each row with the professor's id in "profId".
# The loop variable is named prof_id so the builtin id() is not shadowed.
rows = []
for prof_id in teachers_id:
    for review in load_data(prof_id):
        rows.append({**review, "profId": prof_id})

# NOTE: this rebinds `df` to the reviews table.
df = pd.DataFrame(rows)



In [6]:
# Write the reviews table to CSV without the index column.
df.to_csv("rmp_all_schools_reviews_small.csv", index=False)

In [7]:
# Total number of elements in the frame (rows x columns), not the row count.
df.size

3863496

In [8]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,clarityRating,class,comment,date,difficultyRating,legacyId,ratingTags,profId
0,1,Math32A,This was the absolute worst math class of my l...,2026-02-10 04:20:05 +0000 UTC,5,42598787,Tough grader--Test heavy,3145448
1,1,Math32A,If I can save you from the torture of a lifeti...,2026-02-10 01:35:28 +0000 UTC,5,42598200,Tough grader,3145448
2,1,Math32A,Get ready for the biggest GPA tank of ur life....,2026-02-02 03:51:34 +0000 UTC,5,42573402,Tough grader--Test heavy--Graded by few things,3145448
3,1,Math32A,3 midterms with 20% each and a final worth 40%...,2026-01-28 04:24:59 +0000 UTC,4,42557851,Tough grader--Test heavy--Graded by few things,3145448
4,1,Math32A,His a good person since he is willing to help ...,2026-01-28 04:22:44 +0000 UTC,5,42557841,Tough grader--Test heavy,3145448


In [None]:
# Call head() so df_small is a DataFrame holding the first rows; without the
# parentheses the name would be bound to the method object itself.
df_small = df.head()