In [1]:

pip install selenium webdriver-manager


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/miniconda3/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
"""
Asynchronous scraper for collecting professor profile links from RateMyProfessors
using the platform’s GraphQL search interface.

This script issues paginated GraphQL queries to retrieve all faculty records
associated with a particular institution. Responses are streamed to a JSONL
file and also stored in a deduplicated JSON file. The implementation relies on
browser-captured cookies and headers to replicate the client configuration
required for successful authenticated requests.

The scraper uses direct HTTP/2 communication with httpx and avoids browser
automation. Pagination continues until the GraphQL API indicates that no
additional pages are available.
"""

import httpx
import json
import nest_asyncio
import asyncio
import logging
import os

# Patch asyncio so nested event-loop use is permitted (nest_asyncio's purpose);
# the notebook kernel already runs its own loop.
nest_asyncio.apply()

# Configure the root logger at INFO. Because this is the root configuration,
# httpx's own "HTTP Request: ..." INFO lines also show up in the cell output.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("rmp")

# Endpoint that receives every paginated search POST.
GRAPHQL_URL = "https://www.ratemyprofessors.com/graphql"
# Sent as "schoolID" in the query variables; the value is Base64 for "School-696".
SCHOOL_ID_ENCODED = "U2Nob29sLTY5Ng=="  # Northeastern University (Base64-encoded)

# Output paths, relative to the kernel's working directory.
# STREAM_FILE: one JSON object per line, appended page by page (may hold duplicates).
# DEDUP_FILE: a single JSON array of sorted unique URLs, written once at the end.
STREAM_FILE = "neu_professor_links.jsonl"
DEDUP_FILE  = "neu_professor_links.json"

# ---------------------------------------------------------------------------
# Browser cookies exported directly from the authenticated session.
# Retaining these values allows the script to reproduce an authorized
# GraphQL request environment and prevents access denial responses.
#
# The dict is handed to httpx.AsyncClient(cookies=...) in
# scrape_professor_links(), so every GraphQL POST carries all of them.
#
# NOTE(review): these are values captured from one browser session and are
# stored here in plain text. They presumably expire, and it is not verified
# which of them the endpoint actually requires — consider loading them from
# an untracked file or environment variable instead of committing them.
# ---------------------------------------------------------------------------
cookies = {
    "RMP_AUTH_COOKIE_VERSION": "v02",
    "_ga_WET17VWCJ3": "GS2.1.s1762798707$o5$g1$t1762801537$j60$l0$h0",
    "_ga_E3NFCSBPE3": "GS2.1.s1762798707$o5$g1$t1762801537$j60$l0$h171711223",
    "_pubcid": "e1a42c6d-de6f-4c29-b358-5e824da29bf9",
    "_pubcid_cst": "zix7LPQsHA%3D%3D",
    "_hjSession_1667000": "eyJpZCI6IjVhMmI4ZDEyLWRkZjctNDgzNi05MTgzLWY3YzYwZGI1OTRmOSIsImMiOjE3NjI4MDE1MzgzMjYsInMiOjAsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjoxfQ==",
    "_hjSessionUser_1667000": "eyJpZCI6IjI5MzQ4N2E4LTMwYWItNWU4YS05YmM1LWM0OTgxYTMzNTkzZiIsImNyZWF0ZWQiOjE3NjI4MDE1MzgzMjYsImV4aXN0aW5nIjp0cnVlfQ==",
    "AWSALB": "j4k9ox4ev2G+tHoas0g7w3z95JL4s3/AHEQ0SbIPO633m/f87eVCWCtG1Dfoa+5vneOk9/R612ZvBFHBZstIXS3Jy3G2t6wbp5cVO84uqk4SeZ0EWqenNmovCZg5",
    "AWSALBCORS": "j4k9ox4ev2G+tHoas0g7w3z95JL4s3/AHEQ0SbIPO633m/f87eVCWCtG1Dfoa+5vneOk9/R612ZvBFHBZstIXS3Jy3G2t6wbp5cVO84uqk4SeZ0EWqenNmovCZg5",
    "cto_bundle": "W5DZv19PVk9FMjVxQkwzNEJuWGY5WHpvSkFDb3M3U3o5UzNkZmZSQ1E4eWFqVk5MQ0pYeiUyQjE3UUw0VWs1MjVTVmFxb2VwR1Z4azRlMlYzaVhXaWZQYjlpaDJpVlQ2UFhPWDNIJTJGMmdVcjNzWVd6TFNZeE04UWVLWm1qbzRwVnBraTQ1SG1ES2RPJTJCN1JwTDVSckp0JTJGZUpPdENldmhXbzFRMkhJJTJGcyUyQlN6S3M2RUlSdUklM0Q",
    "cto_bidid": "uxwgIF9wZEFIUzg3emQlMkJmWHhUWjRBdiUyQmtVS1lORTBvMTdkT0RVT3dSU0F2OW15bXRQJTJCZDRTSklxaVFlMURuM2dyaVFqdXNjZEd0bHlpZ1FIbHM4WVl5MDJRMDNuMjVVem5uSlB4ZWNSak5XWkF3cmsxS2s3VlNteTYwNDdEMWsxM3hQUQ",
    "_awl": "2.1762801694.5-95a09d1678ded7d8fbefa227c6000bdd-6763652d75732d6561737431-0",
}

# ---------------------------------------------------------------------------
# Browser fingerprint headers required to reproduce the context in which the
# GraphQL call functions correctly. These values mirror actual network traffic
# captured through developer tools and are essential to prevent request denial.
#
# The dict is installed as the default header set of the httpx.AsyncClient in
# scrape_professor_links(), so it is sent with every request.
# ---------------------------------------------------------------------------
headers = {
    "accept": "*/*",
    # NOTE(review): this advertises br and zstd; httpx decodes those only when
    # the matching optional packages are installed — confirm for this kernel.
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "en-US,en;q=0.9",
    "origin": "https://www.ratemyprofessors.com",
    # The referer points at the search page for school 696, the same school
    # that SCHOOL_ID_ENCODED ("School-696") selects.
    "referer": "https://www.ratemyprofessors.com/search/professors/696?q=*",
    "sec-ch-ua": '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    # The request body is sent with client.post(..., json=payload).
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36",
    "dnt": "1",
}

# ---------------------------------------------------------------------------
# GraphQL query used for loading paginated faculty search results. This
# definition is copied verbatim from browser developer tools to ensure that
# the server recognizes and accepts the request parameters.
#
# Variables (supplied by make_vars):
#   $count  - page size, passed as `first`
#   $cursor - pagination cursor, passed as `after` (null for the first page)
#   $query  - TeacherSearchQuery input object (text, schoolID, fallback)
#
# Selected fields: for each edge, its cursor and the teacher node (id,
# legacyId, firstName, lastName, avgRating, numRatings, department), plus
# pageInfo.hasNextPage / pageInfo.endCursor, which drive the pagination loop.
# The scraper only reads node.legacyId and pageInfo from the response.
# ---------------------------------------------------------------------------
QUERY = """query TeacherSearchPaginationQuery(
  $count: Int!
  $cursor: String
  $query: TeacherSearchQuery!
) {
  search: newSearch {
    teachers(query: $query, first: $count, after: $cursor) {
      didFallback
      edges {
        cursor
        node {
          id
          legacyId
          firstName
          lastName
          avgRating
          numRatings
          department
          __typename
        }
      }
      pageInfo {
        hasNextPage
        endCursor
      }
    }
  }
}"""


def make_vars(cursor=None, count=50, school_id=None):
    """
    Construct the variable dictionary for the GraphQL request.

    Parameters
    ----------
    cursor : str or None
        Pagination cursor sent as the query's ``$cursor`` variable. ``None``
        requests the first page.
    count : int
        Number of items requested per page (``$count``). Defaults to 50,
        the page size this function always used before it was a parameter.
    school_id : str or None
        Encoded school identifier sent as ``schoolID``. ``None`` (the default)
        falls back to the module-level ``SCHOOL_ID_ENCODED``.

    Returns
    -------
    dict
        A fresh dictionary with the keys ``count``, ``cursor`` and ``query``;
        ``query`` carries an empty search text, the school id and
        ``fallback=True``.
    """
    # Resolve the default lazily so the module constant is only read when the
    # caller did not supply an explicit school id.
    if school_id is None:
        school_id = SCHOOL_ID_ENCODED

    return {
        "count": count,
        "cursor": cursor,
        "query": {
            "text": "",
            "schoolID": school_id,
            "fallback": True
        }
    }


# ---------------------------------------------------------------------------
# Core scraper: Issues paginated GraphQL queries, streams professor URLs
# to a JSONL file, and constructs a final deduplicated list.
# ---------------------------------------------------------------------------
async def scrape_professor_links():
    """
    Collect every professor profile URL for the configured school.

    Pages through the GraphQL teacher search with the module-level ``headers``
    and ``cookies``. For each page, one ``{"url": ...}`` record per returned
    edge is appended to ``STREAM_FILE``. When pagination ends, the sorted,
    deduplicated URL list is written to ``DEDUP_FILE``.

    Returns
    -------
    list[str]
        Sorted unique profile URLs.

    Raises
    ------
    httpx.HTTPStatusError
        If a request comes back with an error status (``raise_for_status``).
    RuntimeError
        If a response contains no teacher connection (for example a GraphQL
        error payload with ``data: null``). The records streamed so far stay
        in ``STREAM_FILE``; ``DEDUP_FILE`` is not written in that case.
    """
    logger.info("Starting scrape...")

    # Remove output from a previous run. The stream file is opened in append
    # mode below, so a stale copy would otherwise be extended.
    for stale_path in (STREAM_FILE, DEDUP_FILE):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    results = []
    cursor = None
    page = 1

    async with httpx.AsyncClient(
        timeout=30.0,
        follow_redirects=True,
        headers=headers,
        cookies=cookies,
        http2=True
    ) as client:

        while True:
            logger.info(f"Fetching page {page}...")

            payload = {
                "query": QUERY,
                "operationName": "TeacherSearchPaginationQuery",
                "variables": make_vars(cursor)
            }

            r = await client.post(GRAPHQL_URL, json=payload)
            r.raise_for_status()

            data = r.json()
            errors = data.get("errors")

            # GraphQL reports failures in-band, typically with an explicit
            # null for "data". dict.get's default does not apply to a present
            # null, hence the `or {}` guards.
            connection = ((data.get("data") or {}).get("search") or {}).get("teachers")
            if connection is None:
                # Fail loudly instead of treating an error payload as the
                # normal end of the result set.
                raise RuntimeError(
                    f"GraphQL response for page {page} contained no teacher data "
                    f"(errors: {errors!r})"
                )
            if errors:
                logger.warning(f"GraphQL reported errors on page {page}: {errors!r}")

            teachers = connection.get("edges") or []
            if not teachers:
                logger.info("No teachers returned — terminating pagination.")
                break

            with open(STREAM_FILE, "a", encoding="utf-8") as f:
                for t in teachers:
                    node = (t or {}).get("node") or {}
                    legacy_id = node.get("legacyId")
                    if legacy_id is None:
                        # Without an id the URL would end in "/None".
                        logger.warning(f"Skipping edge without legacyId on page {page}.")
                        continue
                    url = f"https://www.ratemyprofessors.com/professor/{legacy_id}"
                    f.write(json.dumps({"url": url}) + "\n")
                    results.append(url)

            page_info = connection.get("pageInfo") or {}
            if not page_info.get("hasNextPage"):
                break

            next_cursor = page_info.get("endCursor")
            if not next_cursor or next_cursor == cursor:
                # A missing or unchanged cursor would request the same page
                # again forever.
                logger.warning(
                    "Server reported another page but did not advance the cursor — "
                    "terminating pagination."
                )
                break

            cursor = next_cursor
            page += 1

    final = sorted(set(results))
    with open(DEDUP_FILE, "w", encoding="utf-8") as f:
        json.dump(final, f, indent=2)

    logger.info(f"Streamed records written to: {STREAM_FILE}")
    logger.info(f"Deduplicated list saved to: {DEDUP_FILE}")
    logger.info(f"Total unique links: {len(final)}")

    return final


# Execute scraper
links = await scrape_professor_links()
links


2025-11-10 14:39:05,625 - INFO - Starting scrape...
2025-11-10 14:39:05,702 - INFO - Fetching page 1...
2025-11-10 14:39:05,982 - INFO - HTTP Request: POST https://www.ratemyprofessors.com/graphql "HTTP/2 200 OK"
2025-11-10 14:39:05,985 - INFO - Fetching page 2...
2025-11-10 14:39:06,062 - INFO - HTTP Request: POST https://www.ratemyprofessors.com/graphql "HTTP/2 200 OK"
2025-11-10 14:39:06,066 - INFO - Fetching page 3...
2025-11-10 14:39:06,144 - INFO - HTTP Request: POST https://www.ratemyprofessors.com/graphql "HTTP/2 200 OK"
2025-11-10 14:39:06,149 - INFO - Fetching page 4...
2025-11-10 14:39:06,229 - INFO - HTTP Request: POST https://www.ratemyprofessors.com/graphql "HTTP/2 200 OK"
2025-11-10 14:39:06,234 - INFO - Fetching page 5...
2025-11-10 14:39:06,405 - INFO - HTTP Request: POST https://www.ratemyprofessors.com/graphql "HTTP/2 200 OK"
2025-11-10 14:39:06,408 - INFO - Fetching page 6...
2025-11-10 14:39:06,499 - INFO - HTTP Request: POST https://www.ratemyprofessors.com/graphq

['https://www.ratemyprofessors.com/professor/1003890',
 'https://www.ratemyprofessors.com/professor/1004624',
 'https://www.ratemyprofessors.com/professor/1007417',
 'https://www.ratemyprofessors.com/professor/1013248',
 'https://www.ratemyprofessors.com/professor/1013495',
 'https://www.ratemyprofessors.com/professor/1014242',
 'https://www.ratemyprofessors.com/professor/1015850',
 'https://www.ratemyprofessors.com/professor/1016693',
 'https://www.ratemyprofessors.com/professor/1017883',
 'https://www.ratemyprofessors.com/professor/1018442',
 'https://www.ratemyprofessors.com/professor/1019051',
 'https://www.ratemyprofessors.com/professor/1026151',
 'https://www.ratemyprofessors.com/professor/1026885',
 'https://www.ratemyprofessors.com/professor/1029676',
 'https://www.ratemyprofessors.com/professor/1034251',
 'https://www.ratemyprofessors.com/professor/1035835',
 'https://www.ratemyprofessors.com/professor/1036470',
 'https://www.ratemyprofessors.com/professor/1037943',
 'https://