In [4]:
# import libraries
import re
import time

import requests
from bs4 import BeautifulSoup

In [5]:
# site root, plus the index page that links to the episode transcripts
base_url = "https://www.seinfeldscripts.com/"
main_url = base_url + "seinfeld-scripts.html"

In [6]:
# browser-like User-Agent, sent with every request so the scraper looks more legit
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.183 Safari/537.36"
    ),
}

In [7]:
# request the main page

# send a GET request for the index page; the timeout keeps a stalled
# connection from hanging the notebook indefinitely
response = requests.get(main_url, headers=headers, timeout=30)

# Stop here on anything other than 200. Merely printing a message would leave
# `soup` undefined (or stale from an earlier run) for the cells below.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the page. Status code: {response.status_code}"
    )

# response.text is the page's HTML as a string; parse it with html.parser
soup = BeautifulSoup(response.text, "html.parser")

In [8]:
# find all links to episode pages: every anchor inside a table cell,
# keeping only its href with surrounding whitespace trimmed
episode_links = soup.select("td a")
cleaned_links = [anchor['href'].strip() for anchor in episode_links]

In [9]:
# list every collected link with its position, index on one line and href on the next
for position, href in enumerate(cleaned_links):
    print(position, href, sep="\n")

0
TheSeinfeldChronicles.htm
1
TheStakeout.htm
2
TheRobbery.htm
3
MaleUnbonding.htm
4
TheStockTip.htm
5
TheExGirlfriend.htm
6
ThePonyRemark.htm
7
TheJacket.htm
8
ThePhoneMessage.htm
9
TheApartment.htm
10
TheStatue.htm
11
TheRevenge.htm
12
TheHeartAttack.htm
13
TheDeal.htm
14
TheBabyShower.htm
15
TheChineseRestaurant.htm
16
TheBusboy.htm
17
TheNote.html
18
TheTruth.htm
19
ThePen.html
20
TheDog.htm
21
TheLibrary.htm
22
TheParkingGarage.htm
23
TheCafe.html
24
TheTape.htm
25
TheNoseJob.html
26
TheStranded.html
27
TheAlternateSide.htm
28
TheRedDot.htm
29
TheSubway.htm
30
ThePezDispenser.htm
31
TheSuicide.html
32
TheFixUp.html
33
TheBoyfriend1.htm
34
TheBoyfriend2.htm
35
TheLimo.html
36
TheGoodSamaritan.html
37
TheLetter.htm
38
TheParkingSpace.html
39
TheKeys.html
40
TheTrip1.htm
41
TheTrip2.html
42
ThePitch.htm
43
TheTicket.html
44
TheWallet.html
45
TheWatch.html
46
TheBubbleBoy.htm
47
TheCheeverLetters.htm
48
TheOpera.html
49
TheVirgin.htm
50
TheContest.htm
51
TheAirport.htm
52
ThePick.htm


In [10]:
# Drop pages that should not be scraped as episodes (judging by their names,
# presumably highlight / clip-show pages -- confirm against the site).
# NOTE: this cell mutates cleaned_links in place, so run it exactly once after
# the link-collection cell; .remove() raises ValueError once a name is gone.
for non_episode_page in (
    'Highlights-of-100-1.html',
    'Highlights-of-100-2.html',
    'The-Clip-Show-1.html',
    'The-Clip-Show-2.html',
):
    cleaned_links.remove(non_episode_page)

# Drop the last two entries *by position*. list.remove(cleaned_links[-1])
# deletes the first entry equal to the last one, which is the wrong element
# whenever that href also appears earlier in the list.
del cleaned_links[-2:]

In [11]:
# create bins
all_titles = []  # episode titles, appended in the same order as all_scripts
all_scripts = [] # object to hold each episode's script

# set start and end keywords (matched case-insensitively; the earliest hit wins)
start_keywords = ["[scene", "Opening Monologue", "[setting", "levitan:", "every time somebody", "stand up", "act one", "at the comedy club", "monk's caf", "[jerry", "do you think that the people", "(jerry is driving alone", "act 1", "JERRY'S APARTMENT", "[New York State", "[Montage", "Jerry and George talking near a bar.", "Jerry I've always been a big", "[Night", "(the bathroom)", "[At club", "Monologue", "Jerry's stand-up", "[location:", "Mr. Tuttle", "[George's", "[Elaine", "GEORGE:", "ELAINE: [thinking]", "George in a meeting", "Opening scene", "Jerry and", "Elaine at", "high LS following", "[Monks", "[George", "Exterior of", "(Scene:", "The Costanzas are driving", "Jerry, on the", "(The final", "Jerry, [tapping", "Waitress giving", "Some street , Jerry", "George, drinking", "First scene.", "INT. "]
end_keywords = ["<Spell", "end of show", "episodes overview", "[end]"]

# set ad phrase to avoid
ad = "Looking for a great gift idea for the holidays? Check out our complete Seinfeld Gift Guide right now! Including T-Shirts, DVDs, and more!"

# avoid errors from certain unfinished transcripts (these skip the sanity checks below)
okay_links = ['https://www.seinfeldscripts.com/TheWaitOut.htm', 'https://www.seinfeldscripts.com/TheSuzie.htm']

# sanity-check thresholds and request settings
MAX_START_POSITION = 2000   # a start marker later than this many characters is treated as an error
MIN_SCRIPT_CHARS = 15000    # a start-to-end span shorter than this is treated as an error
REQUEST_TIMEOUT = 30        # seconds to wait on a request before giving up


def _extract_script_text(episode_soup):
    """Return the text of all non-empty <p> tags, one cleaned paragraph per line.

    Each paragraph has its whitespace runs (including line breaks) collapsed
    to a single space, runs of three or more '=', '-' or '~' removed, and the
    ad phrase stripped. Every paragraph is followed by a newline.
    """
    cleaned_lines = []
    for p_tag in episode_soup.find_all("p"):
        if len(p_tag.text.strip()) > 0:                              # only include non-empty paragraphs
            dialogue_line = p_tag.get_text(separator="").strip()
            # Collapse all whitespace, newlines included, to one space. Deleting
            # "\n" outright first would glue together words that were separated
            # only by a line break.
            dialogue_line = re.sub(r'\s+', ' ', dialogue_line)
            dialogue_line = re.sub(r'===+', '', dialogue_line)
            dialogue_line = re.sub(r'---+', '', dialogue_line)
            dialogue_line = re.sub(r'~~~+', '', dialogue_line)
            dialogue_line = dialogue_line.replace(ad, '')            # get rid of ad line
            cleaned_lines.append(dialogue_line + "\n")               # newline after each paragraph for readability
    return "".join(cleaned_lines)


def _first_marker(lowered_text, keywords):
    """Return the smallest index at which any keyword occurs in lowered_text.

    Keywords are lowercased before searching, so lowered_text must already be
    lowercase. Returns None when no keyword is found.
    """
    positions = [lowered_text.find(keyword.lower()) for keyword in keywords]
    found = [position for position in positions if position != -1]
    return min(found) if found else None


# loop through episodes for scripts
for link in cleaned_links:

    # link setup
    episode_url = base_url + link
    print(f"retrieving from: {episode_url}")

    # create soup; fail loudly on HTTP errors instead of parsing an error page
    episode_response = requests.get(episode_url, headers=headers, timeout=REQUEST_TIMEOUT)
    episode_response.raise_for_status()
    episode_soup = BeautifulSoup(episode_response.text, "html.parser")

    # extract title; find() returns None when the page has no <h1>
    title_tag = episode_soup.find("h1")
    if title_tag is None:
        raise ValueError(f"No <h1> title found on {episode_url}")
    episode_title = title_tag.get_text(strip=True)
    print(f"episode title: {episode_title}")

    # extract script and lowercase it once for the keyword searches
    script_content = _extract_script_text(episode_soup)
    lowered_content = script_content.lower()

    # find starting point of actual dialogue (get rid of heading)
    min_incidence = _first_marker(lowered_content, start_keywords)
    if min_incidence is None:
        raise ValueError(f"Could not find any starting keywords in {episode_url}")

    # find ending point of actual dialogue (get rid of footer): earliest end marker
    max_incidence = _first_marker(lowered_content, end_keywords)
    if max_incidence is None:
        raise ValueError(f"Could not find any ending keywords in {episode_url}")

    # print out some info for diagnostics (first 10 characters at each marker)
    min_incidence_keyword = script_content[min_incidence:min_incidence + 10]
    max_incidence_keyword = script_content[max_incidence:max_incidence + 10]
    print(f"reading from character/word: {min_incidence_keyword} at position {min_incidence}")
    print(f"reading until character/word: {max_incidence_keyword} at position {max_incidence}")

    total_incidence = max_incidence - min_incidence
    print(f"total copied characters: {total_incidence}")

    # sanity checks, skipped for the known-unfinished transcripts
    if episode_url not in okay_links:
        if min_incidence > MAX_START_POSITION:
            raise ValueError(f"Late starting point ({min_incidence}) in {episode_url}")
        if total_incidence < MIN_SCRIPT_CHARS:
            raise ValueError(f"Low number of characters ({total_incidence}) for {episode_url}")

    # keep only the span between the markers; the -1 also drops the character
    # just before the end marker (the paragraph-ending newline when the marker
    # starts a paragraph)
    script_content = script_content[min_incidence:max_incidence-1]

    # avoid overloading the server (be polite)
    time.sleep(1) # wait 1 second between requests

    # add title and script to bins
    all_titles.append(episode_title)
    all_scripts.append(script_content)

    print()

retrieving from: https://www.seinfeldscripts.com/TheSeinfeldChronicles.htm
episode title: The Seinfeld Chronicles
reading from character/word: [Scene: Co at position 1
reading until character/word: <Spell che at position 23321
total copied characters: 23320

retrieving from: https://www.seinfeldscripts.com/TheStakeout.htm
episode title: The Stakeout
reading from character/word: Opening mo at position 1256
reading until character/word: [End]

<Sp at position 23362
total copied characters: 22106

retrieving from: https://www.seinfeldscripts.com/TheRobbery.htm
episode title: The Robbery
reading from character/word: [Scene: Co at position 1
reading until character/word: <Spell che at position 22336
total copied characters: 22335

retrieving from: https://www.seinfeldscripts.com/MaleUnbonding.htm
episode title: Male Unbonding
reading from character/word: [Setting:  at position 222
reading until character/word: END OF SHO at position 20559
total copied characters: 20337

retrieving from: htt

In [12]:
# write every episode as: title, blank line, script, blank line
# ("w" = write mode; utf-8 for cross-platform compatibility)
with open("Seinfeld_full_scripts.txt", "w", encoding="utf-8") as out_file:
    for index, episode_script in enumerate(all_scripts):
        out_file.write(f"{all_titles[index]}\n\n{episode_script}\n\n")

In [13]:
# Read back the combined transcript. The file name (including the capital S)
# and the encoding must match what the writing cell used; otherwise the read
# fails on case-sensitive filesystems or decodes with the platform default.
with open("Seinfeld_full_scripts.txt", "r", encoding="utf-8") as file:
    script = file.readlines()

# Loop through each line and uppercase character names
threshold = 1024  # longer lines are cut into pieces of at most this many characters
standardized_lines = []
for line in script:
    if ':' in line:
        # Split by the first colon to separate the character name from dialogue.
        # NOTE: everything before the first colon is uppercased, so lines such as
        # "[Scene: ..." are affected too, not only speaker names.
        character, dialogue = line.split(':', 1)
        # Capitalize the character name and reassemble the line
        line = f"{character.upper()}:{dialogue}"
    # Emit full-size pieces (each terminated with a newline) until the rest fits
    while len(line) > threshold:
        standardized_lines.append(line[:threshold] + '\n')
        line = line[threshold:]
    standardized_lines.append(line)

# Join all lines back into a single script
standardized_script = ''.join(standardized_lines)

# Save the result, using the same encoding as the input
with open("standardized_seinfeld_script.txt", "w", encoding="utf-8") as file:
    file.write(standardized_script)