# Survivor Transcript Scraping
The ultimate goal is to scrape all transcripts and use BERTopic modelling and NER to find interesting topic/people trends in each season, then compare across all seasons to see how the game has changed ^.^

In [None]:
! pip install -r requirements.txt

In [2]:
# import required packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import numpy as np
from itertools import zip_longest
from tqdm import tqdm
from IPython.display import clear_output
import time

### Quick test on object extraction

In [23]:
# Load the locally saved transcript index page
with open("SurvivorTranscripts.html", encoding = 'utf-8') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# season titles are the <span class="c4"> tags, episode links the <a class="c3"> tags
seasons = soup.find_all('span', class_ = 'c4')
episode_names = soup.find_all('a',class_='c3')

# sanity check: how many of each were found, plus the first season / episode / link
print(len(seasons), "seasons \n", seasons[0].text)

print(len(episode_names), "episodes")
print(episode_names[0].text, "\n", episode_names[0]['href'])



40 seaons 
 Survivor: Borneo
595 episodes
Episode 1 - The Marooning 
 https://www.google.com/url?q=https://reality-tv-transcripts.fandom.com/wiki/The_Marooning&sa=D&source=editors&ust=1723031770556414&usg=AOvVaw3kUQJZzw72wkydoZD2-llX


In [74]:
# create table to store seasons, episodes, and links to each transcript page
# the links column will then be used to scrape the transcript

temp_list = []

# these are the 2 columns of the doc that contain all of the seasons and episode names
# they need to be combined interleaved to maintain order of seasons
list_blocks_c7 = soup.find_all('td', class_ = 'c7')
list_blocks_c5 = soup.find_all('td', class_ = 'c5')
# interleave using zip_longest; it pads the shorter column with None, so the
# padding is dropped here instead of crashing on None.find_all() below
combined_list_blocks = [
    item
    for pair in zip_longest(list_blocks_c7, list_blocks_c5)
    for item in pair
    if item is not None
]

# loop through each block (season episode list) and get season, episode, and link
# enumerate gives the season number directly; list.index() would rescan the list
# for every episode and return the first *equal* block rather than this one
for season_num, block_no in enumerate(combined_list_blocks, start=1):
    season_name = block_no.find_all('span', class_ = 'c4')[0].text
    episode_names = block_no.find_all('a', class_='c3')

    for ep in episode_names:
        temp_list.append([
            season_name,  # season name
            season_num,   # season number
            ep.text,      # episode title
            ep['href']    # ep link
        ])

df = pd.DataFrame(temp_list, columns = ['Season', 'Season Num', 'Episode', 'URL'])
display(df.head())

# NOTE(review): a later cell reads 'episode_list.csv'; uncomment on a first run to create it
#df.to_csv('episode_list.csv', header = True)

Unnamed: 0,Season,Season Num,Episode,URL
0,Survivor: Borneo,1,Episode 1 - The Marooning,https://www.google.com/url?q=https://reality-t...
1,Survivor: Borneo,1,Episode 2 - The Generation Gap,https://www.google.com/url?q=https://reality-t...
2,Survivor: Borneo,1,Episode 3 - Quest for Food,https://www.google.com/url?q=https://reality-t...
3,Survivor: Borneo,1,"Episode 4 - Too Little, Too Late?",https://www.google.com/url?q=https://reality-t...
4,Survivor: Borneo,1,Episode 5 - Pulling Your Own Weight,https://www.google.com/url?q=https://reality-t...


In [91]:
# Using links in table, cycle through and scrape transcript

## TEST WITH S1:E1
# the stored URL is a redirect wrapper, not the transcript page itself
s1e1 = df.loc[0, 'URL']
# timeout so a stalled connection raises instead of hanging the kernel
response = requests.get(s1e1, timeout=30)
print('Visited URL: {}'.format(response.url))
print(response.status_code)

# parse the redirect notice; it gets its own name so the index-page `soup`
# that the episode-table cell relies on is not overwritten when this cell runs
redirect_soup = BeautifulSoup(response.text, 'html.parser')
print(redirect_soup)

# the first anchor on the redirect notice carries the real transcript URL
actual_link = redirect_soup.find_all('a')[0]['href']
response = requests.get(actual_link, timeout=30)
print('Visited URL: {}'.format(response.url))
print(response.status_code)

# parse the transcript page itself
transcript_soup = BeautifulSoup(response.text, 'html.parser')

# the transcript text is read from the first <td> on the page
td = transcript_soup.find_all("td")

print(td[0].text)

Visited URL: https://www.google.com/url?q=https://reality-tv-transcripts.fandom.com/wiki/The_Marooning&sa=D&source=editors&ust=1723031770556414&usg=AOvVaw3kUQJZzw72wkydoZD2-llX
200
<html lang="en-AU"><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>Redirect Notice</title><style>body,div,a{font-family:arial,sans-serif}body{background-color:#fff;margin-top:3px}div{color:#000}a:link{color:#4b11a8;}a:visited{color:#4b11a8;}a:active{color:#ea4335}div.mymGo{border-top:1px solid #dadce0;border-bottom:1px solid #dadce0;background:#f8f9fa;margin-top:1em;width:100%}div.aXgaGb{padding:0.5em 0;margin-left:10px}div.fTk7vd{margin-left:35px;margin-top:35px}</style></head><body><div class="mymGo"><div class="aXgaGb"><font style="font-size:larger"><b>Redirect Notice</b></font></div></div><div class="fTk7vd"> The page you were on is trying to send you to <a href="https://reality-tv-transcripts.fandom.com/wiki/The_Marooning">https://reality-tv-transcripts.fandom.com/wiki/T

In [5]:
# Reload the saved episode table, then drop the first column
# (presumably the index column written when the CSV was saved — confirm)
df = pd.read_csv('episode_list.csv')
df = df.iloc[:, 1:]
df.head()

Unnamed: 0,Season,Season Num,Episode,URL
0,Survivor: Borneo,1,Episode 1 - The Marooning,https://www.google.com/url?q=https://reality-t...
1,Survivor: Borneo,1,Episode 2 - The Generation Gap,https://www.google.com/url?q=https://reality-t...
2,Survivor: Borneo,1,Episode 3 - Quest for Food,https://www.google.com/url?q=https://reality-t...
3,Survivor: Borneo,1,"Episode 4 - Too Little, Too Late?",https://www.google.com/url?q=https://reality-t...
4,Survivor: Borneo,1,Episode 5 - Pulling Your Own Weight,https://www.google.com/url?q=https://reality-t...


In [34]:
# Seconds to wait on a single HTTP request before giving up on that episode
REQUEST_TIMEOUT = 30


def _scrape_transcript(redirect_url):
    """Follow one redirect link from the episode table and extract the transcript text.

    Parameters
    ----------
    redirect_url : str
        Value of the 'URL' column: a redirect notice page whose first anchor
        points at the real transcript page.

    Returns
    -------
    tuple
        (actual_link, transcript_text, first_status, second_status).
        actual_link, transcript_text and second_status are None when the
        redirect page has no usable anchor; transcript_text is None when the
        transcript page has neither a <td> nor a <p> element.

    Raises
    ------
    requests.RequestException
        If either HTTP request fails or times out.
    """
    response_1 = requests.get(redirect_url, timeout=REQUEST_TIMEOUT)

    # get actual link from the redirect notice (its first anchor)
    redirect_page = BeautifulSoup(response_1.text, 'html.parser')
    anchors = redirect_page.find_all('a')
    actual_link = anchors[0].get('href') if anchors else None
    if not actual_link:
        return None, None, response_1.status_code, None

    response_2 = requests.get(actual_link, timeout=REQUEST_TIMEOUT)
    transcript_page = BeautifulSoup(response_2.text, 'html.parser')

    # primary option: first <td>; fallback: first <p>; otherwise no transcript
    cells = transcript_page.find_all("td")
    if cells:
        transcript_text = cells[0].text
    else:
        paragraphs = transcript_page.find_all("p")
        transcript_text = paragraphs[0].text if paragraphs else None

    return actual_link, transcript_text, response_1.status_code, response_2.status_code


# Record the start time
start_time = time.time()

# empty list to store text so can be appended to episodes df
episode_transcript = []
# rows whose requests failed; reported after the loop because clear_output wipes per-row prints
failed_rows = []

for index, row in tqdm(df.iterrows(), total=len(df)):

    # Clear the output of the cell so it does not accumulate
    clear_output(wait=True)

    try:
        actual_link, transcript_text, status_1, status_2 = _scrape_transcript(row['URL'])
    except requests.RequestException as err:
        # one bad request should not throw away the rest of a multi-minute run
        actual_link = transcript_text = status_1 = status_2 = None
        failed_rows.append((index, row['Episode'], repr(err)))

    episode_transcript.append([index, row['Episode'], transcript_text, actual_link])
    print(row["Season Num"],"-", row['Episode'], "First URL:", status_1, "Second URL:", status_2, index, len(episode_transcript))

# Record the end time
end_time = time.time()

# Calculate the duration
duration = end_time - start_time

# Print start, end, and duration
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")
print(f"Duration: {duration:.2f} seconds, {duration/60:.2f} minutes")

if failed_rows:
    print(f"{len(failed_rows)} episode(s) could not be fetched:")
    for failed_index, failed_episode, failed_reason in failed_rows:
        print(" ", failed_index, failed_episode, failed_reason)

episode_transcript_df = pd.DataFrame(episode_transcript, columns = ["original_index", "ep_name","episode_transcript", "actual_URL"])

# align on the df index label recorded for each row rather than on list position,
# so the merge stays correct even if df does not have a 0..n-1 index
transcript_df = pd.merge(
    df,
    episode_transcript_df.set_index("original_index", drop=False).rename_axis(None),
    right_index=True,
    left_index=True,
    how = 'inner',
)

columns = ['Season', 'Season Num', 'Episode', 'URL', 'actual_URL', 'episode_transcript']
transcript_df[columns].to_csv('transcript_df.csv', header = True)
print("Final df with transcripts written to transcript_df.csv")

100%|██████████| 595/595 [06:45<00:00,  1.47it/s]

40 - Episode 14 - It All Boils Down to This First URL: 200 Second URL: 200 594 595
Start time: 2024-08-09 21:50:50
End time: 2024-08-09 21:57:36
Duration: 405.96 seconds, 6.77 minutes



