In [3]:
from urllib.request import urlopen
import urllib
from urllib.request import Request
from bs4 import BeautifulSoup
import pandas as pd
from user_agent import generate_user_agent

# Season-listing URL template (filled with the season number) and the site
# root that relative episode links are appended to.
url_1 = "https://www.imdb.com/title/tt0898266/episodes/?season={num}&ref_=ttep"
url_2 = "https://www.imdb.com"
# Seconds to wait on a single HTTP request, so one stalled connection cannot
# hang the whole run; a timeout is reported by the surrounding except blocks.
REQUEST_TIMEOUT = 30
# One row per episode, in the column order used for epi_df below.
epi_list = []


def fetch_soup(url):
    """Download `url` with a generated User-Agent header and return it parsed.

    The HTTP response is closed as soon as BeautifulSoup has consumed it.
    Network and HTTP errors from `urlopen` propagate to the caller.
    """
    request = Request(url=url, headers={'User-Agent': generate_user_agent()})
    with urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        return BeautifulSoup(response, "html.parser")


# Request the listing page for season numbers 1 through 12.
for i in range(1, 13):
    try:
        soup = fetch_soup(url_1.format(num=i))

        # NOTE(review): the "sc-..." class names used below look auto-generated;
        # verify they still match the live markup before re-running.
        episodes = soup.find(class_="sc-67c7a421-0 jsgvdx")
        if episodes is None:
            print(f"Season {i}: No episodes found")
            continue

        for idx, each in enumerate(episodes):
            try:
                # Title text is split on " ∙ " into an "S<season>.E<episode>"
                # code and the episode title.
                title_element = each.find(class_="ipc-title__text")
                if not title_element:
                    continue
                full_title = title_element.get_text().split(" ∙ ")
                sea_epi = full_title[0].split(".")
                season = int(sea_epi[0].replace('S', ''))
                episode = int(sea_epi[1].replace('E', ''))
                title = full_title[1]

                # Air date
                date_element = each.find(class_="sc-f2169d65-10 bYaARM")
                date = date_element.get_text() if date_element else "No date"

                # Link to the episode detail page; entries without one are skipped.
                link_element = each.find(class_="ipc-lockup-overlay ipc-focusable")
                if not (link_element and 'href' in link_element.attrs):
                    continue
                url_link = url_2 + link_element['href']

                # Open the detail page
                soup_link = fetch_soup(url_link)

                # Reset every detail field for this episode. Without this, a
                # page lacking the genre/staff section would either raise
                # NameError or silently reuse the previous episode's values.
                genre = "No genre"
                director = "No director"
                writers = []
                stars = []

                # Genre, director, writers and stars
                genre_staff_element = soup_link.find(class_="sc-70a366cc-4 jrGDsj")
                if genre_staff_element:
                    genre_tags = genre_staff_element.find_all("span", "ipc-chip__text")
                    if genre_tags:
                        genre = ", ".join([tag.get_text() for tag in genre_tags])

                    staff_tags = genre_staff_element.find_all(
                        "a", "ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link")
                    staff = [tag.get_text() for tag in staff_tags]
                    # Only slice when names were actually found; slicing a
                    # "No staff" placeholder string would yield bogus fragments.
                    if staff:
                        # Positional split: first link is the director, the
                        # next three are writers, the remainder are stars.
                        director = staff[0]
                        writers = staff[1:4]
                        stars = staff[4:]

                # Text of the first "ipc-html-content-inner-div" element on the page
                content_element = soup_link.find(class_="ipc-html-content-inner-div")
                content = content_element.get_text() if content_element else "No content"

                # Record the episode
                epi_list.append(
                    [season, episode, title, date, genre, director, writers, stars, content])

            except Exception as e:
                # Best effort: report the failing entry and move on to the next one.
                print(f"Error processing episode {idx+1} in season {i}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error processing season {i}: {str(e)}")
        continue

epi_df = pd.DataFrame(epi_list, 
                      columns=["season", "episode", "title", "date", "genre", "director", "writers", "stars", "content"])
epi_df.head()

Unnamed: 0,season,episode,title,date,genre,director,writers,stars,content
0,1,1,Pilot,"Mon, Sep 24, 2007","Comedy, Romance",James Burrows,"[Chuck Lorre, Bill Prady, Johnny Galecki]","[Jim Parsons, Kaley Cuoco]",Leonard (Johnny Galecki) states that the combi...
1,1,2,The Big Bran Hypothesis,"Mon, Oct 1, 2007","Comedy, Romance",Mark Cendrowski,"[Chuck Lorre, Bill Prady, Robert Cohen]","[Johnny Galecki, Jim Parsons, Kaley Cuoco]",This is the only time where a different camera...
2,1,3,The Fuzzy Boots Corollary,"Mon, Oct 8, 2007","Comedy, Romance",Mark Cendrowski,"[Chuck Lorre, Bill Prady, Steven Molaro]","[Johnny Galecki, Jim Parsons, Kaley Cuoco]",The song that Leonard (Johnny Galecki) is sing...
3,1,4,The Luminous Fish Effect,"Mon, Oct 15, 2007","Comedy, Romance",Mark Cendrowski,"[Chuck Lorre, Bill Prady, David Litt]","[Johnny Galecki, Jim Parsons, Kaley Cuoco]",When Mary Cooper (Laurie Metcalf) introduces h...
4,1,5,The Hamburger Postulate,"Mon, Oct 22, 2007","Comedy, Romance",Andrew D. Weyman,"[Chuck Lorre, Bill Prady, David Goetsch]","[Johnny Galecki, Jim Parsons, Kaley Cuoco]",Johnny Galecki (Leonard) knows how to play the...


In [10]:
# Helper that turns a list cell into a single string
def clean_list_to_string(x):
    """Return the items of a list joined with ", "; return any other value as-is."""
    if not isinstance(x, list):
        return x
    separator = ", "
    return separator.join(x)

# Convert the list-valued writers and stars columns to comma-separated strings
for list_column in ('writers', 'stars'):
    epi_df[list_column] = epi_df[list_column].apply(clean_list_to_string)

# Index setup (currently disabled)
#epi_df.set_index(["season", "episode"], inplace=True)

# Lift pandas' display limits so the frame below is rendered without truncation
for option_name in ('display.max_rows', 'display.max_columns',
                    'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

display(epi_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,title,date,genre,director,writers,stars,content
season,episode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,Pilot,"Mon, Sep 24, 2007","Comedy, Romance",James Burrows,"Chuck Lorre, Bill Prady, Johnny Galecki","Jim Parsons, Kaley Cuoco",Leonard (Johnny Galecki) states that the combined IQ of Sheldon (Jim Parsons) and himself is 360. In a later episode of season 1 Sheldon states his IQ as 187 meaning that Leonard's IQ is 173.
1,2,The Big Bran Hypothesis,"Mon, Oct 1, 2007","Comedy, Romance",Mark Cendrowski,"Chuck Lorre, Bill Prady, Robert Cohen","Johnny Galecki, Jim Parsons, Kaley Cuoco","This is the only time where a different camera angle for the staircase is seen, when they are trying to push up Penny's furniture."
1,3,The Fuzzy Boots Corollary,"Mon, Oct 8, 2007","Comedy, Romance",Mark Cendrowski,"Chuck Lorre, Bill Prady, Steven Molaro","Johnny Galecki, Jim Parsons, Kaley Cuoco","The song that Leonard (Johnny Galecki) is singing is ""Boston"" by Augustana."
1,4,The Luminous Fish Effect,"Mon, Oct 15, 2007","Comedy, Romance",Mark Cendrowski,"Chuck Lorre, Bill Prady, David Litt","Johnny Galecki, Jim Parsons, Kaley Cuoco","When Mary Cooper (Laurie Metcalf) introduces herself as ""Sheldon's mother"" to Dr. Eric Gablehauser (Mark Harelik), Sheldon's boss, he says that's impossible and that she must've had Sheldon when she was a teenager. Laurie Metcalf is 17 years older than Jim Parsons."
1,5,The Hamburger Postulate,"Mon, Oct 22, 2007","Comedy, Romance",Andrew D. Weyman,"Chuck Lorre, Bill Prady, David Goetsch","Johnny Galecki, Jim Parsons, Kaley Cuoco",Johnny Galecki (Leonard) knows how to play the cello in real life.
1,6,The Middle Earth Paradigm,"Mon, Oct 29, 2007","Comedy, Romance",Mark Cendrowski,"Chuck Lorre, Bill Prady, David Litt","Johnny Galecki, Jim Parsons, Kaley Cuoco",It is revealed that Leonard (Johnny Galecki)'s middle name is Leakey. Leonard's father once worked with anthropologist Louis Leakey.
1,7,The Dumpling Paradox,"Mon, Nov 5, 2007","Comedy, Romance",Mark Cendrowski,"Chuck Lorre, Bill Prady, Lee Aronsohn","Johnny Galecki, Jim Parsons, Kaley Cuoco","As Chen (the waiter at the Chinese restaurant) leaves Sheldon (Jim Parsons), Leonard (Johnny Galecki) and Raj (Kunal Nayyar)'s table, he mutters ""Young idiot"" under his breath in Cantonese."
1,8,The Grasshopper Experiment,"Mon, Nov 12, 2007","Comedy, Romance",Ted Wass,"Chuck Lorre, Bill Prady, Lee Aronsohn","Johnny Galecki, Jim Parsons, Kaley Cuoco","This episode was the final one to air before the 2007-2008 Writers Guild of America strike, which put a hiatus on television production for three to four months."
1,9,The Cooper-Hofstadter Polarization,"Mon, Mar 17, 2008","Comedy, Romance",Joel Murray,"Chuck Lorre, Bill Prady, Lee Aronsohn","Johnny Galecki, Jim Parsons, Kaley Cuoco","The 30 second video shot by Howard ""Physicists Gone Wild!"" is a real video which is still available in YouTube."
1,10,The Loobenfeld Decay,"Mon, Mar 24, 2008","Comedy, Romance",Mark Cendrowski,"Chuck Lorre, Bill Prady, Lee Aronsohn","Johnny Galecki, Jim Parsons, Kaley Cuoco","This episode starts Sheldon's habit of repeatedly knocking on a door while repeatedly calling out to the person, although it is a series of four knocks instead of three."


In [11]:
# Write the episode table to a CSV file under ../data
csv_path = "../data/Big_Bang_Theory.csv"
epi_df.to_csv(path_or_buf=csv_path)