In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

In [17]:
pip install openpyxl




In [21]:
pip install xlsxWriter




---

# 1. 파일 읽기

---

In [2]:
# CSV 파일 읽기
file_path = '../data/justwatch_second_1090.xlsx'
df = pd.read_excel(file_path)

# 열 이름 확인
print(df.columns)

Index(['Unnamed: 0', 'title', 'original_title', 'year', 'season_episode',
       'runtime', 'genre', 'age_rating', 'Production country', 'IMDb_title',
       'IMDb_URL'],
      dtype='object')


In [3]:
# URL 리스트 추출 (상위 5개만)
urls = df['IMDb_URL'].tolist()
urls

['https://www.imdb.com/title/tt10919420/',
 'https://www.imdb.com/title/tt7605396/',
 'https://www.imdb.com/title/tt12079212/',
 'https://www.imdb.com/title/tt12809988/',
 'https://www.imdb.com/title/tt11612120/',
 'https://www.imdb.com/title/tt17491088/',
 'https://www.imdb.com/title/tt6470478/',
 'https://www.imdb.com/title/tt1520211/',
 'https://www.imdb.com/title/tt1526318/',
 'https://www.imdb.com/title/tt3107288/',
 'https://www.imdb.com/title/tt7622902/',
 'https://www.imdb.com/title/tt8740790/',
 'https://www.imdb.com/title/tt1632701/',
 'https://www.imdb.com/title/tt5753856/',
 'https://www.imdb.com/title/tt3006802/',
 'https://www.imdb.com/title/tt0873992/',
 'https://www.imdb.com/title/tt13068006/',
 'https://www.imdb.com/title/tt0903747/',
 'https://www.imdb.com/title/tt8824648/',
 'https://www.imdb.com/title/tt2306299/',
 'https://www.imdb.com/title/tt22352854/',
 'https://www.imdb.com/title/tt5290382/',
 'https://www.imdb.com/title/tt2661044/',
 'https://www.imdb.com/titl

In [4]:
len(urls)

1090

In [5]:
df['title']

0                  Ojingeo Geim
1                      12인의 심판자
2                        희생자 게임
3            스위트 투스: 사슴뿔을 가진 소년
4                    Sweet Home
                 ...           
1085                         달러
1086                      러브 나우
1087                     탕탕 라이브
1088                       대송궁사
1089    엘리트들, 못다 한 이야기: 나디아 구스만
Name: title, Length: 1090, dtype: object

In [6]:
df['original_title']

0                                 Squid Game
1                                 The Twelve
2                                      誰是被害者
3                                Sweet Tooth
4                                        NaN
                        ...                 
1085                                  Dollar
1086                                   真愛趁現在
1087                                糖糖Online
1088                                    大宋宫词
1089    Elite Histórias Breves: Nadia Guzmán
Name: original_title, Length: 1090, dtype: object

---

# 2. 테스트 URL
- URL 하나를 임의로 저장한 뒤 페이지에서 정보 추출

In [7]:
import numpy as np

# 크롤링 결과를 저장할 리스트
data = []

# Selenium 웹 드라이버 설정
driver = webdriver.Chrome()

# 테스트 URL
test_url = 'https://www.imdb.com/title/tt26693803/fullcredits/?ref_=tt_ql_1'

try:
    # cast & crew 페이지로 이동
    driver.get(test_url)

    # 페이지가 로드될 때까지 기다림
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))

    # 페이지 소스 가져오기
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # 감독 정보 추출
    directors = []
    director_elements = soup.select('#director + .simpleCreditsTable a')
    for elem in director_elements:
        directors.append(elem.text.strip())

    # 작가 정보 추출
    writers = []
    writer_elements = soup.select('#writer + .simpleCreditsTable a')
    for elem in writer_elements:
        writers.append(elem.text.strip())

    # 배우 정보 추출 (10명까지)
    actors = []
    actor_elements = soup.select('table.cast_list tr .primary_photo + td a')
    for elem in actor_elements[:10]:
        actors.append(elem.text.strip())

    # 결과를 리스트에 추가
    data.append({
        'IMDb_URL': test_url,
        'title': 'The Walking Dead',  # 테스트를 위한 임시 타이틀
        'fullcredits_url': test_url,
        'director': directors if directors else np.nan,
        'writer': writers if writers else np.nan,
        'actor': actors if actors else np.nan
    })

except Exception as e:
    print(f"Failed to process URL {test_url}: {e}")
    data.append({
        'IMDb_URL': test_url,
        'title': 'The Walking Dead',  # 테스트를 위한 임시 타이틀
        'fullcredits_url': test_url,
        'director': np.nan,
        'writer': np.nan,
        'actor': np.nan
    })

# 드라이버 종료
driver.quit()

# 크롤링 결과 확인
for item in data:
    print(item)


{'IMDb_URL': 'https://www.imdb.com/title/tt26693803/fullcredits/?ref_=tt_ql_1', 'title': 'The Walking Dead', 'fullcredits_url': 'https://www.imdb.com/title/tt26693803/fullcredits/?ref_=tt_ql_1', 'director': nan, 'writer': nan, 'actor': ['Im Yoon-ah', 'Lee Jun-ho', 'Go Won-Hee', 'Kim Ga-eun', 'Ahn Se-ha', 'Sun-young Kim', 'Kong Ye-ji', 'Kim Young-ok', 'Kim Jae-Won', 'Son Byung-ho']}


In [67]:
data

[{'IMDb_URL': 'https://www.imdb.com/title/tt26693803/fullcredits/?ref_=tt_ql_1',
  'title': 'The Walking Dead',
  'fullcredits_url': 'https://www.imdb.com/title/tt26693803/fullcredits/?ref_=tt_ql_1',
  'director': 'Young Woo Suh, Andrew Millet, Simon Moseley',
  'writer': 'Ozzy Emery',
  'actor': 'Im Yoon-ah, Lee Jun-ho, Go Won-Hee, Kim Ga-eun, Ahn Se-ha, Sun-young Kim, Kong Ye-ji, Kim Young-ok, Kim Jae-Won, Son Byung-ho'}]

---

# 3. URL 5개를 랜덤으로 불러와서 데이터 추출 및 저장

In [8]:
# IMDb_URL 5개를 랜덤으로 추출
test_urls = df['IMDb_URL'].sample(5).tolist()

# 크롤링 결과를 저장할 리스트
data = []

# Selenium 웹 드라이버 설정
driver = webdriver.Chrome()

for url in test_urls:
    try:
        # cast & crew 페이지 URL 생성
        fullcredits_url = url + 'fullcredits/?ref_=tt_ql_1'
        
        # cast & crew 페이지로 이동
        driver.get(fullcredits_url)

        # 페이지가 로드될 때까지 기다림
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))

        # 페이지 소스 가져오기
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # 감독 정보 추출
        directors = []
        director_elements = soup.select('#director + .simpleCreditsTable a')
        for elem in director_elements:
            directors.append(elem.text.strip())

        # 작가 정보 추출
        writers = []
        writer_elements = soup.select('#writer + .simpleCreditsTable a')
        for elem in writer_elements:
            writers.append(elem.text.strip())

        # 배우 정보 추출 (10명까지)
        actors = []
        actor_elements = soup.select('table.cast_list tr .primary_photo + td a')
        for elem in actor_elements[:10]:
            actors.append(elem.text.strip())

        # 결과를 리스트에 추가
        data.append({
            'IMDb_URL': url,
            'title': df.loc[df['IMDb_URL'] == url, 'title'].values[0],  # 제목도 추가
            'fullcredits_url': fullcredits_url,
            'director': directors if directors else np.nan,
            'writer': writers if writers else np.nan,
            'actor': actors if actors else np.nan
        })
        
    except Exception as e:
        print(f"Failed to process URL {url}: {e}")
        data.append({
            'IMDb_URL': url,
            'title': df.loc[df['IMDb_URL'] == url, 'title'].values[0],  # 제목도 추가
            'fullcredits_url': fullcredits_url,
            'director': np.nan,
            'writer': np.nan,
            'actor': np.nan
        })

# 드라이버 종료
driver.quit()

# 크롤링 결과 확인
for item in data:
    print(item)


{'IMDb_URL': 'https://www.imdb.com/title/tt6692188/', 'title': '엘 차포: 터널 킹', 'fullcredits_url': 'https://www.imdb.com/title/tt6692188/fullcredits/?ref_=tt_ql_1', 'director': ['Ernesto Contreras', 'J.M Cravioto', 'Carlos Moreno', 'Daniel Vega Vidal', 'Diego Vega Vidal', 'Carlos Rincones'], 'writer': ['Silvana Aguirre', 'Silvana Aguirre', 'Esteban Orozco', 'Esteban Orozco', 'Diego Vega Vidal', 'Carlos Contreras', 'Josué Méndez'], 'actor': ['Marco de la O', 'Humberto Busto', 'Diego Vásquez', 'Juan Carlos Cruz', 'Alejandro Aguilar', 'Laura Osma', 'Héctor Holten', 'Iván Aragón', 'Wilmer Cadavid', 'Paco Rueda']}
{'IMDb_URL': 'https://www.imdb.com/title/tt7221388/', 'title': '코브라 카이', 'fullcredits_url': 'https://www.imdb.com/title/tt7221388/fullcredits/?ref_=tt_ql_1', 'director': ['Jon Hurwitz', 'Hayden Schlossberg', 'Josh Heald', 'Jennifer Celotta', 'Joel Novoa', 'Steven K. Tsuchida', 'Marielle Woods', 'Steve Pink', 'Michael Grossman', 'Tawnia McKiernan', 'Lin Oeding'], 'writer': ['Josh Heal

In [11]:
# 데이터프레임으로 변환
df_result = pd.DataFrame(data)
df_result

Unnamed: 0,IMDb_URL,title,fullcredits_url,director,writer,actor
0,https://www.imdb.com/title/tt6692188/,엘 차포: 터널 킹,https://www.imdb.com/title/tt6692188/fullcredi...,"[Ernesto Contreras, J.M Cravioto, Carlos Moren...","[Silvana Aguirre, Silvana Aguirre, Esteban Oro...","[Marco de la O, Humberto Busto, Diego Vásquez,..."
1,https://www.imdb.com/title/tt7221388/,코브라 카이,https://www.imdb.com/title/tt7221388/fullcredi...,"[Jon Hurwitz, Hayden Schlossberg, Josh Heald, ...","[Josh Heald, Josh Heald, Josh Heald, Josh Heal...","[Ralph Macchio, William Zabka, Courtney Hengge..."
2,https://www.imdb.com/title/tt11937816/,누가 사라를 죽였을까,https://www.imdb.com/title/tt11937816/fullcred...,"[David Ruiz, Bernardo De la Rosa Villarreal, C...","[José Ignacio Valenzuela, José Ignacio Valenzu...","[Manolo Cardona, Ginés García Millán, Carolina..."
3,https://www.imdb.com/title/tt21964626/,Entrapped,https://www.imdb.com/title/tt21964626/fullcred...,,,"[Ólafur Darri Ólafsson, Ilmur Kristjánsdóttir,..."
4,https://www.imdb.com/title/tt6560040/,깊은 숲에서,https://www.imdb.com/title/tt6560040/fullcredi...,[Julius Berg],"[Delinda Jacobs, Delinda Jacobs]","[Samuel Labarthe, Suzanne Clément, Alexia Barl..."


In [10]:
# 결과를 XLSX 파일로 저장
xlsx_path = '../data/example_imdb_director_actor.xlsx'
df_result.to_excel(xlsx_path, index=False)

---

# 4. IMDb_CREW&CAST 가져오기-(1차 시도)
- justwatch_second_1090.xlsx 파일에 URL컬럼이 NaN일 경우 읽지 못하는 에러가 발생

In [16]:
# 크롤링 결과를 저장할 리스트
data = []

# Selenium 웹 드라이버 설정
driver = webdriver.Chrome()

for url in urls:
    try:
        # cast & crew 페이지 URL 생성
        fullcredits_url = url + 'fullcredits/?ref_=tt_ql_1'
        
        # cast & crew 페이지로 이동
        driver.get(fullcredits_url)

        # 페이지가 로드될 때까지 기다림
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))

        # 페이지 소스 가져오기
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # 감독 정보 추출
        directors = []
        director_elements = soup.select('#director + .simpleCreditsTable a')
        for elem in director_elements:
            directors.append(elem.text.strip())

        # 작가 정보 추출
        writers = []
        writer_elements = soup.select('#writer + .simpleCreditsTable a')
        for elem in writer_elements:
            writers.append(elem.text.strip())

        # 배우 정보 추출 (10명까지)
        actors = []
        actor_elements = soup.select('table.cast_list tr .primary_photo + td a')
        for elem in actor_elements[:10]:
            actors.append(elem.text.strip())

        # 결과를 리스트에 추가
        data.append({
            'IMDb_URL': url,
            'title': df.loc[df['IMDb_URL'] == url, 'title'].values[0],  # 제목도 추가
            'fullcredits_url': fullcredits_url,
            'director': directors if directors else np.nan,
            'writer': writers if writers else np.nan,
            'actor': actors if actors else np.nan
        })
        
    except Exception as e:
        print(f"Failed to process URL {url}: {e}")
        data.append({
            'IMDb_URL': url,
            'title': df.loc[df['IMDb_URL'] == url, 'title'].values[0],  # 제목도 추가
            'fullcredits_url': fullcredits_url,
            'director': np.nan,
            'writer': np.nan,
            'actor': np.nan
        })

# 드라이버 종료
driver.quit()

Failed to process URL nan: unsupported operand type(s) for +: 'float' and 'str'


IndexError: index 0 is out of bounds for axis 0 with size 0

# 5. IMDb_CREW&CAST 가져오기-(메인코드)
- 4번에서 발생한 코드 오류 해결

In [7]:
import numpy as np

# 크롤링 결과를 저장할 리스트
data = []

# Selenium 웹 드라이버 설정
driver = webdriver.Chrome()

for url in urls:
    try:
        # cast & crew 페이지 URL 생성
        fullcredits_url = url + 'fullcredits/?ref_=tt_ql_1'
        
        # cast & crew 페이지로 이동
        driver.get(fullcredits_url)

        # 페이지가 로드될 때까지 기다림
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))

        # 페이지 소스 가져오기
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # 감독 정보 추출
        directors = []
        director_elements = soup.select('#director + .simpleCreditsTable a')
        for elem in director_elements:
            directors.append(elem.text.strip())

        # 작가 정보 추출
        writers = []
        writer_elements = soup.select('#writer + .simpleCreditsTable a')
        for elem in writer_elements:
            writers.append(elem.text.strip())

        # 배우 정보 추출 (10명까지)
        actors = []
        actor_elements = soup.select('table.cast_list tr .primary_photo + td a')
        for elem in actor_elements[:10]:
            actors.append(elem.text.strip())

        # 중복 제거 및 결과를 리스트에 추가
        data.append({
            'IMDb_URL': url,
            'title': df.loc[df['IMDb_URL'] == url, 'title'].values[0],  # 제목도 추가
            'fullcredits_url': fullcredits_url,
            'director': ', '.join(sorted(set(directors))) if directors else np.nan,
            'writer': ', '.join(sorted(set(writers))) if writers else np.nan,
            'actor': ', '.join(actors) if actors else np.nan
        })
        
    except Exception as e:
        print(f"Failed to process URL {url}: {e}")
        data.append({
            'IMDb_URL': url,
            'title': df.loc[df['IMDb_URL'] == url, 'title'].values[0] if not df.loc[df['IMDb_URL'] == url].empty else np.nan,  # 제목도 추가
            'fullcredits_url': np.nan,
            'director': np.nan,
            'writer': np.nan,
            'actor': np.nan
        })

# 드라이버 종료
driver.quit()


Failed to process URL nan: unsupported operand type(s) for +: 'float' and 'str'
Failed to process URL nan: unsupported operand type(s) for +: 'float' and 'str'
Failed to process URL nan: unsupported operand type(s) for +: 'float' and 'str'


---

# 6. 데이터 확인하기

In [8]:
data

[{'IMDb_URL': 'https://www.imdb.com/title/tt10919420/',
  'title': 'Ojingeo Geim',
  'fullcredits_url': 'https://www.imdb.com/title/tt10919420/fullcredits/?ref_=tt_ql_1',
  'director': 'Hwang Dong-hyuk',
  'writer': 'Hwang Dong-hyuk',
  'actor': 'Lee Jung-jae, Park Hae-soo, Yasushi Iwaki, Hoyeon, Oh Yeong-su, Wi Ha-joon, Heo Sung-tae, Anupam Tripathi, Greg Chun, Stephen Fu'},
 {'IMDb_URL': 'https://www.imdb.com/title/tt7605396/',
  'title': '12인의 심판자',
  'fullcredits_url': 'https://www.imdb.com/title/tt7605396/fullcredits/?ref_=tt_ql_1',
  'director': 'Kaat Beels, Wouter Bouvijn',
  'writer': 'Bert Van Dael, Nele Meirhaeghe, Roel Mondelaers, Sanne Nuyens',
  'actor': 'Luc De Ruelle, Maaike Neuville, Tom Vermeir, Peter Gorissen, Zouzou Ben Chikha, Piet De Praitere, Maaike Cafmeyer, Josse De Pauw, Sofie Decleir, Mieke De Groote'},
 {'IMDb_URL': 'https://www.imdb.com/title/tt12079212/',
  'title': '희생자 게임',
  'fullcredits_url': 'https://www.imdb.com/title/tt12079212/fullcredits/?ref_=tt_q

In [9]:
len(data)

1090

In [10]:
# 데이터프레임으로 변환
df_result = pd.DataFrame(data)
df_result

Unnamed: 0,IMDb_URL,title,fullcredits_url,director,writer,actor
0,https://www.imdb.com/title/tt10919420/,Ojingeo Geim,https://www.imdb.com/title/tt10919420/fullcred...,Hwang Dong-hyuk,Hwang Dong-hyuk,"Lee Jung-jae, Park Hae-soo, Yasushi Iwaki, Hoy..."
1,https://www.imdb.com/title/tt7605396/,12인의 심판자,https://www.imdb.com/title/tt7605396/fullcredi...,"Kaat Beels, Wouter Bouvijn","Bert Van Dael, Nele Meirhaeghe, Roel Mondelaer...","Luc De Ruelle, Maaike Neuville, Tom Vermeir, P..."
2,https://www.imdb.com/title/tt12079212/,희생자 게임,https://www.imdb.com/title/tt12079212/fullcred...,"David Chuang, Kuan-Chung Chen","Joyce Liu, Jui-Liang Hsu, Shih-Keng Chien, Shu...","Wei-Ning Hsu, Hsiao-chuan Chang, Shih-Sian Wan..."
3,https://www.imdb.com/title/tt12809988/,스위트 투스: 사슴뿔을 가진 소년,https://www.imdb.com/title/tt12809988/fullcred...,"Alexis Ostrander, Carol Banker, Ciarán Foy, Ji...","Beth Schwartz, Bo Yeon Kim, Carly Woodworth, C...","Nonso Anozie, Christian Convery, Stefania LaVi..."
4,https://www.imdb.com/title/tt11612120/,Sweet Home,https://www.imdb.com/title/tt11612120/fullcred...,"Jang Young-woo, Lee Eung-bok, Park So-hyun","Hong So-ri, Hwang Young-Chan, Kim Hyung-min, K...","Song Kang, Lee Jin-wook, Lee Si-young, Park Gy..."
...,...,...,...,...,...,...
1085,https://www.imdb.com/title/tt10687564/,달러,https://www.imdb.com/title/tt10687564/fullcred...,Samer Al Barkawi,Hisham Hilal,"Mark Lewis, Adel Karam, Amel Bouchoucha, Jarro..."
1086,https://www.imdb.com/title/tt6273116/,러브 나우,https://www.imdb.com/title/tt6273116/fullcredi...,Charley Lien,"Hsiang-Min Huang, Hui-Ting Shao, Pei-Yu Lin, Y...","Annie Chen, George Hu, Chih-kung Tou, Vivi Lee..."
1087,https://www.imdb.com/title/tt11330500/,탕탕 라이브,https://www.imdb.com/title/tt11330500/fullcred...,Shiue Bin Jian,,"Dean Tang, Wan-Ru Zhan, Zhi-Ying Zhu, Jie-Fei ..."
1088,https://www.imdb.com/title/tt10742204/,대송궁사,https://www.imdb.com/title/tt10742204/fullcred...,Shaohong Li,,"Winston Chao, Tao Liu, Vic Chou, Ah-Lei Gua"


---

# 7. 데이터 합치기

In [16]:
import pandas as pd

# 원본 XLSX 파일 로드
file_path = '../data/justwatch_second_1090.xlsx'
original_df = pd.read_excel(file_path)

# df_result는 이미 생성된 데이터프레임이라고 가정
# 예시로 df_result가 존재한다고 가정합니다.
# df_result = pd.DataFrame(data)


# 두 DataFrame 병합
merged_df = pd.merge(original_df, df_result, on=['IMDb_URL', 'title'], how='left')

In [17]:
merged_df.tail()

Unnamed: 0.1,Unnamed: 0,title,original_title,year,season_episode,runtime,genre,age_rating,Production country,IMDb_title,IMDb_URL,fullcredits_url,director,writer,actor
1087,1085,달러,Dollar,2019,1,41min,"액션, 드라마, 로맨스",15,레바논,Dollar (TV Series 2019),https://www.imdb.com/title/tt10687564/,https://www.imdb.com/title/tt10687564/fullcred...,Samer Al Barkawi,Hisham Hilal,"Mark Lewis, Adel Karam, Amel Bouchoucha, Jarro..."
1088,1086,러브 나우,真愛趁現在,2012,1,45min,"드라마, 코미디",15,대만,Love Now (TV Series 2012–2013),https://www.imdb.com/title/tt6273116/,https://www.imdb.com/title/tt6273116/fullcredi...,Charley Lien,"Hsiang-Min Huang, Hui-Ting Shao, Pei-Yu Lin, Y...","Annie Chen, George Hu, Chih-kung Tou, Vivi Lee..."
1089,1087,탕탕 라이브,糖糖Online,2019,1,24min,드라마,15+,대만,Tang tang online (TV Series 2019),https://www.imdb.com/title/tt11330500/,https://www.imdb.com/title/tt11330500/fullcred...,Shiue Bin Jian,,"Dean Tang, Wan-Ru Zhan, Zhi-Ying Zhu, Jie-Fei ..."
1090,1088,대송궁사,大宋宫词,2021,1,45min,드라마,15+,China,Palace of Devotion (TV Series 2021),https://www.imdb.com/title/tt10742204/,https://www.imdb.com/title/tt10742204/fullcred...,Shaohong Li,,"Winston Chao, Tao Liu, Vic Chou, Ah-Lei Gua"
1091,1089,"엘리트들, 못다 한 이야기: 나디아 구스만",Elite Histórias Breves: Nadia Guzmán,2021,1,11min,"드라마, 로맨스",,스페인,Elite Short Stories: Nadia Guzmán (TV Mini Ser...,https://www.imdb.com/title/tt14671790/,https://www.imdb.com/title/tt14671790/fullcred...,Dani de la Orden,Carlos Montero,"Miguel Bernardeau, Mina El Hammani, Omar Ayuso..."


In [18]:
# 삭제할 컬럼 목록
columns_to_drop = ['season_episode', 'runtime', 'genre', 'age_rating', 'Production country', 'IMDb_title', 'IMDb_URL']

# 특정 컬럼 제거
merged_df.drop(columns=columns_to_drop, inplace=True)

# 결과 확인
merged_df.tail()

Unnamed: 0.1,Unnamed: 0,title,original_title,year,fullcredits_url,director,writer,actor
1087,1085,달러,Dollar,2019,https://www.imdb.com/title/tt10687564/fullcred...,Samer Al Barkawi,Hisham Hilal,"Mark Lewis, Adel Karam, Amel Bouchoucha, Jarro..."
1088,1086,러브 나우,真愛趁現在,2012,https://www.imdb.com/title/tt6273116/fullcredi...,Charley Lien,"Hsiang-Min Huang, Hui-Ting Shao, Pei-Yu Lin, Y...","Annie Chen, George Hu, Chih-kung Tou, Vivi Lee..."
1089,1087,탕탕 라이브,糖糖Online,2019,https://www.imdb.com/title/tt11330500/fullcred...,Shiue Bin Jian,,"Dean Tang, Wan-Ru Zhan, Zhi-Ying Zhu, Jie-Fei ..."
1090,1088,대송궁사,大宋宫词,2021,https://www.imdb.com/title/tt10742204/fullcred...,Shaohong Li,,"Winston Chao, Tao Liu, Vic Chou, Ah-Lei Gua"
1091,1089,"엘리트들, 못다 한 이야기: 나디아 구스만",Elite Histórias Breves: Nadia Guzmán,2021,https://www.imdb.com/title/tt14671790/fullcred...,Dani de la Orden,Carlos Montero,"Miguel Bernardeau, Mina El Hammani, Omar Ayuso..."


In [20]:
# director와 writer 컬럼의 빈 값을 NaN으로 변경
merged_df['director'] = merged_df['director'].replace('', np.nan)
merged_df['writer'] = merged_df['writer'].replace('', np.nan)

In [23]:
merged_df[merged_df['title'] == 'Vivant']

Unnamed: 0.1,Unnamed: 0,title,original_title,year,fullcredits_url,director,writer,actor
275,274,Vivant,VIVANT,2023,https://www.imdb.com/title/tt28314144/fullcred...,,"Chonmi Ri, Hayato Miyamoto, Hiroyuki Yatsu, Ka...","Masato Sakai, Fumi Nikaidô, Nandin-Erdene Khon..."


In [24]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092 entries, 0 to 1091
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       1092 non-null   int64 
 1   title            1092 non-null   object
 2   original_title   857 non-null    object
 3   year             1092 non-null   int64 
 4   fullcredits_url  1087 non-null   object
 5   director         1037 non-null   object
 6   writer           1018 non-null   object
 7   actor            1083 non-null   object
dtypes: int64(2), object(6)
memory usage: 68.4+ KB


In [25]:

# director, writer, actor 컬럼이 list 형태인지 확인하는 코드
def check_list_type(series):
    return series.apply(lambda x: isinstance(x, list))

is_director_list = check_list_type(merged_df['director'])
is_writer_list = check_list_type(merged_df['writer'])
is_actor_list = check_list_type(merged_df['actor'])

# 결과 출력
print("Director 컬럼이 list 형태인지 확인:")
print(is_director_list.value_counts())

print("\nWriter 컬럼이 list 형태인지 확인:")
print(is_writer_list.value_counts())

print("\nActor 컬럼이 list 형태인지 확인:")
print(is_actor_list.value_counts())


Director 컬럼이 list 형태인지 확인:
director
False    1092
Name: count, dtype: int64

Writer 컬럼이 list 형태인지 확인:
writer
False    1092
Name: count, dtype: int64

Actor 컬럼이 list 형태인지 확인:
actor
False    1092
Name: count, dtype: int64


In [26]:
# 각 요소를 리스트로 변환하는 함수
def convert_to_list(value):
    if pd.isna(value):
        return np.nan
    return value.split(', ')

# director, writer, actor 컬럼을 리스트로 변환
merged_df['director'] = merged_df['director'].apply(convert_to_list)
merged_df['writer'] = merged_df['writer'].apply(convert_to_list)
merged_df['actor'] = merged_df['actor'].apply(convert_to_list)

In [27]:
# director, writer, actor 컬럼이 list 형태인지 확인하는 코드
def check_list_type(series):
    return series.apply(lambda x: isinstance(x, list))

is_director_list = check_list_type(merged_df['director'])
is_writer_list = check_list_type(merged_df['writer'])
is_actor_list = check_list_type(merged_df['actor'])

# 결과 출력
print("Director 컬럼이 list 형태인지 확인:")
print(is_director_list.value_counts())

print("\nWriter 컬럼이 list 형태인지 확인:")
print(is_writer_list.value_counts())

print("\nActor 컬럼이 list 형태인지 확인:")
print(is_actor_list.value_counts())

Director 컬럼이 list 형태인지 확인:
director
True     1037
False      55
Name: count, dtype: int64

Writer 컬럼이 list 형태인지 확인:
writer
True     1018
False      74
Name: count, dtype: int64

Actor 컬럼이 list 형태인지 확인:
actor
True     1083
False       9
Name: count, dtype: int64


In [30]:
# title과 fullcredits_url 컬럼의 조합이 중복되는지 확인
duplicated_rows = merged_df[merged_df.duplicated(subset=['title', 'fullcredits_url'], keep=False)]
duplicated_rows

Unnamed: 0.1,Unnamed: 0,title,original_title,year,fullcredits_url,director,writer,actor
99,99,탑 보이,Top Boy,2019,https://www.imdb.com/title/tt1830379/fullcredi...,"[Aneil Karia, Brady Hood, Jonathan van Tulleke...","[Daniel West, Elliot Warren, Gerry Jackson, Ro...","[Ashley Walters, Kano, Jasmine Jobson, Little ..."
100,99,탑 보이,Top Boy,2019,https://www.imdb.com/title/tt1830379/fullcredi...,"[Aneil Karia, Brady Hood, Jonathan van Tulleke...","[Daniel West, Elliot Warren, Gerry Jackson, Ro...","[Ashley Walters, Kano, Jasmine Jobson, Little ..."
613,612,너에게는 닿지 않아,君には届かない,2023,https://www.imdb.com/title/tt22001792/fullcred...,"[Takehiko Shinjô, Takeo Kikuchi]","[Hayato Miyamoto, Karuho Shiina]","[Oji Suzuka, Sara Minami, Rinka Kumada, Atsuhi..."
614,612,너에게는 닿지 않아,君には届かない,2023,https://www.imdb.com/title/tt22001792/fullcred...,"[Takehiko Shinjô, Takeo Kikuchi]","[Hayato Miyamoto, Karuho Shiina]","[Oji Suzuka, Sara Minami, Rinka Kumada, Atsuhi..."


In [31]:
# 원본 DataFrame에 중복 제거 결과를 반영하려면
merged_df = merged_df.drop_duplicates(subset=['title', 'fullcredits_url'])

In [32]:
# title과 fullcredits_url 컬럼의 조합이 중복되는지 확인
duplicated_rows = merged_df[merged_df.duplicated(subset=['title', 'fullcredits_url'], keep=False)]
duplicated_rows

Unnamed: 0.1,Unnamed: 0,title,original_title,year,fullcredits_url,director,writer,actor


In [33]:
# 결과를 CSV 파일로 저장
csv_path = '../data/imdb_crew_cast_수정본.csv'
merged_df.to_csv(csv_path, index=False, encoding='utf-8')

# 결과를 XLSX 파일로 저장
xlsx_path = '../data/imdb_crew_cast_수정본.xlsx'
merged_df.to_excel(xlsx_path, index=False)

In [34]:
# 삭제할 컬럼 목록
columns_to_drop = ['fullcredits_url']

# 특정 컬럼 제거
merged_df.drop(columns=columns_to_drop, inplace=True)

# 결과 확인
merged_df.tail()

Unnamed: 0.1,Unnamed: 0,title,original_title,year,director,writer,actor
1087,1085,달러,Dollar,2019,[Samer Al Barkawi],[Hisham Hilal],"[Mark Lewis, Adel Karam, Amel Bouchoucha, Jarr..."
1088,1086,러브 나우,真愛趁現在,2012,[Charley Lien],"[Hsiang-Min Huang, Hui-Ting Shao, Pei-Yu Lin, ...","[Annie Chen, George Hu, Chih-kung Tou, Vivi Le..."
1089,1087,탕탕 라이브,糖糖Online,2019,[Shiue Bin Jian],,"[Dean Tang, Wan-Ru Zhan, Zhi-Ying Zhu, Jie-Fei..."
1090,1088,대송궁사,大宋宫词,2021,[Shaohong Li],,"[Winston Chao, Tao Liu, Vic Chou, Ah-Lei Gua]"
1091,1089,"엘리트들, 못다 한 이야기: 나디아 구스만",Elite Histórias Breves: Nadia Guzmán,2021,[Dani de la Orden],[Carlos Montero],"[Miguel Bernardeau, Mina El Hammani, Omar Ayus..."


In [35]:
# 결과를 CSV 파일로 저장
csv_path = '../data/imdb_crew_cast_최종본.csv'
merged_df.to_csv(csv_path, index=False, encoding='utf-8')

# 결과를 XLSX 파일로 저장
xlsx_path = '../data/imdb_crew_cast_최종본.xlsx'
merged_df.to_excel(xlsx_path, index=False)

---

### 지난 번에 있었던 오류 해결
- 감독과 작가의 CSS_Selecotr를 카피한 뒤 오류가 없이 해결됨을 확인