## 라이브러리 로드

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-whitegrid')
plt.rc('font', family = 'Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# url 접근
import urllib.request as req
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

# html 문서 접근
from bs4 import BeautifulSoup

# dict를 url 변수명 = 값으로 바꿔줘서, url을 완성
import urllib.parse

import math
import tqdm

# 영화 데이터 수집

## 크롤링 함수 생성

### get_movie_info 영화 제목, 평점, 장르, 국가, 개봉일

In [3]:
# 영화 제목, 평점, 장르, 국가, 개봉일
def get_movie_info(movie_code) :
    '''
    영화 제목, 평점, 장르, 국가, 개봉일 받는 함수
    input : movie_code = 영화 코드
    return : mv_name_lst(이름), score_point_lst(관람객평점), spc_point_lst(기자평론가평점)
        net_point_lst(네티즌 평점), genre_lst(장르), nation_lst(국가), open_lst(개봉일)  
    '''
    
    mv_name_lst, score_point_lst, spc_point_lst, net_point_lst = [], [], [], []
    genre_lst, nation_lst, open_lst = [], [], []
    try :
        url = f"https://movie.naver.com/movie/bi/mi/point.naver?code={movie_code}"
        page = urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
    except HTTPError as e :
        print(f'{movie_code}, HTTP Error 입니다.', e)
    except URLError as e :
        print(f'{movie_code}, URL Error 입니다.', e)
    except :
        print(f'{movie_code}, 오류')
    else :
        # 영화 제목
        try :
            mv_info = soup.find('div', class_='mv_info')
            mv_name = mv_info.find('h3', class_='h_movie').a.get_text() .strip('\r\t\n')   
            mv_name_lst.append(mv_name)
        except :
            print(f'{movie_code}, 영화 제목 부분 오류')
        
        # 영화 평점            
        try :
            score = soup.find('a', id='actualPointPersentBasic').find_all('em')
            score_point = f"{score[0].get_text()}{score[1].get_text()}{score[2].get_text()}{score[3].get_text()}"
            score_point_lst.append(score_point)
        except :
            print(f'{movie_code}, 관람객 평점 부분 오류 ')

        try :
            spc_score = mv_info.find('div', class_='spc_score_area').find_all('em')[1:]
            spc_point = f"{spc_score[0].get_text()}{spc_score[1].get_text()}{spc_score[2].get_text()}{spc_score[3].get_text()}"
            spc_point_lst.append(spc_point)
        except :
            print(f'{movie_code}, 기자 평론가 평점 부분 오류 ')
        try :
            net_score = soup.find('a', id='pointNetizenPersentBasic').find_all('em')
            net_point = f"{net_score[0].get_text()}{net_score[1].get_text()}{net_score[2].get_text()}{net_score[3].get_text()}"
            net_point_lst.append(net_point)
        except :
            print(f'{movie_code}, 네티즌 평점 부분 오류 ')    
    
        
        # 영화 장르, 국가, 개봉일
        try :
            dl_tag = soup.find('dl', class_='info_spec')
            info = dl_tag.find_all('a')
        except :
            print(f'{movie_code}, 장르, 국가, 개봉일 접근 오류')
        else :
            try :
                for i in info :
                    if i['href'].find('genre') != -1 :
                        genre_lst.append(i.get_text().strip('\r\t\n'))
                    elif i['href'].find('nation') != -1 :
                        nation_lst.append(i.get_text().strip('\r\t\n'))
                    elif i['href'].find('open') != -1 :
                        open_lst.append(i.get_text().strip('\r\t\n'))
                    else :
                        pass
            except :
                print(f'{movie_code},  장르, 국가, 개봉일 접근 후 데이터 입력시 오류')
          
    return mv_name_lst, score_point_lst, spc_point_lst, net_point_lst, genre_lst, nation_lst, open_lst   

### get_actor_director 감독, 배우 가져오는 함수

In [4]:
def get_actor_director(movie_code) :
    '''
    영화 감독, 배우 가져오는 함수
    input : movie_code = 영화 코드
    return : actor_lst, director_lst : 배우, 감독을 담은 list형태
    '''
    actor_lst, director_lst = [], []
    try :
        url_ac = f"https://movie.naver.com/movie/bi/mi/detail.naver?code={movie_code}"
        page_ac = urlopen(url_ac)
        soup_ac = BeautifulSoup(page_ac, 'html.parser')
    except HTTPError as e :
        print(f'{movie_code}, HTTP Error 입니다.', e)
    except URLError as e :
        print(f'{movie_code}, URL Error 입니다.', e)
    except :
        print(f'{movie_code}, 오류')
    else :
        ###### 배우
        try :
            act = soup_ac.find('ul', class_ = 'lst_people')
            actor_name = act.find_all('a', class_='k_name')
        except :
            print('배우 태그 접근 오류')
        else :
            for a in actor_name :
                actor_lst.append(a.get_text().strip('\r\t\n'))

        ###### 감독
        try :        
            direc = soup_ac.find('div', class_='director')
            direc_name = direc.find_all('a', class_='k_name')
        except :
            print('감독 태그 접근 오류')
        else :
            for d in direc_name :
                director_lst.append(d.get_text().strip('\r\t\n'))
        
    return  actor_lst, director_lst

### get_prefer_info : 선호 연령 & 성별, 감상 포인트 가져오는 함수

In [6]:
def get_prefer_info(movie_code) :
    '''
    선호 연령, 성별, 감상 포인트 가져오는 함수
    input : movie_code = 영화 코드
    return : prefer_lst(선호 연령&성별), view_point_lst(감상포인트), directing_lst(연출 선호 비율) 
            acting_lst(연기 선호 비율), story_lst(스토리 선호 비율), v_beauty_lst(영상미 선호 비율), ost_lst(ost 선호 비율)
    '''
    prefer_lst, view_point_lst, directing_lst, acting_lst, story_lst, v_beauty_lst, ost_lst = [], [], [], [], [], [], []
    try :
        url_pv = f'https://movie.naver.com/movie/bi/mi/point.naver?code={movie_code}'
        page_pv = urlopen(url_pv)
        soup_pv = BeautifulSoup(page_pv, 'html.parser')
    except HTTPError as e :
        print(f'{movie_code}, HTTP Error 입니다.', e)
    except URLError as e :
        print(f'{movie_code}, URL Error 입니다.', e)
    except :
        print(f'{movie_code}, 오류')
    else :
        ## 선호 연령& 성별
        try :
            comment = soup_pv.find('strong', class_='grp_review').get_text()
        except :
            print('평점 없음')
        else :
            prefer_lst.append(comment[6:12])
            view_point_lst.append(comment[19:-12])


            # 감상포인트 선호 비율
            try : 
                view_point_rate = soup_pv.find('ul', class_='grp_point').find_all('span', class_='grp_score')
            except :
                print('감상 포인트 선호 비율 없음')
            else :
                directing_lst.append(view_point_rate[0].get_text().strip('\r\t\n'))
                acting_lst.append(view_point_rate[1].get_text().strip('\r\t\n'))
                story_lst.append(view_point_rate[2].get_text().strip('\r\t\n'))
                v_beauty_lst.append(view_point_rate[3].get_text().strip('\r\t\n'))
                ost_lst.append(view_point_rate[4].get_text().strip('\r\t\n'))
                
    return prefer_lst, view_point_lst, directing_lst, acting_lst, story_lst, v_beauty_lst, ost_lst

### get_comment : 영화 평점 comment, id 가져오는 함수

In [7]:
## 영화 평점, comment 가져오는 함수
def get_comment(movie_code) :
    '''
    영화 평점 id, comment를 가져오는 함수
    input : movie_code = 영화 코드
    return : {id, comment}의 dict형태를 담은 리스트
    '''
    page_no = 0
    comment_lst = []
    try :
        url_comment = f"https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code={movie_code}&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page={page_no+1}"
        page_comment = urlopen(url_comment)
        soup_comment = BeautifulSoup(page_comment, 'html.parser')
        total_comment = int(soup_comment.find('strong', class_='total').em.get_text().strip('\r\t\n').replace(',', ""))
        page_total_no = math.ceil(total_comment/10) 
    except HTTPError as e :
        print(f'{movie_code}, HTTP Error 입니다.', e)
    except URLError as e :
        print(f'{movie_code}, URL Error 입니다.', e)
    except :
        print(f'{movie_code}, 평점 코멘트 부분 오류')
    else :  
        page_total_no = math.ceil(total_comment/10)  

        for page_no in range(page_total_no) :
            try : 
                url_comment = f"https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code={movie_code}&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page={page_no+1}"
                page_comment = urlopen(url_comment)
                soup_comment = BeautifulSoup(page_comment, 'html.parser')
            except HTTPError as e :
                print(f'{movie_code}, HTTP Error 입니다.', e)
            except URLError as e :
                print(f'{movie_code}, URL Error 입니다.', e)
            else :    
                for i in range(10) :
                    try :
                        comment_tag = soup_comment.find_all('div', class_='score_reple')[i]
                        comment = comment_tag.find('span', id=f'_filtered_ment_{i}').get_text().strip('\r\t\n')
                        comment_id = comment_tag.find_all('span')[-1].get_text().strip('\r\t\n')

                        # id와 comment를 dict형식으로 넣는다.
                        comment_dict = dict({comment_id : comment})
                        comment_lst.append(comment_dict)
                    except :
                        pass  
    return comment_lst

### get_related_movie : 관련 영화 가져오기

In [8]:
def get_related_movie(movie_code) :
    '''
    관련 영화 가져오는 함수
    input : movie_code = 영화 코드
    return : 
    '''
    relate_movie_lst = []
    try :
        url_relate = f"https://movie.naver.com/movie/bi/mi/scriptAndRelate.naver?code={movie_code}"
        page_relate = urlopen(url_relate)
        soup_relate = BeautifulSoup(page_relate, 'html.parser')
    except HTTPError as e :
        print(f'{movie_code}, HTTP Error 입니다.', e)
    except URLError as e :
        print(f'{movie_code}, URL Error 입니다.', e)
    except :
        print(f'{movie_code}, 오류')
    else :
        try :
            rm_lst = soup_relate.find_all('a', class_='movie_title')
            for rmovie in rm_lst :
                relate_movie_lst.append(rmovie.get_text().strip('\r\t\n'))
        except :
            print(f'{movie_code}, 관련 영화 관련 오류')
    return relate_movie_lst

In [9]:
movie = [92132, 205966, 92341, 191597, 100001]

for i in movie :
#     print(get_related_movie(i))
#     print(get_comment(i))
    print(get_prefer_info(i))
    print(get_movie_info(i))

평점 없음
([], [], [], [], [], [], [])
92132, 관람객 평점 부분 오류 
92132, 기자 평론가 평점 부분 오류 
92132, 네티즌 평점 부분 오류 
(['갱스터 스쿼드'], [], [], [], ['범죄', '드라마'], ['미국'], [])
(['40대 여성'], ['연기'], ['24%'], ['27%'], ['19%'], ['23%'], ['7%'])
(['아이스 로드'], ['8.38'], ['5.33'], ['7.83'], ['스릴러'], ['미국'], [' 2021', '.07.21'])
평점 없음
([], [], [], [], [], [], [])
92341, 관람객 평점 부분 오류 
92341, 기자 평론가 평점 부분 오류 
92341, 네티즌 평점 부분 오류 
(['노 룩 패스'], [], [], [], ['다큐멘터리', '가족'], ['미국'], [])
(['40대 남성'], ['연출'], ['33%'], ['26%'], ['19%'], ['13%'], ['9%'])
(['보스 베이비 2'], ['9.59'], ['5.50'], ['9.25'], ['애니메이션', '코미디', '모험'], ['미국'], [' 2021', '.07.21'])
평점 없음
([], [], [], [], [], [], [])
100001, 관람객 평점 부분 오류 
100001, 기자 평론가 평점 부분 오류 
100001, 네티즌 평점 부분 오류 
(['와라! 편의점'], [], [], [], ['애니메이션', '코미디'], ['한국'], [])


## DataFrame 생성

In [161]:
cols =['영화코드', '영화제목', '관람객평점', '기자평론가평점', '네티즌평점', '장르', '국가', '개봉일', 
        '배우', '감독', '선호연령성별', '감상포인트', '연출선호', '연기선호', '스토리선호', '영상미선호', 'ost선호',
      'comment', '관련영화']

df = pd.DataFrame(np.zeros((230000, 19)), columns=cols)
df

Unnamed: 0,영화코드,영화제목,관람객평점,기자평론가평점,네티즌평점,장르,국가,개봉일,배우,감독,선호연령성별,감상포인트,연출선호,연기선호,스토리선호,영상미선호,ost선호,comment,관련영화
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
movie = list(range(10000, 207447))

for i, movie_code in tqdm.tqdm(enumerate(movie)):
    print(f'-------------------------{i}번째------------------------------')
    if i % 10000 == 0:
        df.to_csv(f'./crawling/df_{i}.csv', index=False)
    print(i, movie_code)
    mv_name, score_point, spc_point, net_point, genre, nation, opend = get_movie_info(movie_code)
    if len(mv_name) == 0 :
        pass
    else :
        actor, director = get_actor_director(movie_code)
        prefer, view_point, directing, acting, story, v_beauty, ost = get_prefer_info(movie_code)
        comment = get_comment(movie_code)
        relate_movie = get_related_movie(movie_code)

        data_lst = [movie_code, mv_name, score_point, spc_point, net_point, genre, nation, opend,
                   actor, director,
                   prefer, view_point, directing, acting, story, v_beauty, ost,
                   comment, relate_movie]                                     
        for j, data in enumerate(data_lst) :
            if j == 0 :
                df.iloc[i, j] = data
            elif len(data) == 1 :
                df.iloc[i, j] = data[0]
            elif len(data) > 1 :
                df.iloc[i, j] = str(data).strip("[]")
            else :
                pass

print('---------끝---------------------')
df.to_csv('./crawling/final.csv', index=False)

0it [00:00, ?it/s]

-------------------------0번째------------------------------


1it [00:01,  1.38s/it]

0 2
2, 영화 제목 부분 오류
2, 관람객 평점 부분 오류 
2, 기자 평론가 평점 부분 오류 
2, 네티즌 평점 부분 오류 
2, 장르, 국가, 개봉일 접근 오류
-------------------------1번째------------------------------
1 188466


2it [00:02,  1.48s/it]

-------------------------2번째------------------------------
2 3
3, 영화 제목 부분 오류
3, 관람객 평점 부분 오류 
3, 기자 평론가 평점 부분 오류 
3, 네티즌 평점 부분 오류 
3, 장르, 국가, 개봉일 접근 오류
-------------------------3번째------------------------------
3 186107
186107, 관람객 평점 부분 오류 
186107, 기자 평론가 평점 부분 오류 
186107, 네티즌 평점 부분 오류 
배우 태그 접근 오류
평점 없음
186107, 평점 코멘트 부분 오류


4it [00:04,  1.06it/s]

-------------------------4번째------------------------------
4 4
4, 영화 제목 부분 오류
4, 관람객 평점 부분 오류 
4, 기자 평론가 평점 부분 오류 
4, 네티즌 평점 부분 오류 
4, 장르, 국가, 개봉일 접근 오류
-------------------------5번째------------------------------
5 120310
120310, 관람객 평점 부분 오류 
120310, 기자 평론가 평점 부분 오류 
120310, 네티즌 평점 부분 오류 
평점 없음
120310, 평점 코멘트 부분 오류


6it [00:05,  1.27it/s]

-------------------------6번째------------------------------
6 196049
196049, 관람객 평점 부분 오류 
196049, 기자 평론가 평점 부분 오류 
196049, 네티즌 평점 부분 오류 
평점 없음
196049, 평점 코멘트 부분 오류


9it [00:06,  1.87it/s]

-------------------------7번째------------------------------
7 3
3, 영화 제목 부분 오류
3, 관람객 평점 부분 오류 
3, 기자 평론가 평점 부분 오류 
3, 네티즌 평점 부분 오류 
3, 장르, 국가, 개봉일 접근 오류
-------------------------8번째------------------------------
8 21
21, 영화 제목 부분 오류
21, 관람객 평점 부분 오류 
21, 기자 평론가 평점 부분 오류 
21, 네티즌 평점 부분 오류 
21, 장르, 국가, 개봉일 접근 오류
-------------------------9번째------------------------------
9 45


10it [00:06,  1.48it/s]


45, 영화 제목 부분 오류
45, 관람객 평점 부분 오류 
45, 기자 평론가 평점 부분 오류 
45, 네티즌 평점 부분 오류 
45, 장르, 국가, 개봉일 접근 오류
---------끝---------------------


## 결과 출력

In [11]:
df=pd.read_csv('./crawling/df_3000.csv')
df = df.iloc[:3000, :].copy()

df = df[df['영화코드']!=0].copy()
df.to_csv('./crawling/df_3000.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
df=pd.read_csv('./crawling/df_3000.csv')
df

Unnamed: 0.1,Unnamed: 0,영화코드,영화제목,관람객평점,기자평론가평점,네티즌평점,장르,국가,개봉일,배우,감독,선호연령성별,감상포인트,연출선호,연기선호,스토리선호,영상미선호,ost선호,comment,관련영화
0,0,10801.0,비버리 힐의 낮과 밤,0.0,0.0,0.00,코미디,미국,0.0,"'닉 놀테', '베트 미들러', '리차드 드레이퓨즈', '리틀 리처드', '트레이시...",폴 마주르스키,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'사랑의 울타리', '블랙 위도우', '랑종', '잠복근무', '캣츠 앤 독스 2'..."
1,1,10802.0,김의 전쟁,0.0,0.0,8.60,드라마,한국,"' 1992', '.02.29'","'유인촌', '이혜숙', '김복희', '김형일', '신우철', '심우창', '김형진...",김영빈,40대 남성,"스토리, 연기",20%,40%,40%,0%,0%,"{'kos4****': '당시 이 영화를 보고 큰 충격을 받았었는데... '}, {...","'사랑이 꽃피는 나무', '나에게 오라', '비상구가 없다', '세자매', '다우트..."
2,2,10803.0,황무지,0.0,0.0,0.00,"'범죄', '스릴러', '드라마'",미국,0.0,"'마틴 쉰', '씨씨 스페이식', '워렌 오티스', '라몬 비어리', '앨런 빈트'...",테렌스 맬릭,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'블랙 위도우: 파이널 챕터', '인질', '살인의 추억', '마약전쟁', '천재 ..."
3,3,10804.0,롤라 몽테스,0.0,0.0,0.00,드라마,"'프랑스', '독일(구 서독)'",0.0,"'마르틴 캐롤', '피터 유스티노프', '안톤 월브룩', '헨리 귀솔', '리스 데...",막스 오퓔스,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'마담D', '위험한 관계', '로렌조 오일', '쿼바디스', '사해살인사건', '..."
4,4,10805.0,101마리의 달마시안 개,0.0,0.0,0.00,"'애니메이션', '모험', '가족'",미국,0.0,"'로드 테일러', 'J. 팻 오말리', '베티 루 거슨', '마사 웬트워스', '벤...","'클라이드 제로니미', '해밀턴 러스크', '울프강 라이트만'",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'피터 팬', '보스 베이비 2', '겨울왕국 2', '센과 치히로의 행방불명', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2975,2995,13796.0,골목대장 형래와 검은 망또,0.0,0.0,8.14,"'가족', '액션'",한국,"' 1992', '.06.14'","'심형래', '정선경', '조금산', '이정희', '황인조', '정봉연', '안남'...",이재규,20대 남성,"연출, 연기",50%,50%,0%,0%,0%,"{'wnrd****': '어렸을때 존장 재밌었다. '}, {'maxp****': '...",0.0
2976,2996,13797.0,두만강아 잘 있거라,0.0,0.0,6.00,"'액션', '전쟁'",한국,"' 1962', ''","'김석훈', '문정숙', '엄앵란', '황해', '박노식', '장혁', '황정순',...",임권택,20대 남성,,0%,0%,0%,0%,0%,"{'LP(ogr5****)': '임권택 감독의 시작은 뿌리부터 굳건했다. '}, {...","'밀정 1930', '장군의 아들', '더 킹: 독립 전쟁', '아바타', '300..."
2977,2997,13798.0,프리찌스 오너,0.0,0.0,8.26,"'코미디', '범죄', '드라마', '멜로/로맨스'",미국,"' 1986', '.08.30'","'잭 니콜슨', '캐서린 터너', '로버트 로지아', '존 랜돌프', '윌리엄 히키...",존 휴스턴,20대 남성,"스토리, OST, 연기",0%,33%,33%,0%,33%,"{'그라마(bbq2****)': '피와 돈으로 얼룩진 가문의 영광이라 '}, {'a...","'제2의 연인', '어바웃 슈미트', '맨 트러블', '바람둥이 길들이기', '이보..."
2978,2998,13799.0,마지막 챔프,0.0,0.0,0.00,0.0,남아프리카 공화국,0.0,"'로버트 본', '레아 핀센트', '제임스 라이언', '사이몬 폴란드', '윌슨 던...",쿠스 로츠,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df=pd.read_csv('./crawling/df_17800.csv')
df = df[df['영화코드']!=0.0].copy()
df.to_csv('./crawling/df_17800.csv', index=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
df=pd.read_csv('./crawling/df_17800.csv')
df

Unnamed: 0,영화코드,영화제목,관람객평점,기자평론가평점,네티즌평점,장르,국가,개봉일,배우,감독,선호연령성별,감상포인트,연출선호,연기선호,스토리선호,영상미선호,ost선호,comment,관련영화
0,10801.0,비버리 힐의 낮과 밤,0.0,0.0,0.0,코미디,미국,0.0,"'닉 놀테', '베트 미들러', '리차드 드레이퓨즈', '리틀 리처드', '트레이시...",폴 마주르스키,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'사랑의 울타리', '블랙 위도우', '랑종', '잠복근무', '캣츠 앤 독스 2'..."
1,10802.0,김의 전쟁,0.0,0.0,8.6,드라마,한국,"' 1992', '.02.29'","'유인촌', '이혜숙', '김복희', '김형일', '신우철', '심우창', '김형진...",김영빈,40대 남성,"스토리, 연기",20%,40%,40%,0%,0%,"{'kos4****': '당시 이 영화를 보고 큰 충격을 받았었는데... '}, {...","'사랑이 꽃피는 나무', '나에게 오라', '비상구가 없다', '세자매', '다우트..."
2,10803.0,황무지,0.0,0.0,0.0,"'범죄', '스릴러', '드라마'",미국,0.0,"'마틴 쉰', '씨씨 스페이식', '워렌 오티스', '라몬 비어리', '앨런 빈트'...",테렌스 맬릭,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'블랙 위도우: 파이널 챕터', '인질', '살인의 추억', '마약전쟁', '천재 ..."
3,10804.0,롤라 몽테스,0.0,0.0,0.0,드라마,"'프랑스', '독일(구 서독)'",0.0,"'마르틴 캐롤', '피터 유스티노프', '안톤 월브룩', '헨리 귀솔', '리스 데...",막스 오퓔스,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'마담D', '위험한 관계', '로렌조 오일', '쿼바디스', '사해살인사건', '..."
4,10805.0,101마리의 달마시안 개,0.0,0.0,0.0,"'애니메이션', '모험', '가족'",미국,0.0,"'로드 테일러', 'J. 팻 오말리', '베티 루 거슨', '마사 웬트워스', '벤...","'클라이드 제로니미', '해밀턴 러스크', '울프강 라이트만'",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'피터 팬', '보스 베이비 2', '겨울왕국 2', '센과 치히로의 행방불명', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16873,28593.0,애증의 덫,0.0,0.0,0.0,"'공포', '스릴러', '미스터리'",미국,0.0,"'튜즈데이 웰드', '조안 해킷', '샘 워터스톤', '루실 벤슨', '마이클 러너...",존 바담,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16874,28594.0,칠전사,0.0,0.0,0.0,0.0,홍콩,0.0,"'막소총', '정소추', '장학우', '홍금보', '양조위', '모순균', '우마'...","'홍금보', '당기명'",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'독존자', '베스트 이즈 하이스트', '노랑구소', '증인', '전장', '마고일..."
16875,28597.0,해룡이와 달자의 추억의 책가방,0.0,0.0,0.0,"'가족', '코미디'",한국,0.0,"'임하룡', '김현영', '조금산', '김보라나', '최승경', '김국진', '최영...",조종우,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16876,28599.0,핫 스윗,0.0,0.0,0.0,스릴러,이탈리아,0.0,"'도널드 플레젠스', '소니아 라울', '시러스 일라이어스', '안나 갈리에나', ...",카를로 반지나,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"'블랙 위도우', '랑종', '황해', '추격자', '더 퍼지: 포에버', '50가..."


In [21]:
df['영화코드'].nunique()

16878