# 네이버 영화 리플 크롤링 및 감성분석

### Naver Movie Review ("Reple") Crawling and Sentiment Analysis

## 1. 네이버 영화 크롤링 (별점,리플 -> 영화 제목별 파일)

#### step 1. Crawling ( score, reple -> .csv file )
#### start url : https://movie.naver.com/movie/running/current.nhn?order=reserve

In [None]:
# input : url, timeout (seconds, optional)
# return: bs4.BeautifulSoup
def requestPage(url, timeout=10):
    """Download ``url`` and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Defaults to 10.

    Returns:
        A ``bs4.BeautifulSoup`` object built from the response text.
    """
    # Without a timeout requests.get can wait forever on a stalled connection,
    # which would freeze a crawl spanning thousands of pages.
    response = requests.get(url, timeout=timeout)
    return bs4.BeautifulSoup(response.text, 'lxml')

# input : url
# return: movie code
def getMovieCode(url) :
    """Collect the movie codes linked from the listing page at ``url``.

    Every anchor matched by 'ul.lst_detail_t1 > li > div.thumb > a' contributes
    one entry, in document order.

    Returns:
        A list of strings, one per matched anchor.
    """
    thumb_links = requestPage(url).select('ul.lst_detail_t1 > li > div.thumb > a')
    # The code is the piece of the href between its first and second '='
    return [link.attrs['href'].strip().split('=')[1] for link in thumb_links]

# input : url
# return: movie name
def getMovieName(url) :
    """Collect the movie titles shown on the listing page at ``url``.

    Every anchor matched by 'ul.lst_detail_t1 > li > dl.lst_dsc > dt.tit > a'
    contributes one entry, in document order.

    Returns:
        A list of titles, whitespace-trimmed and with every ':' removed.
    """
    title_links = requestPage(url).select('ul.lst_detail_t1 > li > dl.lst_dsc > dt.tit > a')
    return [link.text.strip().replace(':', '') for link in title_links]

In [16]:
# input : soup
# return: (score, reple) in a page
def onePageData(soup):
    """Extract the (score, review text) pairs found in one parsed page.

    Args:
        soup: Parsed page; only its ``select`` method is used.

    Returns:
        A list of ``(score_text, reple_text)`` tuples. The n-th score element
        is paired with the n-th review element, counted across all
        'div.score_result' containers on the page.

    Raises:
        IndexError: If fewer review elements than score elements were found.
    """
    scores, reples = [], []
    for container in soup.select('div.score_result'):
        scores.extend(container.select('div.star_score > em'))
        reples.extend(container.select('div.score_reple > p'))
    # Pairing is purely positional
    return [(score.text, reples[pos].text) for pos, score in enumerate(scores)]

# input : movie_code
# output: total reple_pages of movie
def getPageCnt(movie_code):
    """Return how many review pages exist for ``movie_code``.

    Fetches page 1 of the movie's review list, reads the total review count
    from 'div.score_total > strong.total > em' and converts it to a page
    count, using 10 reviews per page (the divisor this function has always
    used).

    Args:
        movie_code: Movie code inserted into the review-list URL.

    Returns:
        The number of review pages as an int; 0 when the page contains no
        total-count element.
    """
    url = f'https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code={movie_code}&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page=1'
    # A timeout keeps one stalled request from blocking the whole crawl
    response = requests.get(url, timeout=10)
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    total_tags = soup.select('div.score_total > strong.total > em')
    if not total_tags:
        # No counter on the page: report zero pages instead of raising IndexError
        return 0
    # The counter uses thousands separators, e.g. '1,234'
    total_score = int(total_tags[0].text.replace(',', ''))
    # Ceiling division: a partially filled last page still counts as a page
    return (total_score + 9) // 10

In [1]:
import bs4
import requests
import time
import pandas as pd # 데이터 저장

In [18]:
# Fetch the movie codes and movie names from the listing page.
# NOTE(review): each helper downloads the page separately, and later cells pair
# the two lists by index; this assumes both selectors return the movies in the
# same order and count — confirm.
url = 'https://movie.naver.com/movie/running/current.nhn?order=reserve'
movie_list = getMovieCode(url)   # codes used to build the review-list URLs
file_list = getMovieName(url)    # names used as the per-movie CSV file names
n_movies = len(movie_list)

In [20]:
# Save the reviews of each movie to ./movie_reples/<movie name>.csv
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs('./movie_reples', exist_ok=True)

for idx in range(n_movies):
    pages = getPageCnt(movie_list[idx])
    print( f'{file_list[idx]} : {pages} reple pages')
    save_file = f'./movie_reples/{file_list[idx]}.csv'

    for i in range(pages):
        # Pause for one second before every 20th page (including the first)
        if i % 20 == 0:
            time.sleep(1)
        # if i == 5: break  # cap the number of pages while testing

        site = f'https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code={movie_list[idx]}&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page={i+1}'
        soup = requestPage(site)
        data = onePageData(soup)

        # Naming the columns at construction also works when the page yields
        # no rows; assigning df.columns on an empty frame raises ValueError.
        df = pd.DataFrame(data, columns=['score', 'reple'])
        if i == 0:
            # First page creates the file and writes the header row
            df.to_csv(save_file, index=False, encoding='utf-8-sig')
        else:
            # Later pages are appended without repeating the header
            df.to_csv(save_file, header=False,
                      index=False, mode='a', encoding='utf-8-sig')

돈 : 55 reple pages
캡틴 마블 : 3022 reple pages
우상 : 24 reple pages
악질경찰 : 19 reple pages
이스케이프 룸 : 143 reple pages
라스트 미션 : 44 reple pages
그린 북 : 451 reple pages
더 페이버릿 여왕의 여자 : 150 reple pages
아사코 : 21 reple pages
극한직업 : 4448 reple pages
살인마 잭의 집 : 35 reple pages
더 와이프 : 24 reple pages
숲속왕국의 꿀벌 여왕 : 5 reple pages
항거유관순 이야기 : 636 reple pages
철벽선생 : 12 reple pages
가버나움 : 236 reple pages
증인 : 1015 reple pages
1919 유관순 : 22 reple pages
그때 그들 : 4 reple pages
칠곡 가시나들 : 33 reple pages
국경의 왕 : 6 reple pages
에브리타임 룩 앳 유 : 3 reple pages
콜드 워 : 27 reple pages
나는 다른 언어로 꿈을 꾼다 : 4 reple pages
킹 오브 프리즘 -샤이니 세븐 스타즈- : 6 reple pages
사바하 : 1168 reple pages
로마 : 193 reple pages
일일시호일 : 75 reple pages
리노 : 7 reple pages
러브 라이브! 선샤인!! 더 스쿨 아이돌 무비 오버 더 레인보우 : 27 reple pages
보헤미안 랩소디 : 3858 reple pages
빠삐용 : 25 reple pages
도쿄의 밤하늘은 항상 가장 짙은 블루 : 22 reple pages
히치하이크 : 2 reple pages
신데렐라마법 반지의 비밀 : 92 reple pages
님포매니악 볼륨2 : 38 reple pages
내가 사는 세상 : 4 reple pages
님포매니악 볼륨1 : 46 reple pages
인생 후르츠 : 107 reple pages

## 2. 크롤링 데이터 통합

#### step 2. Data Integration

In [2]:
import os

In [3]:
# Make the list of saved per-movie files.
# os.listdir returns every entry of the directory in arbitrary order, so keep
# only the .csv files (anything else would break pd.read_csv in the merge
# step) and sort them so the merged file is built in a stable order.
read_file_list = sorted(
    name for name in os.listdir('./movie_reples') if name.endswith('.csv')
)

In [19]:
# Concatenate every per-movie CSV into one file: the first file creates the
# output together with its header row, every later file is appended without one.
save_file = './movie_total_reples.csv'
n_file = len(read_file_list)
for i, csv_name in enumerate(read_file_list):
    file_name = f'./movie_reples/{csv_name}'
    data = pd.read_csv(file_name)
    is_first = (i == 0)
    data.to_csv(save_file, header=is_first, index=False,
                mode='w' if is_first else 'a', encoding='utf-8-sig')

## 3. 수집된 평점으로 감성분석

#### step 3. Sentiment Analysis

In [25]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pickle
from konlpy.tag import *  
# pip install konlpy 에러 시 Visual Studio 설치(Windows)

In [None]:
# Load the merged review file written by the data-integration step
df = pd.read_csv('./movie_total_reples.csv')
#df.head(5)  # uncomment to preview the first rows

# Preprocessing
def text_preprocessing(text):
    """Remove the leading '관람객' marker from a review text.

    Args:
        text: Review text. Non-string values are tolerated because pandas
            reads an empty CSV field as NaN (a float).

    Returns:
        ``text`` without a leading '관람객'; the unchanged text when the marker
        is absent; ``''`` when ``text`` is not a string.
    """
    marker = '관람객'
    if not isinstance(text, str):
        # NaN / None have no startswith(); treat them as an empty review
        return ''
    if text.startswith(marker):
        return text[len(marker):]
    return text
def score_preprocessing(text):
    """Turn a rating into a binary sentiment label.

    Args:
        text: Rating, given as anything ``int()`` accepts.

    Returns:
        The string '0' for a rating of 5 or lower, otherwise the string '1'.
    """
    return '1' if int(text) > 5 else '0'
# Rows with a missing score or review text are dropped first: pandas reads an
# empty CSV field as NaN (a float), which neither int() nor str.startswith
# can handle. .copy() makes the result an independent frame so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['score', 'reple']).copy()
# Call the helper once for every value of the column
df['reple'] = df['reple'].apply(text_preprocessing)
df['score'] = df['score'].apply(score_preprocessing)
df.to_csv('./movie_total_reples_pre.csv', index=False, encoding='utf-8-sig')
print('전처리 완료')

In [44]:
# train, test split
df = pd.read_csv('./movie_total_reples_pre.csv')
score_list = df['score'].tolist()
reple_list = df['reple'].tolist()

# 70 % train / 30 % test. The fixed random_state makes the shuffle repeatable,
# so re-running this cell writes the same train.csv and test.csv.
reple_train, reple_test, score_train, score_test = train_test_split(
    reple_list, score_list, test_size=0.3, random_state=42)

# train, test to files
dic_train = {'score': score_train, 'reple': reple_train}
dic_test = {'score': score_test, 'reple': reple_test}

df_train = pd.DataFrame(dic_train)
df_test = pd.DataFrame(dic_test)

df_train.to_csv('train.csv', index=False, encoding='utf-8-sig')
df_test.to_csv('test.csv', index=False, encoding='utf-8-sig')

In [48]:
import time

# Load the split written by the previous step. An empty review field is read
# back from CSV as NaN, which is not a string and cannot be tokenized, so rows
# without review text are dropped.
train_df = pd.read_csv('train.csv').dropna(subset=['reple'])
test_df = pd.read_csv('test.csv').dropna(subset=['reple'])

X_train = train_df['reple'].tolist()
y_train = train_df['score'].tolist()
X_test = test_df['reple'].tolist()
y_test = test_df['score'].tolist()

# Morphological analyzer (Okt), created once and shared by the tokenizer
okt = Okt()
def text_tokenizer(text):
    """Split ``text`` into morphemes with the shared Okt instance."""
    return okt.morphs(text)

# Vocabulary / TF-IDF features; tokens come from text_tokenizer and case is kept
word_mod = TfidfVectorizer(lowercase=False, tokenizer=text_tokenizer)
# Classifier with L2 regularisation
class_mod = LogisticRegression(C=10.0, penalty='l2')
# Chain vectorizer and classifier into one pipeline
pipe = Pipeline([('vect',word_mod),('clf',class_mod)] )

# Train; perf_counter is a monotonic clock intended for measuring durations
start_time = time.perf_counter()
print('start')
pipe.fit(X_train, y_train)
print('end')
end_time = time.perf_counter()
print(f'time : {end_time - start_time}')

# Evaluate on the held-out test set
y_pred = pipe.predict(X_test)
print(f'정확도 : {accuracy_score(y_test, y_pred)}')

# Save the fitted pipeline
with open('pipe.dat', 'wb') as fp:
    pickle.dump(pipe, fp)
print('save complete')

start




end
time : 242.42454600334167
정확도 : 0.917950555356503
save complete


In [5]:
# Use the saved model
import pickle
import numpy as np
from konlpy.tag import Okt

# Build the analyzer once instead of constructing a new Okt() on every
# tokenizer call.
okt = Okt()

def text_tokenizer(text):
    """Split ``text`` into morphemes with the shared Okt instance.

    The pipeline was built with ``tokenizer=text_tokenizer`` and pickle stores
    functions by name, so a function with this name must exist before
    ``pipe.dat`` is loaded.
    """
    return okt.morphs(text)

# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load a pipe.dat that was produced by the training cell of this notebook.
with open('pipe.dat', 'rb') as fp:
    pipe = pickle.load(fp)

text = input('리뷰 작성 : ')
str1 = [text]

# Highest class probability, scaled to percent
r1 = np.max(pipe.predict_proba(str1) * 100)
r2 = pipe.predict(str1)[0]

# int() makes the check hold whether the labels were learned as 1 or '1'
if int(r2) == 1:
    print('긍정적')
else:
    print('부정적')
# NOTE(review): the label reads '정확도' (accuracy), but r1 is the predicted
# class probability for this one input — consider renaming the label.
print(f'정확도 : {r1}')

리뷰 작성 : 재미있어요
긍정적
정확도 : 99.85337898382643
