# [과제1] 네이버 평점 데이터 수집

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import os
import re
import random
import time
import pandas as pd
import csv

In [2]:
# Start the Chrome browser session shared by every crawling function below,
# using the chromedriver binary located in the working directory.
# NOTE(review): newer Selenium releases expect the driver path to be passed
# through a Service object — confirm against the installed Selenium version.
driver = webdriver.Chrome('./chromedriver')
# Let element lookups wait up to 3 seconds before giving up.
driver.implicitly_wait(3)

In [3]:
# Load the list of users to crawl. get_review() and main() read the
# 'reviewNo' and 'userId' columns of this frame.
user_df = pd.read_csv('./data/naver_user.csv')

In [17]:
def get_user_list(limit):
    """Collect up to ``limit`` unique (review number, user id) pairs.

    Walks the Naver movie rating list one page at a time through the
    module-level ``driver`` and records the first review seen for each
    user id (with the ``*`` masking characters removed).

    Args:
        limit: Maximum number of (review number, user id) tuples to return.

    Returns:
        A list of ``(review, user)`` string tuples, at most ``limit`` long.
        It can be shorter when the site runs out of rows.
    """
    page = 1
    user_list = []
    seen_users = set()  # O(1) duplicate check instead of rescanning user_list

    # Strict '<' so no extra page is requested once enough users are collected
    # (and no request at all is made for limit <= 0).
    while len(user_list) < limit:
        driver.get('https://movie.naver.com/movie/point/af/list.nhn?&page={0}'.format(page))

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        review_cells = soup.select('#old_content > table > tbody > tr > td.ac.num')
        author_links = soup.select('table.list_netizen > tbody > tr > td > a.author')

        # A page without any rows means there is nothing left to read;
        # stop instead of requesting pages forever.
        if not review_cells:
            break

        # zip() pairs each review cell with its author link and cannot index
        # past the end if the two selections differ in length.
        for cell, author in zip(review_cells, author_links):
            review = str(cell).replace('<td class="ac num">', '').replace('</td>', '')
            user = author.text.replace('*', '')
            if user not in seen_users:
                seen_users.add(user)
                user_list.append((review, user))

        # Random pause between page requests.
        time.sleep(random.randrange(2, 5))
        page += 1
    return user_list[:limit]

In [5]:
def get_movie_link(soup):
    """Return absolute rating-list URLs built from the links found in ``soup``.

    Every ``<a href>`` whose href ends with ``&target=after`` is turned into
    an absolute URL by prefixing the rating-list address. The first match is
    then discarded and the remaining URLs are returned in page order.

    Args:
        soup: Parsed page exposing ``select()`` (a BeautifulSoup document).

    Returns:
        A list of URL strings, without the first matching link.
    """
    base_url = 'https://movie.naver.com/movie/point/af/list.nhn'

    # The previous condition was
    #     re.search(r'st=mcode&sword' and r'&target=after$', href)
    # `a and b` on two non-empty strings evaluates to `b`, so only the
    # '&target=after$' suffix was ever tested. The dead operand is removed so
    # the code states what it really checks.
    # NOTE(review): if links must also contain 'st=mcode&sword', add that
    # check here AND re-verify the `[1:]` slice below — it presumably drops a
    # non-movie link that only the suffix test lets through; confirm on a
    # live page before tightening the filter.
    movie_links_list = [
        base_url + str(link['href'])
        for link in soup.select('a[href]')
        if re.search(r'&target=after$', link['href'])
    ]

    return movie_links_list[1:]

In [6]:
def get_review(reviewNo):
    """Crawl the ratings of the user who wrote review ``reviewNo``.

    Requests the nickname-filtered rating list for ``reviewNo`` page by page
    through the module-level ``driver``, appends one
    ``[userId, rating, movieId]`` row per rating to ``./data/rating.csv`` and
    pauses for a random interval after each page.

    Args:
        reviewNo: Review number; it must be present in ``user_df['reviewNo']``.

    Returns:
        A one-element list ``[{'userId': ..., 'rating': [...],
        'movieId': [...]}]`` holding the ratings and movie ids (as strings
        taken from the page) gathered over all pages.

    Raises:
        IndexError: If ``reviewNo`` has no matching row in ``user_df``.
    """
    list_url = ('https://movie.naver.com/movie/point/af/list.nhn'
                '?st=nickname&sword={0}&target=after&page={1}')
    movie_link_prefix = ('https://movie.naver.com/movie/point/af/list.nhn'
                         '?st=mcode&sword=')

    # The user id does not depend on the page, so look it up once.
    user_name = user_df[user_df['reviewNo'] == reviewNo]['userId'].iloc[0]

    collected_ratings = []
    collected_movie_ids = []
    page = 1

    # The user may have several pages of ratings; keep going until the page
    # no longer offers a "next" link.
    while True:
        driver.get(list_url.format(reviewNo, page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Non-empty when the pager contains a "next page" link.
        has_next_page = soup.select('#old_content > div.paging > div > a.pg_next')

        # Movie ids are what remains of each link once the fixed URL parts
        # are stripped.
        movie_ids = [
            link.replace(movie_link_prefix, '').replace('&target=after', '')
            for link in get_movie_link(soup)
        ]

        # Rating cells, reduced to their inner text.
        ratings = [
            str(cell).replace('<td class="point">', '').replace('</td>', '')
            for cell in soup.select('#old_content > table > tbody > tr > td.point')
        ]

        review_cells = soup.select('#old_content > table > tbody > tr > td.ac.num')

        # Count the reviews numbered above reviewNo and drop that many leading
        # entries, as before (one front removal per newer review).
        # Presumably the page lists the newest reviews first — TODO confirm.
        newer_count = sum(
            1
            for _, cell in zip(ratings, review_cells)
            if int(str(cell).replace('<td class="ac num">', '').replace('</td>', '')) > reviewNo
        )
        ratings = ratings[newer_count:]
        movie_ids = movie_ids[newer_count:]

        # Append this page's rows; the context manager closes the file on
        # every path, which the old trailing f.close() never did.
        with open('./data/rating.csv', 'a', newline='') as f:
            csv.writer(f).writerows(
                [user_name, rating, movie_id]
                for rating, movie_id in zip(ratings, movie_ids)
            )

        # Accumulate across pages so the return value covers every page, not
        # just the last one visited.
        collected_ratings.extend(ratings)
        collected_movie_ids.extend(movie_ids)

        # Random pause before the next request.
        time.sleep(random.randrange(3, 5))

        if not has_next_page:
            break
        page += 1

    return [{'userId': user_name, 'rating': collected_ratings, 'movieId': collected_movie_ids}]

In [7]:
def main():
    """Crawl and store the ratings for every review number in ``user_df``."""
    review_numbers = user_df['reviewNo']
    for review_no in review_numbers:
        get_review(review_no)


if __name__ == "__main__":
    main()

#### csv 파일에 저장된 정보에 column명 추가해주기

In [8]:
# rating.csv is written by get_review() as bare rows without a header line,
# so read it with header=None and supply the column names here.
newrating = pd.read_csv('./data/rating.csv', header = None,names = ['userId', 'rating', 'movieId'])
# Preview the first five rows.
newrating.head(5)

Unnamed: 0,userId,rating,movieId
0,airf,2,136900
1,airf,10,163788
2,airf,10,174065
3,nanw,10,154667
4,nanw,10,136900


### 중복 제거

In [9]:
# Drop rows that are exact duplicates. rating.csv is opened in append mode by
# get_review(), so repeated crawls can leave the same row in the file twice.
newrating = newrating.drop_duplicates()

In [10]:
# Display the de-duplicated ratings table.
newrating

Unnamed: 0,userId,rating,movieId
0,airf,2,136900
1,airf,10,163788
2,airf,10,174065
3,nanw,10,154667
4,nanw,10,136900
5,nanw,10,145335
6,nanw,9,120141
7,nanw,9,54704
8,zxcv,7,113351
9,zxcv,10,86507


#### 5명의 정보 추가하기 전에 타입 검사

In [11]:

# Show the Python type of the first value in each column, one per line, so
# the manually added rows can be given matching types.
print(
    *(type(newrating[column][0]) for column in ('userId', 'rating', 'movieId')),
    sep='\n',
)

<class 'str'>
<class 'numpy.int64'>
<class 'numpy.int64'>


#### 추가된 5명의 정보를 DataFrame으로 만들어주기

In [12]:
# Five extra users added by hand, written as one (userId, rating, movieId)
# record per row.
sub_df = pd.DataFrame(
    [
        ('patl', 10, 161967),
        ('yoya', 10, 161967),
        ('ebc8', 1, 174065),
        ('imag', 10, 161967),
        ('glab', 8, 137327),
    ],
    columns=['userId', 'rating', 'movieId'],
)

#### 크롤링한 결과와 추가된 5명 합치기

In [26]:
# Stack the crawled ratings and the hand-made rows into one frame with a
# fresh 0..n-1 index.
result = pd.concat([newrating, sub_df], ignore_index=True)
result

Unnamed: 0,userId,rating,movieId
0,airf,2,136900
1,airf,10,163788
2,airf,10,174065
3,nanw,10,154667
4,nanw,10,136900
5,nanw,10,145335
6,nanw,9,120141
7,nanw,9,54704
8,zxcv,7,113351
9,zxcv,10,86507


In [31]:
# Add three more ratings by assigning to explicit index labels.
# NOTE(review): the labels 5121-5123 are hard-coded. They only append when
# `result` holds exactly 5121 rows (labels 0-5120); with more rows these
# assignments overwrite existing entries, with fewer they leave a gap in the
# index — confirm the row count before re-running on new data.
result.loc[5121] = ['zxcv',1,174065]
result.loc[5122] = ['kktw',7,47701]
result.loc[5123] = ['kktw',6,62328]
# Show the three rows that were just added.
result.tail(3)

Unnamed: 0,userId,rating,movieId
5121,zxcv,1,174065
5122,kktw,7,47701
5123,kktw,6,62328


#### final_rating.csv 파일로 저장

In [32]:
# Write the combined ratings to disk. No `index` argument is given, so the
# row index is written as an extra, unnamed first column.
result.to_csv('./data/final_rating.csv')