In [2]:
# 한국 전체 작품 데이터 수집
import time
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [6]:
# 1. Collect the title and star rating of up to 80 webtoons per weekday.
#    The position inside each weekday list is later used as the rank.

URL = 'https://comic.naver.com/webtoon/weekdayList?week='
weeks = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]
MAX_TITLES_PER_DAY = 80  # only the first 80 list entries of each weekday are kept

month_titles = []  # one list of titles per weekday, in `weeks` order
star_means = []    # flat list of ratings, in the same weekday/list order

for week in weeks:
    # A timeout plus a status check makes network problems fail loudly
    # instead of hanging the kernel or parsing an error page.
    response = requests.get(URL + week, timeout=10)
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    titles = soup.select("#content > div.list_area.daily_img > ul > li > dl > dt > a")
    stars = soup.select("#content > div.list_area.daily_img > ul > li > dl > dd:nth-child(3) > div > strong")

    day_titles = [title.text for title in titles[:MAX_TITLES_PER_DAY]]
    day_star_means = [float(star.text) for star in stars[:MAX_TITLES_PER_DAY]]  # convert to float

    # Titles and ratings are selected independently; if the counts differ the
    # flat `star_means` list would silently shift against the title rows.
    if len(day_titles) != len(day_star_means):
        raise ValueError(
            f"{week}: found {len(day_titles)} titles but {len(day_star_means)} ratings"
        )

    month_titles.append(day_titles)
    star_means.extend(day_star_means)

In [7]:
# Sanity check: show how many titles were collected for each weekday.
day_counts = [(day, len(day_titles)) for day, day_titles in zip(weeks, month_titles)]
for day, count in day_counts:
    print(f"{day}: {count}")

mon: 80
tue: 80
wed: 80
thu: 80
fri: 80
sat: 80
sun: 80


In [8]:
# 2. Collect the main genre, sub-genre, and heart (like) count of every listed title.
#    Each title's detail page is opened from the weekday list, parsed, and then
#    the browser navigates back to the list.

URL = 'https://comic.naver.com/webtoon/weekdayList?week='
large_genre_list = []; small_genre_list = []; heart_list = []

# Selenium 4 takes the driver path through a Service object; passing it
# positionally is deprecated and rejected by newer releases.
driver = webdriver.Chrome(service=Service('chromedriver.exe'))
try:
    # Visit exactly as many entries as were collected per weekday in step 1,
    # so the three lists stay aligned with the title rows.
    for week, day_titles in zip(weeks, month_titles):
        driver.get(URL + week)
        time.sleep(1)

        for i in range(1, len(day_titles) + 1):
            # Open the i-th title of this weekday's list.
            page = driver.find_element(
                by=By.CSS_SELECTOR,
                value=f"#content > div.list_area.daily_img > ul > li:nth-child({i}) > div > a",
            )
            page.click()
            time.sleep(0.5)

            # Parse the detail page that was just opened.
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            # Genre text such as '스토리, 액션' becomes ['스토리', '액션'].
            genre = soup.find('span', {'class': 'genre'}).text.replace(",", "").split()
            large_genre_list.append(genre[0])
            small_genre_list.append(genre[1])

            # Heart count such as '99,999+' becomes the integer 99999.
            heart = int(soup.find('em', {'class': 'u_cnt'}).text.replace("+", "").replace(",", ""))
            heart_list.append(heart)

            driver.back()
finally:
    # Always release the browser process, even if a page fails to parse.
    driver.quit()

  driver = webdriver.Chrome('chromedriver.exe')


In [9]:
# Number of genre entries collected so far (one per visited title page).
len(large_genre_list)

560

In [10]:
# 3. Build the DataFrame
## 3.1 Build the [title, weekday, rank] rows
# Rank is the 1-based position of the title inside its weekday list.
# The "..." and "(tng" fragments are stripped from titles
# (presumably leftovers of truncated list titles — TODO confirm).
row_data = [
    [title.replace("...", "").replace("(tng", ""), week, rank]
    for week, week_titles in zip(weeks, month_titles)
    for rank, title in enumerate(week_titles, start=1)
]

In [13]:
# Preview the first ten [title, weekday, rank] rows.
row_data[:10]

[['참교육', 'mon', 1],
 ['뷰티풀 군바리', 'mon', 2],
 ['퀘스트지상주의', 'mon', 3],
 ['장씨세가 호위', 'mon', 4],
 ['윈드브레이커', 'mon', 5],
 ['팔이피플', 'mon', 6],
 ['신화급 귀속 ', 'mon', 7],
 ['버림받은 왕녀', 'mon', 8],
 ['퍼니게임', 'mon', 9],
 ['앵무살수', 'mon', 10]]

In [14]:
# 3.2 Assemble the DataFrame: the row data plus one column per scraped list.
df = pd.DataFrame(row_data, columns=['title', 'week', 'rank']).assign(
    star_mean=star_means,
    large_genre=large_genre_list,
    small_genre=small_genre_list,
    heart=heart_list,
)

In [15]:
# Column dtypes and non-null counts of the assembled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        560 non-null    object 
 1   week         560 non-null    object 
 2   rank         560 non-null    int64  
 3   star_mean    560 non-null    float64
 4   large_genre  560 non-null    object 
 5   small_genre  560 non-null    object 
 6   heart        560 non-null    int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 30.8+ KB


In [16]:
# (rows, columns) of the assembled frame.
df.shape

(560, 7)

In [17]:
# Distinct values found in the main-genre column.
df["large_genre"].unique()

array(['스토리', '에피소드', '옴니버스'], dtype=object)

In [18]:
# Distinct values found in the sub-genre column.
df["small_genre"].unique()

array(['액션', '드라마', '무협/사극', '스포츠', '판타지', '로맨스', '스릴러', '일상', '감성', '개그'],
      dtype=object)

In [19]:
# Distinct weekday codes present in the frame.
df["week"].unique()

array(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'], dtype=object)

In [20]:
# 4. Remove duplicates
## 4.1 Inspect duplicates: rows whose title already appeared in an earlier row
df[df.duplicated(['title'])]

Unnamed: 0,title,week,rank,star_mean,large_genre,small_genre,heart
250,호랑신랑뎐,thu,11,9.98,스토리,판타지,1033
256,루루라라 우리,thu,17,9.93,에피소드,일상,297
273,수영만화일기,thu,34,9.9,에피소드,일상,887
321,대학원 탈출일지,fri,2,9.97,에피소드,일상,1417
334,삼국지톡,fri,15,9.9,에피소드,무협/사극,99999
351,웅크,fri,32,9.97,에피소드,감성,312
372,쿠쿠쿠쿠,fri,53,9.89,스토리,개그,99
401,놓지마 정신줄,sat,2,9.97,에피소드,개그,545
404,먹는 인생,sat,5,9.96,옴니버스,일상,1953
406,여고생 드래곤,sat,7,9.97,스토리,판타지,1564


In [25]:
# Number of rows whose title repeats an earlier row.
int(df.duplicated(['title']).sum())

16

In [21]:
## 4.2 Build df_undp: keep the first row of every title and renumber the index

df_undp = df.drop_duplicates(subset=['title']).reset_index(drop=True)

In [23]:
# (rows, columns) after dropping duplicate titles.
df_undp.shape

(544, 7)

In [27]:
# 5. Save
# Writes the de-duplicated frame to an Excel file in the working directory.
df_undp.to_excel('korea.xlsx')

In [26]:
# Writes the same de-duplicated frame as CSV in the working directory.
df_undp.to_csv('korea.csv')

In [5]:
# 6. isnovel ratio

# NOTE(review): this rebinds `df`, replacing the scraped frame built above with
# the contents of data/korea_add.csv (presumably the saved data with an added
# `isnovel` column — confirm how that file is produced).
df = pd.read_csv("data/korea_add.csv")

In [6]:
# Count the isnovel values among rows whose rank is 10 or greater.
df.loc[df["rank"] >= 10, "isnovel"].value_counts()

0    346
1    139
Name: isnovel, dtype: int64

In [28]:
# etc1. Automatically search each title on Naver Series
#       (used to speed up the manual check of whether a webtoon is based on a novel).

dic_list = df_undp["title"].tolist()

browser = webdriver.Chrome()
try:
    browser.get('https://series.naver.com/search/search.series?t=all&fs=novel&q=')

    for title in dic_list:
        time.sleep(0.5)
        elem = browser.find_element(by=By.ID, value='ac_input1')
        # Clear any text left in the box after navigating back, so the
        # previous title is not prepended to the new query.
        elem.clear()
        elem.send_keys(title)

        elem = browser.find_element(by=By.CSS_SELECTOR, value="#ac_form1 > fieldset > button")
        elem.click()

        # Keep the result page visible for a moment before going back.
        time.sleep(3)
        browser.back()
finally:
    # Release the browser process even if the loop is interrupted.
    browser.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=108.0.5359.99)
Stacktrace:
Backtrace:
	(No symbol) [0x00D8F243]
	(No symbol) [0x00D17FD1]
	(No symbol) [0x00C0D04D]
	(No symbol) [0x00BF2D7A]
	(No symbol) [0x00C5BE7B]
	(No symbol) [0x00C6C196]
	(No symbol) [0x00C58386]
	(No symbol) [0x00C3163C]
	(No symbol) [0x00C3269D]
	GetHandleVerifier [0x01029A22+2655074]
	GetHandleVerifier [0x0101CA24+2601828]
	GetHandleVerifier [0x00E38C0A+619850]
	GetHandleVerifier [0x00E37830+614768]
	(No symbol) [0x00D205FC]
	(No symbol) [0x00D25968]
	(No symbol) [0x00D25A55]
	(No symbol) [0x00D3051B]
	BaseThreadInitThunk [0x7673FEF9+25]
	RtlGetAppContainerNamedObjectPath [0x77457BBE+286]
	RtlGetAppContainerNamedObjectPath [0x77457B8E+238]


In [31]:
# etc2. Search each title on the webtoon site and collect its episode info.

# Only the first two titles are processed here (small trial run).
dic_list = df_undp["title"].head(2).tolist()
ep_list = []  # one entry per title: the scraped text, or 0 when nothing was found

browser = webdriver.Chrome()
try:
    browser.get('https://comic.naver.com/webtoon/weekdayList?week=mon')

    for title in dic_list:
        time.sleep(0.5)
        elem = browser.find_element(by=By.ID, value='gnb.keyword')
        # Clear any text left in the box after navigating back.
        elem.clear()
        elem.send_keys(title)

        elem = browser.find_element(by=By.CSS_SELECTOR, value="#search_bar_button > span")
        elem.click()

        # Give the result page time to load before reading it.
        time.sleep(3)

        # Parse the page the browser is showing now; a `soup` left over from an
        # earlier cell would describe a different page.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        ep_tag = soup.select_one(
            "#content > div:nth-child(2) > ul > li > ul > li:nth-child(3) > em"
        )
        # A missing element is the only expected failure, so handle it
        # explicitly rather than hiding every error behind a bare except.
        ep = ep_tag.get_text() if ep_tag is not None else 0
        ep_list.append(ep)

        browser.back()
finally:
    # Release the browser process even if a lookup fails.
    browser.quit()