In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from bing_image_downloader import downloader

## Scraping

In [21]:
# Scrape multiple pages at once
# For every search-result page, collect the actor names, their heart counts and
# their thumbnail links into three parallel lists (same position = same actor).
url_base = 'https://mydramalist.com/search?adv=people&na=3&gd=77&so=popular&page='

heart_list = []
name_list = []
pic_list = []

# Pages 1..49 are fetched, one request per page.
for page_no in range(1, 50):
    # A timeout keeps a stalled connection from hanging the cell forever, and
    # raise_for_status() stops on an HTTP error instead of silently parsing an
    # error page that would contribute no rows.
    req = requests.get(url_base + str(page_no), timeout=30)
    req.raise_for_status()

    # transform html to BeautifulSoup object
    soup = BeautifulSoup(req.text, 'lxml')

    # get name list on the page
    names = soup.find_all("h6", class_="text-primary title")
    page_names = [name.text for name in names]

    # extract the list of hearts on the page
    hearts = soup.find_all("span", class_="like-cntb")
    # Extract numbers as strings (digits with thousands separators, e.g. "14,094")
    numbers = [re.findall(r'[0-9,]+', str(heart.text))[0] for heart in hearts]
    # Convert strings to integers after dropping the separators
    page_hearts = [int(num.replace(',', '')) for num in numbers]

    # extract pic links
    pic_links = soup.find_all("img", class_="img-responsive cover lazy")
    # Extract pic links as strings. The match is non-greedy so that a tag
    # containing more than one ".jpg" yields its first URL rather than one
    # string spanning everything up to the last ".jpg".
    page_pics = [re.findall(r'https://.*?\.jpg', str(link))[0] for link in pic_links]

    # The lists are later joined column-wise by position, so a page where the
    # three counts differ would shift every following row; make that visible.
    if not (len(page_names) == len(page_hearts) == len(page_pics)):
        print(
            f"Warning: page {page_no} yielded {len(page_names)} names, "
            f"{len(page_hearts)} heart counts and {len(page_pics)} pictures"
        )

    name_list.extend(page_names)
    heart_list.extend(page_hearts)
    pic_list.extend(page_pics)

In [22]:
# check that the heart counts were extracted (displays the whole heart_list)
heart_list

[14094,
 21065,
 17145,
 14810,
 14528,
 12727,
 9482,
 5966,
 3273,
 3978,
 6763,
 10962,
 12772,
 6032,
 4066,
 10158,
 14887,
 11591,
 5136,
 7581,
 3270,
 7217,
 12598,
 11705,
 4223,
 7924,
 5083,
 4341,
 6512,
 3042,
 7723,
 3378,
 6518,
 2787,
 4363,
 5273,
 4273,
 2816,
 7169,
 4619,
 4521,
 3428,
 2310,
 1546,
 5189,
 3588,
 1576,
 4006,
 5097,
 2699,
 4095,
 4014,
 4085,
 1419,
 1811,
 2532,
 3889,
 2116,
 5256,
 2584,
 1958,
 1914,
 1007,
 1666,
 4505,
 1365,
 1626,
 1161,
 2434,
 933,
 1904,
 1058,
 485,
 4915,
 1537,
 2225,
 880,
 779,
 2245,
 7838,
 4440,
 4315,
 1096,
 3699,
 2829,
 1542,
 1082,
 2170,
 781,
 3029,
 1592,
 575,
 2690,
 1072,
 1105,
 4468,
 1104,
 2019,
 4902,
 1322,
 2059,
 1073,
 2512,
 1976,
 6409,
 549,
 1033,
 1993,
 1063,
 2152,
 1852,
 711,
 2416,
 606,
 3213,
 1081,
 1113,
 1782,
 2584,
 249,
 1240,
 816,
 436,
 1117,
 1070,
 689,
 1209,
 1467,
 839,
 767,
 425,
 1756,
 680,
 827,
 1439,
 878,
 263,
 920,
 484,
 313,
 1011,
 1345,
 1191,
 1149,
 1

In [23]:
# check that the names were extracted (displays the whole name_list)
name_list

['Lee Min Ho',
 'Lee Jong Suk',
 'Ji Chang Wook',
 'Song Joong Ki',
 'Kim Soo Hyun',
 'Lee Joon Gi',
 'Kim Woo Bin',
 'Kim Myung Soo',
 'Kim Hyun Joong',
 'Jang Geun Suk',
 'So Ji Sub',
 'Park Bo Gum',
 'Nam Joo Hyuk',
 'Park Hae Jin',
 'Jung Il Woo',
 'Seo In Guk',
 'Park Seo Joon',
 'Gong Yoo',
 'Kim Bum',
 'Yoo Seung Ho',
 'Rain',
 'Ji Sung',
 'Lee Dong Wook',
 'Park Hyung Sik',
 'Sung Hoon',
 'Lee Seung Gi',
 'Choi Min Ho',
 'Lee Hyun Woo',
 'Doh Kyung Soo',
 'Jung Yong Hwa',
 'Hyun Bin',
 'Zo In Sung',
 'Kim Ji Soo',
 'Joo Won',
 'Jo Jung Suk',
 'Yook Sung Jae',
 'Jang Hyuk',
 'Lee Kwang Soo',
 'Seo Kang Joon',
 'Namkoong Min',
 'Ahn Jae Hyun',
 'Choi Si Won',
 'Lee Won Keun',
 'Park Yoo Chun',
 'Ok Taec Yeon',
 'Park Chan Yeol',
 'Noh Min Woo',
 'Jung Kyung Ho',
 'Kang Ha Neul',
 'Song Seung Heon',
 'Yoon Shi Yoon',
 'Kim Young Kwang',
 'Yoo Ah In',
 'Joo Sang Wook',
 'Kim Jae Joong',
 'Sung Joon',
 'Choi Jin Hyuk',
 'Yeon Woo Jin',
 'Yeo Jin Goo',
 'Lee Hong Ki',
 'Hong Jong Hyu

In [43]:
# number of scraped names, before any de-duplication
len(name_list)

980

In [24]:
# check that the links to the actor pictures were extracted (displays the whole pic_list)
pic_list

['https://i.mydramalist.com/kEpQwv.jpg',
 'https://i.mydramalist.com/eLBmQ_5v.jpg',
 'https://i.mydramalist.com/ZyyEJ_5v.jpg',
 'https://i.mydramalist.com/1kymd_5v.jpg',
 'https://i.mydramalist.com/WonJO_5v.jpg',
 'https://i.mydramalist.com/67r8d_5v.jpg',
 'https://i.mydramalist.com/RbYgo_5v.jpg',
 'https://i.mydramalist.com/w1l7J_5v.jpg',
 'https://i.mydramalist.com/XKplxv.jpg',
 'https://i.mydramalist.com/rbpd2v.jpg',
 'https://i.mydramalist.com/p0pErv.jpg',
 'https://i.mydramalist.com/p3Qx8_5v.jpg',
 'https://i.mydramalist.com/250rk_5v.jpg',
 'https://i.mydramalist.com/kBpEr_5v.jpg',
 'https://i.mydramalist.com/qd202_5v.jpg',
 'https://i.mydramalist.com/j64Ry_5v.jpg',
 'https://i.mydramalist.com/j8meyv.jpg',
 'https://i.mydramalist.com/kwW2w_5v.jpg',
 'https://i.mydramalist.com/vZxrW_5v.jpg',
 'https://i.mydramalist.com/BDveR_5v.jpg',
 'https://i.mydramalist.com/Rm65g_5v.jpg',
 'https://i.mydramalist.com/E55lEp_5v.jpg',
 'https://i.mydramalist.com/jQQJvv_5v.jpg',
 'https://i.mydrama

In [26]:
# Rewrite "v.jpg" to "c.jpg" in every scraped link; per the original note, the
# "c" variant of a picture URL is the higher-resolution one.
pic_list_highres = list(
    map(lambda thumb_url: thumb_url.replace('v.jpg', 'c.jpg'), pic_list)
)
pic_list_highres

['https://i.mydramalist.com/kEpQwc.jpg',
 'https://i.mydramalist.com/eLBmQ_5c.jpg',
 'https://i.mydramalist.com/ZyyEJ_5c.jpg',
 'https://i.mydramalist.com/1kymd_5c.jpg',
 'https://i.mydramalist.com/WonJO_5c.jpg',
 'https://i.mydramalist.com/67r8d_5c.jpg',
 'https://i.mydramalist.com/RbYgo_5c.jpg',
 'https://i.mydramalist.com/w1l7J_5c.jpg',
 'https://i.mydramalist.com/XKplxc.jpg',
 'https://i.mydramalist.com/rbpd2c.jpg',
 'https://i.mydramalist.com/p0pErc.jpg',
 'https://i.mydramalist.com/p3Qx8_5c.jpg',
 'https://i.mydramalist.com/250rk_5c.jpg',
 'https://i.mydramalist.com/kBpEr_5c.jpg',
 'https://i.mydramalist.com/qd202_5c.jpg',
 'https://i.mydramalist.com/j64Ry_5c.jpg',
 'https://i.mydramalist.com/j8meyc.jpg',
 'https://i.mydramalist.com/kwW2w_5c.jpg',
 'https://i.mydramalist.com/vZxrW_5c.jpg',
 'https://i.mydramalist.com/BDveR_5c.jpg',
 'https://i.mydramalist.com/Rm65g_5c.jpg',
 'https://i.mydramalist.com/E55lEp_5c.jpg',
 'https://i.mydramalist.com/jQQJvv_5c.jpg',
 'https://i.mydrama

In [27]:
# make a dataframe of the above 3 lists
# The columns are paired purely by position, so the lists must be equally long.
# pd.concat(axis=1) would otherwise pad the shorter columns with NaN without
# raising, hiding a scrape that lost or gained entries in one of the lists.
if not (len(name_list) == len(heart_list) == len(pic_list_highres)):
    raise ValueError(
        "name/hearts/pic lists differ in length: "
        f"{len(name_list)}, {len(heart_list)}, {len(pic_list_highres)}"
    )

# one single-column frame per list
df1 = pd.DataFrame(name_list, columns=['name'])
df2 = pd.DataFrame(heart_list, columns=['hearts'])
df3 = pd.DataFrame(pic_list_highres, columns=['pic'])

# place the three columns side by side: name, hearts, pic
df980 = pd.concat([df1, df2, df3], axis=1)

In [28]:
# display the combined dataframe (columns: name, hearts, pic)
df980

Unnamed: 0,name,hearts,pic
0,Lee Min Ho,14094,https://i.mydramalist.com/kEpQwc.jpg
1,Lee Jong Suk,21065,https://i.mydramalist.com/eLBmQ_5c.jpg
2,Ji Chang Wook,17145,https://i.mydramalist.com/ZyyEJ_5c.jpg
3,Song Joong Ki,14810,https://i.mydramalist.com/1kymd_5c.jpg
4,Kim Soo Hyun,14528,https://i.mydramalist.com/WonJO_5c.jpg
...,...,...,...
975,Ajoo,9,https://i.mydramalist.com/gO16vc.jpg
976,Lee Jung Hoon,1,https://i.mydramalist.com/rqYyyc.jpg
977,Son Seong Jun,26,https://i.mydramalist.com/ZeDx8_5c.jpg
978,Cha Hyun Woo,4,https://i.mydramalist.com/WrEjRc.jpg


In [50]:
# Count repeated names: True marks a row whose name already appeared in an
# earlier row, False marks the first occurrence of a name.
(
    df980.loc[:, 'name']
    .duplicated(keep='first')
    .value_counts()
)

False    965
True      15
Name: name, dtype: int64

In [51]:
# make a new dataframe with one row per name, keeping the entry with the most hearts.
# Hearts are only generally in descending order, so the rows are ranked by
# hearts explicitly before de-duplicating instead of relying on scrape order.
# mergesort is stable, so entries with equal hearts keep their earlier row.
# sort_index() then restores the original row order of df980.
df965 = (
    df980
    .sort_values('hearts', ascending=False, kind='mergesort')
    .drop_duplicates(subset=['name'], keep='first')
    .sort_index()
)
df965

Unnamed: 0,name,hearts,pic
0,Lee Min Ho,14094,https://i.mydramalist.com/kEpQwc.jpg
1,Lee Jong Suk,21065,https://i.mydramalist.com/eLBmQ_5c.jpg
2,Ji Chang Wook,17145,https://i.mydramalist.com/ZyyEJ_5c.jpg
3,Song Joong Ki,14810,https://i.mydramalist.com/1kymd_5c.jpg
4,Kim Soo Hyun,14528,https://i.mydramalist.com/WonJO_5c.jpg
...,...,...,...
975,Ajoo,9,https://i.mydramalist.com/gO16vc.jpg
976,Lee Jung Hoon,1,https://i.mydramalist.com/rqYyyc.jpg
977,Son Seong Jun,26,https://i.mydramalist.com/ZeDx8_5c.jpg
978,Cha Hyun Woo,4,https://i.mydramalist.com/WrEjRc.jpg


In [None]:
# Save the de-duplicated dataframe to a csv file so that later cells can continue from it (uncomment to run)
# df965.to_csv('data/korean_male_actors (965).csv', index=False)

In [3]:
# Load the saved actor table from the csv file; this rebinds df965 to the file's contents
df965 = pd.read_csv('data/korean_male_actors (965).csv')

In [4]:
# download the pictures of actors
# Each picture is written to the current working directory as "<name>.jpg".
for pic_url, actor_name in zip(df965.pic, df965.name):
    # timeout: one stalled connection must not hang the whole loop
    response = requests.get(pic_url, timeout=30)
    if not response.ok:
        # Skip failed requests instead of saving an error page as a .jpg file
        print(f"Skipped {actor_name}: HTTP {response.status_code} for {pic_url}")
        continue
    # the with-block closes the file handle even if the write fails
    with open(f"{actor_name}.jpg", "wb") as image_file:
        image_file.write(response.content)

In [33]:
# print a list of actor pictures that have been downloaded

# import os

# def get_filenames_in_directory(directory_path):
#     filenames = os.listdir(directory_path)
#     return filenames

# file_list = get_filenames_in_directory("pics")
# print(file_list)

['Ahn Bo Hyun.jpg', 'Ahn Do Kyu.jpg', 'Ahn Hyung Joon.jpg', 'Ahn Jae Hong.jpg', 'Ahn Jae Hyo.jpg', 'Ahn Jae Hyun.jpg', 'Ahn Jae Min.jpg', 'Ahn Jae Mo.jpg', 'Ahn Jae Wook.jpg', 'Ahn Ji Hoon.jpg', 'Ahn Jung Hoon.jpg', 'Ahn Kil Kang.jpg', 'Ahn Nae Sang.jpg', 'Ahn Se Ha.jpg', 'Ahn Suk Hwan.jpg', 'Ahn Sung Ki.jpg', 'Ahn Woo Yeon.jpg', 'Ahn Yong Joon.jpg', 'Ajoo.jpg', 'Andy.jpg', 'B-Bomb.jpg', 'Bae Gun Woo.jpg', 'Bae Jin Woong.jpg', 'Bae Soo Bin.jpg', 'Bae Sung Woo.jpg', 'Bae Yong Joon.jpg', 'Baek  Yoon Shik.jpg', 'Baek Chul Min.jpg', 'Baek Do Bin.jpg', 'Baek Hyun.jpg', 'Baek Jong Min.jpg', 'Baek Jong Won.jpg', 'Baek Seung Do.jpg', 'Baek Seung Heon.jpg', 'Baek Seung Hwan.jpg', 'Baek Seung Hyun.jpg', 'Baek Sung Hyun.jpg', 'Bang Yong Guk.jpg', 'Bong Tae Kyu.jpg', 'Bu Bae.jpg', 'Byun Baek Hyun.jpg', 'Byun Hee Bong.jpg', 'Byun Joon Suk.jpg', 'Byun Woo Min.jpg', 'Byun Yo Han.jpg', 'Cha Bo Sung.jpg', 'Cha Do Jin.jpg', 'Cha Eun Woo.jpg', 'Cha Hak Yeon.jpg', 'Cha Hyun Woo.jpg', 'Cha In Pyo.jpg', 'Ch

In [41]:
# len(file_list)

965

In [34]:
# file_list_name_only = [url.replace('.jpg', '') for url in file_list]
# file_list_name_only

['Ahn Bo Hyun',
 'Ahn Do Kyu',
 'Ahn Hyung Joon',
 'Ahn Jae Hong',
 'Ahn Jae Hyo',
 'Ahn Jae Hyun',
 'Ahn Jae Min',
 'Ahn Jae Mo',
 'Ahn Jae Wook',
 'Ahn Ji Hoon',
 'Ahn Jung Hoon',
 'Ahn Kil Kang',
 'Ahn Nae Sang',
 'Ahn Se Ha',
 'Ahn Suk Hwan',
 'Ahn Sung Ki',
 'Ahn Woo Yeon',
 'Ahn Yong Joon',
 'Ajoo',
 'Andy',
 'B-Bomb',
 'Bae Gun Woo',
 'Bae Jin Woong',
 'Bae Soo Bin',
 'Bae Sung Woo',
 'Bae Yong Joon',
 'Baek  Yoon Shik',
 'Baek Chul Min',
 'Baek Do Bin',
 'Baek Hyun',
 'Baek Jong Min',
 'Baek Jong Won',
 'Baek Seung Do',
 'Baek Seung Heon',
 'Baek Seung Hwan',
 'Baek Seung Hyun',
 'Baek Sung Hyun',
 'Bang Yong Guk',
 'Bong Tae Kyu',
 'Bu Bae',
 'Byun Baek Hyun',
 'Byun Hee Bong',
 'Byun Joon Suk',
 'Byun Woo Min',
 'Byun Yo Han',
 'Cha Bo Sung',
 'Cha Do Jin',
 'Cha Eun Woo',
 'Cha Hak Yeon',
 'Cha Hyun Woo',
 'Cha In Pyo',
 'Cha Kwang Soo',
 'Cha Seung Joon',
 'Cha Seung Won',
 'Cha Sun Woo',
 'Cha Tae Hyun',
 'Chae Sang Woo',
 'Chang Jae',
 'Chang Jo',
 'Chang Min',
 'Chani',


In [44]:
# Check if any actors' pictures have not been downloaded

# not_downloaded = []
# for ind in name_list:
#     if ind not in file_list_name_only:
#         not_downloaded.append(ind)
# not_downloaded

['Lee Min Ho',
 'Lee Jong Suk',
 'Ji Chang Wook',
 'Song Joong Ki',
 'Kim Soo Hyun',
 'Lee Joon Gi',
 'Kim Woo Bin',
 'Kim Myung Soo',
 'Kim Hyun Joong',
 'Jang Geun Suk',
 'So Ji Sub',
 'Park Bo Gum',
 'Nam Joo Hyuk',
 'Park Hae Jin',
 'Jung Il Woo',
 'Seo In Guk',
 'Park Seo Joon',
 'Gong Yoo',
 'Kim Bum',
 'Yoo Seung Ho',
 'Rain',
 'Ji Sung',
 'Lee Dong Wook',
 'Park Hyung Sik',
 'Sung Hoon',
 'Lee Seung Gi',
 'Choi Min Ho',
 'Lee Hyun Woo',
 'Doh Kyung Soo',
 'Jung Yong Hwa',
 'Hyun Bin',
 'Zo In Sung',
 'Kim Ji Soo',
 'Joo Won',
 'Jo Jung Suk',
 'Yook Sung Jae',
 'Jang Hyuk',
 'Lee Kwang Soo',
 'Seo Kang Joon',
 'Namkoong Min',
 'Ahn Jae Hyun',
 'Choi Si Won',
 'Lee Won Keun',
 'Park Yoo Chun',
 'Ok Taec Yeon',
 'Park Chan Yeol',
 'Noh Min Woo',
 'Jung Kyung Ho',
 'Kang Ha Neul',
 'Song Seung Heon',
 'Yoon Shi Yoon',
 'Kim Young Kwang',
 'Yoo Ah In',
 'Joo Sang Wook',
 'Kim Jae Joong',
 'Sung Joon',
 'Choi Jin Hyuk',
 'Yeon Woo Jin',
 'Yeo Jin Goo',
 'Lee Hong Ki',
 'Hong Jong Hyu