In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

## **Accessing web and scraping**
1. Access [allkpop](https://www.allkpop.com/) through Selenium
2. Scroll until Selenium reaches the 100th scrolls in the web
3. Load the html and scrape

In [2]:
# setup for accessing allkpop.com
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

driver.get("https://www.allkpop.com/")

In [3]:
# scroll 100 times with a javascript script

tic = time.time()
number_of_scrolls = 100
for _ in range(number_of_scrolls):
    # Scroll down to the bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    time.sleep(2)

toc = time.time()
runtime = toc - tic
print(f"Time taken: {runtime:.2f} sec")

Time taken: 205.57 sec


In [4]:
# scrape by class tags with bf4

html = driver.page_source  # the entire html loaded by the scrolls 
soup = BeautifulSoup(html, 'html.parser')

tic = time.time()

titles = soup.find_all(class_='title')
info = soup.find_all(class_="info")
categories = soup.find_all(class_="category button cate_btn")

toc = time.time()

runtime = toc - tic
print(f"Time taken: {runtime:.2f} sec")

Time taken: 1.19 sec


## **Preprocessing raw data scraped from allkpop.com**
1. date conversions
2. allocating data to the corresponding columns via Pandas

In [5]:
# just in case, deep copy the scraped data
title_lst = [title.text.strip() for title in titles]
info_lst = [info_.text.strip() for info_ in info]
cat_lst = [cat.text.strip() for cat in categories]

title_copy = title_lst
info_copy = info_lst
cat_copy = cat_lst

In [6]:
# checking the length of each lst
print(f"length of info list:{len(info_copy)}")
print(f"length of title list:{len(title_copy)}")
print(f"length of category list:{len(cat_copy)}")

length of info list:1540
length of title list:1555
length of category list:1540


In [7]:
# pre-formatting before date conversion
def formatting_data(lst):
    temp_lst = lst
    
    for i in range(len(temp_lst)):
        temp_lst[i] = temp_lst[i].split()
        
        if len(temp_lst[i]) == 6:
            joined_lst = [' '.join(temp_lst[i][1:4])]
            temp_lst[i][1:4] = joined_lst
        
        if ('hour' in temp_lst[i][1]) or ('minute' in temp_lst[i][1]):
            temp_lst[i][1] = '3/18/2024'
    
    return temp_lst

In [8]:
pre_format_lst = formatting_data(info_copy)

In [9]:
# coverting dates to '%m/%d/%Y' format
def convert_date_format(entry):
    try:
        if len(entry[1].split(',')) == 2:  # Likely in "Day, Month DD, YYYY" format
            date_str = ' '.join(entry[1:5])  # Reconstruct the date string
            date_obj = datetime.strptime(date_str, "%A, %B %d, %Y")
            new_date = date_obj.strftime('%m/%d/%Y')
            return [entry[0], new_date] + entry[5:]
    except ValueError:
        pass  
    
    start_date = datetime.now()
    
    if 'days ago' in entry[1]:
        days_ago = int(entry[1].split()[0])
        actual_date = start_date - timedelta(days=days_ago)
        return [entry[0], actual_date.strftime('%m/%d/%Y')] + entry[2:]
    elif 'day ago' in entry[1]:
        actual_date = start_date - timedelta(days=1)
        return [entry[0], actual_date.strftime('%m/%d/%Y')] + entry[2:]
    
    return entry

In [10]:
# Process the data to standardize date format
processed_data = [convert_date_format(entry) for entry in pre_format_lst]

In [12]:
processed_data[1500]

['Sophie-Ha', '02/15/2024', '11', '6,388']

In [13]:
scraped_df = {
    'title': title_copy[:1540],
    'category': cat_copy}

In [14]:
# allocating all the elements in the nested list to key-value pairs

def process_insert_df(lst, dict_):
    temp_dict = dict_
    keys = ['author_name', 'dates', 'num comments', 'num views']
    # Use zip(*) to unpack and transpose lst
    for key, values in zip(keys, zip(*lst)):
        temp_dict[key] = list(values)
    return temp_dict

In [16]:
final_dict = process_insert_df(processed_data, scraped_df)

In [17]:
# putting all together as a DataFrame
processed_df = pd.DataFrame(final_dict)
processed_df

Unnamed: 0,title,category,author_name,dates,num comments,num views
0,Stray Kids' Seungmin delivers a perfect strike...,Misc,EunhaYi,3/18/2024,0,582
1,‘Chicken Nugget’ dominates integrated content ...,News,EunhaYi,3/18/2024,1,1327
2,"Singer Jung Joon Young, convicted in sex video...",News,EunhaYi,3/18/2024,32,5499
3,DAY6's latest hit 'Welcome to the Show' tops m...,News,EunhaYi,3/18/2024,9,1294
4,OMEGA X provides legal update on lawsuit again...,News,EunhaYi,3/18/2024,4,1124
...,...,...,...,...,...,...
1535,Pentagon's Wooseok takes to great outdoors in ...,News,Germaine-Jay,02/15/2024,2,1503
1536,ICHILLIN' reveal receipt for 'Feelin' Hot' com...,News,Germaine-Jay,02/15/2024,1,1541
1537,n.SSign remember the good times in 'Happy &' MV,Music Video,Germaine-Jay,02/15/2024,0,1481
1538,iKON's DK (Donghyuk) gets his heart stolen in ...,Music Video,Germaine-Jay,02/15/2024,4,2938
