# Content Crawler

This crawler will be the next step of `list_crawler.ipynb`.

When given a csv file, such as `kpop_agenda/Step2/DCInside/dcinside_마이카_list.csv`, it will crawl through all the URLs in the `URL` column of the csv file, then store each article's contents in its own text file.

The folder location where the text files will be stored is `kpop_agenda/Step2/DCInside/Articles`.

Each article is saved as `{keyword}-YYYYMMDD-HHMM-{number}.txt`. The trailing number distinguishes articles that share the same keyword and posting minute: it is usually 1, but it increases when multiple articles were posted within the same minute.

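Also, if there are multiple articles with the exact same title within 10 minutes of each other, we consider that to be one person "spamming" across many galleries, and we ignore all of them except the first one. (Note: the code below currently uses a 3-minute threshold rather than 10 minutes; one of the two should be adjusted so they agree.)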

Lastly, we update the csv file `kpop_agenda/Step2/DCInside/dcinside_마이카_list.csv` by adding a `location` column at the end. It holds the path of the saved article text file, e.g. `kpop_agenda/Step2/DCInside/Articles/마이카-20251228-0112-1.txt`.

In [None]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import os
import time
from datetime import datetime, timedelta

In [None]:
# Upper bound on the number of article requests in flight at once.
CONCURRENT_REQUESTS: int = 10

# Folder that receives one text file per crawled article.
OUTPUT_DIR: str = 'C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step2/DCInside/Articles'

# CSV whose `URL` column lists the articles to crawl; it is rewritten in place
# at the end of the run.
INPUT_CSV_PATH: str = 'C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step2/DCInside/dcinside_마이카_list.csv'

In [None]:
# Request headers sent with every article fetch (a desktop Chrome User-Agent).
HEADERS = {}
HEADERS['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

# Create the article folder up front so the crawler can write into it;
# an already-existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _extract_article_text(html):
    """Return the cleaned article body from page HTML, or None if no content container is found."""
    soup = BeautifulSoup(html, 'html.parser')

    # Primary container first, then the fallback container.
    content_div = soup.find('div', class_='write_div')
    if not content_div:
        content_div = soup.find('div', class_='thum-txt')
    if not content_div:
        return None

    content = content_div.get_text('\n', strip=True)

    # Drop the trailing "- dc official App" signature line when present.
    lines = content.split('\n')
    if lines and lines[-1].strip() == "- dc official App":
        lines.pop()
        content = '\n'.join(lines)
    return content


async def fetch_and_save(session, row, semaphore):
    """Download one article and write its text to disk.

    Args:
        session: aiohttp client session used to issue the GET request.
        row: Mapping-like object providing 'URL' (page to fetch) and
            'target_path' (file the article text is written to).
        semaphore: asyncio semaphore that bounds concurrent requests.

    Returns:
        True when the article text was written to 'target_path'; False on a
        non-200 status, a page without a content container, or any exception
        (which is printed rather than raised, so one bad URL cannot abort the
        whole crawl).
    """
    url = row['URL']
    file_path = row['target_path']
    # Passing a bare number as `timeout` is deprecated in aiohttp; use ClientTimeout.
    timeout = aiohttp.ClientTimeout(total=15)

    async with semaphore:
        try:
            async with session.get(url, headers=HEADERS, timeout=timeout) as response:
                if response.status != 200:
                    print(f"  [Error] Status {response.status}: {url}")
                    return False
                html = await response.text()

            # The response is released before parsing and writing.
            content = _extract_article_text(html)
            if content is None:
                print(f"  [Warning] No content found: {url}")
                return False

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)
            return True
        except Exception as e:
            # Include the exception type: timeouts have an empty str() and
            # would otherwise be logged with no explanation.
            print(f"  [Exception] {url}: {type(e).__name__}: {e}")
            return False

async def main():
    """Crawl every non-spam article listed in the input CSV and record where each was saved.

    Steps:
      1. Load INPUT_CSV_PATH, parse the `time` column and sort oldest-first.
      2. Skip an article whose title matches an already-kept article posted
         within the spam window before it.
      3. Give each kept row a `{keyword}-YYYYMMDD-HHMM-{n}.txt` path under
         OUTPUT_DIR, where n counts articles sharing keyword and minute.
      4. Download the kept articles concurrently through fetch_and_save.
      5. Overwrite INPUT_CSV_PATH with the kept rows plus a `location` column.
         `location` is left blank for articles that failed to download, so it
         never points at a file that does not exist.

    Returns:
        None. Progress and results are reported with print().
    """
    if not os.path.exists(INPUT_CSV_PATH):
        print(f"Error: File not found at {INPUT_CSV_PATH}")
        return

    df = pd.read_csv(INPUT_CSV_PATH)
    print(f"Loaded {len(df)} rows. Pre-processing for spam and filenames...")

    # Filter spam & generate filenames
    df['dt'] = pd.to_datetime(df['time'], format='%Y.%m.%d %H:%M')
    df = df.sort_values('dt', ascending=True)

    # Might have to adjust this time.
    # NOTE(review): the notebook description says 10 minutes; confirm which is intended.
    spam_window = timedelta(minutes=3)

    valid_rows = []
    seen_titles = {}    # title -> timestamp of the last kept article with that title
    file_counters = {}  # (keyword, minute string) -> number of files assigned so far

    for _, row in df.iterrows():
        title = row['title']
        current_dt = row['dt']
        keyword = row['keyword']

        last_seen_dt = seen_titles.get(title)
        if last_seen_dt is not None and (current_dt - last_seen_dt) <= spam_window:
            continue
        seen_titles[title] = current_dt

        time_str = current_dt.strftime('%Y%m%d-%H%M')
        key = (keyword, time_str)
        file_counters[key] = file_counters.get(key, 0) + 1

        filename = f"{keyword}-{time_str}-{file_counters[key]}.txt"
        file_path = os.path.join(OUTPUT_DIR, filename)

        # `row` is a per-iteration copy, so adding keys does not touch `df`.
        row['target_path'] = file_path
        row['location'] = file_path
        valid_rows.append(row)

    print(f"Filtered down to {len(valid_rows)} valid articles. Starting crawl...")

    if not valid_rows:
        # Nothing to crawl; leave the CSV alone rather than overwriting it
        # with an empty, header-less file.
        print("No articles to crawl. CSV left unchanged.")
        return

    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)

    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(fetch_and_save(session, row, semaphore))
            for row in valid_rows
        ]

        try:
            # tqdm is optional; it only adds a progress bar.
            from tqdm.asyncio import tqdm
            results = await tqdm.gather(*tasks)
        except ImportError:
            results = await asyncio.gather(*tasks)
            print("Finished crawling.")

    # Only record a location for articles that were actually written to disk.
    for row, saved in zip(valid_rows, results):
        if not saved:
            row['location'] = None

    final_df = pd.DataFrame(valid_rows)
    # Helper columns are internal to this run and must not reach the CSV.
    final_df = final_df.drop(columns=['dt', 'target_path'], errors='ignore')

    final_df.to_csv(INPUT_CSV_PATH, index=False, encoding='utf-8-sig')
    print(f"\nCompleted. {sum(results)} articles successfully saved.")
    print(f"Updated CSV saved to {INPUT_CSV_PATH}.")

# Top-level `await` works here because the notebook already runs an event loop;
# in a plain Python script this would need `asyncio.run(main())` instead.
await main()

Loaded 230 rows. Pre-processing for spam and filenames...
Filtered down to 230 valid articles. Starting async crawl...


100%|██████████| 230/230 [00:46<00:00,  4.99it/s]


Completed! 230 articles successfully saved.
Updated CSV saved to C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step2/DCInside/dcinside_마이카_list.csv



