# List Crawler
This code aims to

1. Given a specific time span and keyword, retrieve the URLs of all matching articles on DCInside.

## URL format
A DC Inside search URL has the following form.

https://search.dcinside.com/post/p/(PAGE_NUMBER)/sort/latest/q/, followed by the UTF-8 encoding of the search keyword.

Each encoded byte is written as two hexadecimal digits prefixed by "." (where standard URL encoding would use "%").

For example, if we search "카리나", the URL is like this.

https://search.dcinside.com/post/p/(PAGE_NUMBER)/sort/latest/q/.EC.B9.B4.EB.A6.AC.EB.82.98

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import quote
from datetime import datetime


In [2]:
# Search term: used to build the search URL and written into every CSV row.
KEYWORD = "마이카"
# Time window of articles to keep; both bounds are inclusive.
START_DATE_STR = "2025-12-27-00-00"  # YYYY-MM-DD-HH-MM
END_DATE_STR = "2025-12-29-13-25"    # YYYY-MM-DD-HH-MM

# Output CSV filename; the keyword is embedded in the name.
OUTPUT_FILE = f"dcinside_{KEYWORD}_results.csv"

In [None]:
# HTTP headers sent with every search request: a desktop Chrome User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

def get_custom_encoded_keyword(keyword):
    """
    Converts '카리나' -> '.EC.B9.B4.EB.A6.AC.EB.82.98'
    Standard URL encode uses %, DC Inside uses . for some paths.

    The keyword is percent-encoded as UTF-8 and every '%' is then replaced
    by '.'. The result is meant to be used as a single URL path segment, so
    '/' is escaped as well (it becomes '.2F') instead of being passed
    through and splitting the path.

    Args:
        keyword: The search term as a ``str``.

    Returns:
        The encoded keyword string; an empty keyword yields ''.
    """
    # safe='' overrides quote()'s default of leaving '/' unescaped.
    # NOTE(review): a literal '.' in the keyword is still passed through
    # unchanged; confirm how the site tells it apart from the '.' prefix.
    encoded = quote(keyword, safe='').replace('%', '.')
    return encoded

def parse_date_arg(date_str):
    """Turn a 'YYYY-MM-DD-HH-MM' string into a naive datetime.

    Raises ValueError if the text does not match that layout.
    """
    layout = "%Y-%m-%d-%H-%M"
    parsed = datetime.strptime(date_str, layout)
    return parsed

def parse_article_date(date_text):
    """Turn an article timestamp such as '2025.12.28 10:41' into a naive datetime.

    Raises ValueError if the text does not match 'YYYY.MM.DD HH:MM'.
    """
    # Layout of the timestamp as it appears in the search result list.
    layout = "%Y.%m.%d %H:%M"
    parsed = datetime.strptime(date_text, layout)
    return parsed

In [None]:
def scrape_dcinside():
    """Crawl DC Inside search results for KEYWORD and save matches to a CSV.

    Requests the search result pages (with ``sort/latest``) starting at
    page 1 and writes one row ``[keyword, time, title, URL]`` to OUTPUT_FILE
    for every article whose timestamp lies between START_DATE_STR and
    END_DATE_STR (both bounds inclusive).

    The crawl ends when:
      * an article older than the start time is encountered,
      * a page contains no result items,
      * the server answers with a non-200 status code, or
      * the HTTP request fails or times out.

    Rows written before the crawl ends are kept in the file.
    """
    # Upper bound (seconds) for each HTTP request, so a stalled connection
    # cannot hang the crawl forever.
    request_timeout = 10

    start_dt = parse_date_arg(START_DATE_STR)
    end_dt = parse_date_arg(END_DATE_STR)

    encoded_keyword = get_custom_encoded_keyword(KEYWORD)

    with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['keyword', 'time', 'title', 'URL'])

        page = 1
        stop_scraping = False

        print(f"Starting scrape for '{KEYWORD}' from {START_DATE_STR} to {END_DATE_STR}...")

        while not stop_scraping:
            url = f"https://search.dcinside.com/post/p/{page}/sort/latest/q/{encoded_keyword}"
            print(f"Scanning Page {page}: {url}")

            # Only the network call is guarded; connection errors and
            # timeouts end the crawl with a message instead of a traceback.
            try:
                response = requests.get(url, headers=HEADERS, timeout=request_timeout)
            except requests.RequestException as e:
                print(f"An error occurred: {e}")
                break

            if response.status_code != 200:
                print(f"Error: Status code {response.status_code}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            result_list = soup.select('ul.sch_result_list > li')

            if not result_list:
                print("No more results found.")
                break

            for li in result_list:
                date_element = li.select_one('.link_dsc_txt.dsc_sub .date_time')
                if not date_element:
                    continue

                article_date_str = date_element.get_text(strip=True)
                try:
                    article_dt = parse_article_date(article_date_str)
                except ValueError:
                    print(f"Skipping date parse error: {article_date_str}")
                    continue

                # Newer than the window: skip it and keep scanning.
                if article_dt > end_dt:
                    continue

                # Older than the window: stop the whole crawl here.
                if article_dt < start_dt:
                    print(f"Reached date {article_dt}, which is older than start time. Stopping.")
                    stop_scraping = True
                    break

                link_tag = li.select_one('a.tit_txt')
                if not link_tag:
                    continue

                # A title anchor without an href has no URL to record; skip
                # that item rather than aborting the crawl.
                article_url = link_tag.get('href')
                if not article_url:
                    continue

                title = link_tag.get_text(strip=True)
                writer.writerow([KEYWORD, article_date_str, title, article_url])

            # Move to next page
            page += 1

            # Be polite to the server
            time.sleep(0.5)

    print("Scraping finished.")



In [5]:
# Start the crawl only when this code is run directly, not when it is imported.
if __name__ == "__main__":
    scrape_dcinside()

Starting scrape for '마이카' from 2025-12-27-00-00 to 2025-12-29-13-25...
Scanning Page 1: https://search.dcinside.com/post/p/1/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 2: https://search.dcinside.com/post/p/2/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 3: https://search.dcinside.com/post/p/3/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 4: https://search.dcinside.com/post/p/4/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 5: https://search.dcinside.com/post/p/5/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 6: https://search.dcinside.com/post/p/6/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 7: https://search.dcinside.com/post/p/7/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 8: https://search.dcinside.com/post/p/8/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 9: https://search.dcinside.com/post/p/9/sort/latest/q/.EB.A7.88.EC.9D.B4.EC.B9.B4
Scanning Page 10: https://search.dcinside.com/post/p/10/sort/late