# Import Libraries

In [8]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape

## KomikCast

In [1]:
def getdataurl(url):
    """Scrape one listing page and the detail page of every comic on it.

    Each ``div.list-update_item`` card on the listing page supplies the title,
    cover image, score and type. The first link inside the card is then
    fetched to read the synopsis, alternative title, release date, author and
    genres from the detail page.

    Args:
        url: URL of a listing page.

    Returns:
        list[dict]: one record per card with the keys ``title``,
        ``alt_title``, ``type``, ``description``, ``genre``, ``author``,
        ``artist``, ``rate``, ``image`` and ``released``.

        * ``artist`` is always ``"-"``.
        * ``rate`` is ``0.0`` when the score text is not a plain number.
        * ``image`` is ``None`` when the card has no image tag or no ``src``.
        * Any other element missing from the HTML yields an empty string, so
          a single malformed card no longer aborts the whole page.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            the timeout.
    """
    # Seconds. Without a timeout, requests waits indefinitely on a stalled
    # connection, which would freeze a multi-hundred-page scrape.
    request_timeout = 30

    def _text(parent, tag, css_class):
        """Return the raw text of the first matching descendant, or ''."""
        node = parent.find(tag, class_=css_class) if parent is not None else None
        return node.text if node is not None else ""

    data = []
    response = requests.get(url, timeout=request_timeout)
    listing = BeautifulSoup(response.content, 'html.parser')

    for comic in listing.find_all('div', class_='list-update_item'):
        # --- fields available directly on the listing card ---
        title = _text(comic, 'h3', 'title')
        img_tag = comic.find('img', class_='ts-post-image')
        img = img_tag.get('src') if img_tag is not None else None
        # Scores may use a decimal comma or a doubled dot; normalise both to a
        # single dot before validating.
        raw_rate = _text(comic, 'div', 'numscore').replace(',', '.').replace('..', '.').strip()
        rate = float(raw_rate) if re.match(r'^\d+(\.\d+)?$', raw_rate) else 0.0
        comic_type = _text(comic, 'span', 'type')

        # --- fields that live on the comic's own detail page ---
        link = comic.find('a')
        detail_url = link.get('href') if link is not None else None
        detail = None
        if detail_url:
            detail_response = requests.get(detail_url, timeout=request_timeout)
            detail = BeautifulSoup(detail_response.content, 'html.parser')

        raw_description = _text(detail, 'div', 'komik_info-description-sinopsis').strip()
        # Keep only ASCII letters, digits and whitespace in the synopsis.
        description = re.sub(r'[^a-zA-Z0-9\s]', '', raw_description)
        alt_title = _text(detail, 'span', 'komik_info-content-native').strip()
        released = (
            _text(detail, 'span', 'komik_info-content-info-release')
            .strip()
            .replace('Released:', '')
            .strip()
        )
        author = (
            _text(detail, 'span', 'komik_info-content-info')
            .strip()
            .replace('Author:', '')
            .strip()
        )

        # Genres are the link texts inside the genre span, comma-joined.
        raw_genre = (
            detail.find('span', class_='komik_info-content-genre')
            if detail is not None else None
        )
        if raw_genre is not None:
            genre = ', '.join(a.text.strip() for a in raw_genre.find_all('a'))
        else:
            genre = ""

        data.append({
            'title': title,
            'alt_title': alt_title,
            'type': comic_type,
            'description': description,
            'genre': genre,
            'author': author,
            'artist': "-",
            'rate': rate,
            'image': img,
            'released': released,
        })
    return data

In [10]:
url = 'https://komikcast.cz/daftar-komik'
# Listing pages look like: https://komikcast.cz/daftar-komik/page/1/

# Probe the index page first so an unreachable site is reported before the
# long scrape starts; the timeout stops a stalled connection from hanging.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    url += "/page"

    # The scrape is run manually in batches, so the page range is hard-coded
    # rather than read from the site's pagination widget (that lookup was
    # always overwritten and could fail when the widget was absent).
    indexNum = 201   # first page of this batch (inclusive); earlier batches: 100, 0
    last_page = 315  # end of this batch (exclusive);        earlier batches: 201, 100

    records = []
    for i in range(indexNum, last_page):
        url_page = url + "/" + str(i)  # Build the URL for the current page
        # extend() appends in place; `list + list` re-copied every earlier
        # record on each page.
        records.extend(getdataurl(url_page))
    df = pd.DataFrame(records)
else:
    # `df` is deliberately left undefined here so the later save cell cannot
    # overwrite the CSV with an empty frame.
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')

In [11]:
# Preview the first five scraped records.
df.head(n=5)

Unnamed: 0,title,alt_title,type,description,genre,author,artist,rate,image,released
0,Boku no Namae wa “Shounen A”,"My Name Is ""Boy A"", 僕の名前は｢少年A｣",Manga,Ini tentang seorang anak baikbaik bernama Taka...,"Drama, Mature, Romance, School Life, Shounen, ...","Kimizuka Chikara, Yen Hioka",-,8.15,https://komikcast.cz/wp-content/uploads/2020/1...,"Oct 23, 2017"
1,Dain Ironworks,-,Manhwa,Geografi Candi Goryeosa vol 10 Orangorang dari...,"Action, Fantasy",-,-,7.3,https://komikcast.cz/wp-content/uploads/2021/0...,2021
2,Stepping Through The Fairy River (Ta Sui Xian He),,Manhua,Ketika Qin Lie menyadari bahwa martabat hanya ...,"Action, Martial Arts",,-,7.0,https://komikcast.cz/wp-content/uploads/2019/1...,2019
3,What can the eldest lady have?,,Manhua,Setelah merencanakan selama bertahuntahun putr...,"Adventure, Romance",Zan Wu,-,7.0,?w=225&q=50,2021
4,Close Mad Doctor,-,Manhua,Master tak tertandingi yang keluar dari pegunu...,"Action, Adventure, Ecchi, Romance",Qiān huì dòngmàn yīzhí kàn,-,7.45,https://komikcast.cz/wp-content/uploads/2021/0...,?


In [12]:
# to_csv does not create missing parent folders; make sure `data/` exists so
# a long scrape is not lost to an OSError at the very last step.
Path('data').mkdir(parents=True, exist_ok=True)
df.to_csv('data/komikcast.csv', index=False, encoding='utf-8')