# Import Library

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from PIL import Image
import uuid  # Import UUID module
import os

# Scrape

## KomikCast

In [1]:
def _save_placeholder(save_path, size=(225, 225)):
    """Write a solid gray RGB image of *size* to *save_path*."""
    Image.new('RGB', size, color='gray').save(save_path)


def download_image(img_url, save_path, timeout=30):
    """Download ``img_url`` to ``save_path``, or save a gray placeholder.

    A 225x225 gray image is written to ``save_path`` instead of the
    download when:

    * ``img_url`` is not a string starting with ``http://`` or ``https://``;
    * the request fails, times out, or returns an HTTP error status;
    * writing the downloaded bytes fails.

    Args:
        img_url: Address of the image to fetch. ``None`` or any other
            non-string value is treated as an invalid URL.
        save_path: Destination file path for the image or the placeholder.
        timeout: Seconds to wait for the server before giving up.
    """
    if not isinstance(img_url, str) or not img_url.startswith(('http://', 'https://')):
        print(f"Invalid URL for {save_path}. Saving a gray placeholder image instead.")
        _save_placeholder(save_path)
        return

    try:
        response = requests.get(img_url, timeout=timeout)
        # Without this check the body of a 404/500 error page would be
        # written to disk as if it were image data.
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            file.write(response.content)
    except Exception as e:
        # Deliberately broad: downloading is best-effort and one bad image
        # must not abort the surrounding scrape.
        print(f"Failed to download image for {save_path}: {e}")
        _save_placeholder(save_path)
    else:
        print(f"Image downloaded for {save_path}")
            
def _node_text(node, default=""):
    """Return ``node.text``, or *default* when the lookup found no node."""
    return node.text if node is not None else default


def getdataurl(url, timeout=30):
    """Scrape one listing page and the detail page of every comic on it.

    For each ``div.list-update_item`` on the listing page, the title, cover
    URL, rating and type are read from the listing entry. The entry's first
    link is then fetched to read the description, alternative title, release
    info, author and genres. The cover is saved to ``image/<uuid>.jpg`` via
    ``download_image``.

    Any element that is missing from the HTML yields an empty string (or
    ``0.0`` for the rating) rather than raising. A failed detail request is
    reported with ``print`` and leaves the detail fields empty.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of dicts, one per comic, with the keys ``id``, ``title``,
        ``alt_title``, ``type``, ``description``, ``genre``, ``author``,
        ``artist``, ``rate``, ``image`` and ``released``.
    """
    data = []
    response = requests.get(url, timeout=timeout)
    listing = BeautifulSoup(response.content, 'html.parser')
    comics = listing.find_all('div', class_='list-update_item')

    if comics:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs('image', exist_ok=True)

    for comic in comics:
        # Fields available directly on the listing entry.
        title = _node_text(comic.find('h3', class_='title'))
        img_node = comic.find('img', class_='ts-post-image')
        img = (img_node.get('src') if img_node is not None else None) or ""
        raw_rate = (
            _node_text(comic.find('div', class_='numscore'))
            .replace(',', '.')
            .replace('..', '.')
            .strip()
        )
        # Anything that is not a plain decimal number counts as unrated.
        rate = float(raw_rate) if re.match(r'^\d+(\.\d+)?$', raw_rate) else 0.0
        comic_type = _node_text(comic.find('span', class_='type'))

        # Detail page. Start from an empty document so that every lookup
        # below returns None when the page cannot be fetched.
        detail = BeautifulSoup('', 'html.parser')
        link = comic.find('a')
        href = link.get('href') if link is not None else None
        if href:
            try:
                detail_response = requests.get(href, timeout=timeout)
                detail = BeautifulSoup(detail_response.content, 'html.parser')
            except requests.RequestException as e:
                print(f"Failed to fetch details for {title!r}: {e}")

        raw_description = _node_text(
            detail.find('div', class_="komik_info-description-sinopsis")
        ).strip()
        # Keep only ASCII letters, digits and whitespace.
        description = re.sub(r'[^a-zA-Z0-9\s]', '', raw_description)
        alt_title = _node_text(
            detail.find('span', class_='komik_info-content-native')
        ).strip()
        released = (
            _node_text(detail.find('span', class_='komik_info-content-info-release'))
            .strip()
            .replace('Released:', '')
            .strip()
        )
        # NOTE(review): takes the first span with this class and assumes it
        # is the "Author:" entry; confirm against the page markup.
        author = (
            _node_text(detail.find('span', class_='komik_info-content-info'))
            .strip()
            .replace('Author:', '')
            .strip()
        )

        # Genres are the link texts inside the genre span, comma-joined.
        raw_genre = detail.find('span', class_='komik_info-content-genre')
        if raw_genre:
            genre = ', '.join(a.text.strip() for a in raw_genre.find_all('a'))
        else:
            genre = ""

        uuid_data = str(uuid.uuid4())
        data.append({
            'id': uuid_data,
            'title': title,
            'alt_title': alt_title,
            'type': comic_type,
            'description': description,
            'genre': genre,
            'author': author,
            'artist': "-",
            'rate': rate,
            'image': img,
            'released': released,
        })

        # The UUID doubles as the image file name so rows and files match.
        download_image(img, f'image/{uuid_data}.jpg')

    return data

In [10]:
url = 'https://komikcast.cz/daftar-komik'

# Hard upper bound on the page range scraped in one run.
MAX_PAGE = 400
# Number of listing pages written to each CSV file.
CHUNK_SIZE = 100

response = requests.get(url, timeout=30)

if response.status_code == 200:
    url += "/page"
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content of the page

    # Assumes the second-to-last pagination link holds the highest page
    # number (the last one presumably being "next") - verify against the site.
    page_links = soup.find_all('a', class_='page-numbers')
    try:
        detected_last_page = int(page_links[-2].text) + 1
    except (IndexError, ValueError):
        # Pagination missing or not numeric: fall back to the fixed cap.
        detected_last_page = MAX_PAGE

    # NOTE(review): the range starts at 0, so the first request goes to
    # ".../page/0"; confirm the site does not serve page 1 there, otherwise
    # those comics are scraped twice.
    indexNum = 0
    # Never request pages beyond what the site reports, nor beyond the cap.
    last_page = min(detected_last_page, MAX_PAGE)

    os.makedirs('data', exist_ok=True)

    for start in range(indexNum, last_page, CHUNK_SIZE):
        end = min(start + CHUNK_SIZE, last_page)
        json_data = []
        for i in range(start, end):
            json_data.extend(getdataurl(f"{url}/{i}"))

        # One CSV per chunk, named after the chunk's exclusive end page.
        df = pd.DataFrame(json_data)
        df.to_csv(f'data/komikcast-{end}.csv', index=False, encoding='utf-8')
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')