# Import Libraries

In [30]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scraping Code

In [31]:
_REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

# Maps the label in the first cell of an info-table row to the output key it fills.
_INFO_FIELDS = {'Released': 'released', 'Author': 'author', 'Artist': 'artist'}


def _strip_symbols(text):
    """Remove every character that is not an ASCII letter, digit or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


def _parse_card(comic):
    """Extract title, image URL, rating and type from one listing card.

    Any element that is missing from the card yields a neutral default
    ('' / None / 0.0) instead of raising.
    """
    title_tag = comic.find('div', class_='tt')
    raw_title = title_tag.text if title_tag is not None else ''
    title = _strip_symbols(raw_title.replace('\n', ' ').replace('\t', ' ')).strip()
    title = re.sub(r'\s+', ' ', title)  # collapse runs of whitespace left by the cleanup

    img_tag = comic.find('img', class_='ts-post-image')
    image = img_tag.get('src') if img_tag is not None else None

    score_tag = comic.find('div', class_='numscore')
    raw_rate = score_tag.text if score_tag is not None else ''
    # Scores may use a decimal comma; normalise before validating.
    raw_rate = raw_rate.replace(',', '.').replace('..', '.').strip()
    rate = float(raw_rate) if re.match(r'^\d+(\.\d+)?$', raw_rate) else 0.0

    type_tag = comic.find('span', class_='type')
    type_classes = (type_tag.get('class') or []) if type_tag is not None else []
    # The comic type is read from the second CSS class of the span.
    comic_type = type_classes[1] if len(type_classes) > 1 else ''

    return {'title': title, 'image': image, 'rate': rate, 'type': comic_type}


def _parse_detail(detail):
    """Extract description, alternative title, genres and info-table fields.

    `detail` is the parsed detail page, or None when the card had no link.
    Fields that cannot be found keep their defaults ('' or '-').
    """
    info = {
        'description': '',
        'alt_title': '',
        'genre': '',
        'released': '-',
        'author': '-',
        'artist': '-',
    }
    if detail is None:
        return info

    content_tag = detail.find('div', class_='entry-content')
    if content_tag is not None:
        info['description'] = _strip_symbols(content_tag.text.strip())

    alt_tag = detail.find('div', class_='seriestualt')
    if alt_tag is not None:
        info['alt_title'] = alt_tag.text.strip()

    genre_tag = detail.find('div', class_='seriestugenre')
    if genre_tag is not None:
        info['genre'] = ', '.join(a.text.strip() for a in genre_tag.find_all('a'))

    table = detail.find('table', class_='infotable')
    rows = table.find_all('tr') if table is not None else []
    for row in rows:
        cells = row.find_all('td')
        if len(cells) > 1:
            field = _INFO_FIELDS.get(cells[0].text.strip())
            if field is not None:
                info[field] = cells[1].text.strip()
    return info


def getdataurl(url):
    """Scrape one listing page and the detail page of every comic on it.

    Args:
        url: URL of a listing page whose comics are rendered as
            ``<div class="bs">`` cards.

    Returns:
        A list with one dict per card, with the keys (in this order)
        'title', 'alt_title', 'type', 'description', 'genre', 'author',
        'artist', 'rate', 'image', 'released'. Elements missing from the
        card or its detail page fall back to defaults rather than raising.

    Raises:
        requests.exceptions.RequestException: if a request fails or does
            not respond within the timeout.
    """
    data = []
    # One session for the listing and all its detail pages so the
    # underlying connection is reused instead of reopened per comic.
    with requests.Session() as session:
        response = session.get(url, timeout=_REQUEST_TIMEOUT)
        listing = BeautifulSoup(response.content, 'html.parser')

        for comic in listing.find_all('div', class_='bs'):
            card = _parse_card(comic)

            link = comic.find('a')
            href = link.get('href') if link is not None else None
            detail = None
            if href:
                detail_response = session.get(href, timeout=_REQUEST_TIMEOUT)
                detail = BeautifulSoup(detail_response.content, 'html.parser')
            info = _parse_detail(detail)

            # Key order defines the column order of the resulting DataFrame.
            data.append({
                'title': card['title'],
                'alt_title': info['alt_title'],
                'type': card['type'],
                'description': info['description'],
                'genre': info['genre'],
                'author': info['author'],
                'artist': info['artist'],
                'rate': card['rate'],
                'image': card['image'],
                'released': info['released'],
            })
    return data

In [32]:
url = 'https://westmanga.fun/manga/'

# The site is scraped in manual batches of pages; edit these two bounds per run.
# NOTE(review): the original inline notes (# 100 # 0 / # 201 # 100) suggest
# earlier batches covered pages 0-99 and 100-200 -- confirm before re-running.
indexNum = 201   # first page of this batch (inclusive)
last_page = 315  # end of this batch (exclusive)

# Probe the listing root once so an unreachable site is reported before the long loop starts.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    records = []
    for i in range(indexNum, last_page):
        url_page = f"{url}?page={i}"  # Build the URL for the current page
        # extend() appends in place; re-concatenating the list would copy it on every page.
        records.extend(getdataurl(url_page))
    df = pd.DataFrame(records)
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')

In [33]:
# Preview the first five scraped rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,title,alt_title,type,description,genre,author,artist,rate,image,released
0,The Crazy Genius Composer Returns,,Manhwa,Sukses kehormatan keluarga Min Hyunseung bla b...,Drama,-,-,7.0,https://westmanga.fun/wp-content/uploads/2024/...,-
1,The Crow,"Chu Wu, Chǔ Wū, Clear Crow, The Crow, 楚乌",Manhua,Orangorang menjadi hantu saat meninggal dan ak...,"Action, Drama, Fantasy, Horror, Shounen",Mo Fei,-,7.11,https://westmanga.fun/wp-content/uploads/2023/...,-
2,The Crown Prince That Sells Medicine,"Crown Prince Sells Medicine, 약 파는 황태자",Manhwa,Oleh Penulis yang membawakan Anda \nSaya putra...,"Action, Adventure, Comedy, Fantasy","BK_Moon, LEE Hyun-Min",-,7.2,https://westmanga.fun/wp-content/uploads/2023/...,-
3,The Cuckoos Fiancee,"A Couple of Cuckoos, Cuckoo no Iinazuke, Kakko...",Manga,Komedi romantis tentang Nagi pergi yang harus ...,"Comedy, Harem, Romance, School life, Shounen",Miki Yoshikawa,-,7.6,https://westmanga.fun/wp-content/uploads/2021/...,-
4,The Cursed Strongest Player Dominates Again in...,The Cursed Strongest Player Rules the World; 呪...,Manga,Di dunia yang dihuni oleh berbagai ras seperti...,"Action, Fantasy, Game",-,-,7.0,https://westmanga.fun/wp-content/uploads/2024/...,-


In [34]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a long scrape could be lost to an OSError here.
output_path = Path('data/westmanga.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8')