In [2]:
import pandas as pd
import re

# Load the data
df_mangadex = pd.read_csv("../get_data/data/mangadex/mangadex-raw.csv")

# Function to check if a string is a valid URL
def is_valid_url(url):
    # Regex pattern to match URLs
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

# Filter the dataframe to keep only rows with valid URLs in the "image" column
df_mangadex = df_mangadex[df_mangadex['image'].apply(lambda x: is_valid_url(str(x)))]

# Remove duplicate rows
df_mangadex.drop_duplicates(inplace=True)

# Print the shape of the cleaned dataframe
print('Mangadex : ', df_mangadex.shape)


Mangadex :  (23769, 10)


In [3]:
df_mangadex.to_csv('mangadex.csv', index=False, encoding='utf-8')

In [4]:
df_mangadex[df_mangadex['title'] == 'One Piece']

Unnamed: 0,title,alt_title,type,description,genre,author,artist,rate,image,released
3800,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2677,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
10680,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2671,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
23225,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2672,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
29686,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2666,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
36286,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2665,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
46206,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2664,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
89988,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2657,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
96650,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.2659,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
102714,One Piece,"원피스, 海贼王, 海盗路飞, 航海王, 海賊王, One Piece. Большой к...",manga,"Gol D. Roger, a man referred to as the ""Pirate...","Award Winning, Sci-Fi, Monsters, Action, Anima...",Oda Eiichiro,Oda Eiichiro,9.4764,https://mangadex.org/covers/a1c7c817-4e59-43b7...,1997
