# Data Loading & Cleaning

### Imports

In [21]:
#Imports
import pandas as pd

### Data Loading

In [22]:
#Read the raw anime and manga tables into dataframes.
df_anime, df_manga = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/anime.csv", "Dataset/manga.csv")
)

#Read the matching statistics tables.
df_anime_stats, df_manga_stats = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/anime_stats.csv", "Dataset/manga_stats.csv")
)

### Data Cleaning

In [23]:
#Keep every row except those whose sfw flag is exactly False
#(rows with any other value in that column are left in place).
df_anime = df_anime.loc[df_anime["sfw"].ne(False)]
df_manga = df_manga.loc[df_manga["sfw"].ne(False)]

In [24]:
#Keep every row except those whose approved flag is exactly False
#(rows with any other value in that column are left in place).
df_anime = df_anime.loc[df_anime["approved"].ne(False)]
df_manga = df_manga.loc[df_manga["approved"].ne(False)]

In [25]:
#Columns removed from the anime table before it is merged and exported.
anime_unused_columns = [
    "episodes",
    "start_date",
    "end_date",
    "episode_duration",
    "total_duration",
    "sfw",
    "approved",
    "created_at",
    "updated_at",
    "start_season",
    "real_start_date",
    "real_end_date",
    "broadcast_day",
    "broadcast_time",
    "studios",
    "producers",
    "licensors",
    "synopsis",
    "background",
    "url",
    "title_japanese",
    "title_synonyms",
]
df_anime = df_anime.drop(columns=anime_unused_columns)

In [26]:
#Columns removed from the manga table before it is merged and exported.
manga_unused_columns = [
    "end_date",
    "sfw",
    "approved",
    "created_at_before",
    "updated_at",
    "real_start_date",
    "real_end_date",
    "synopsis",
    "background",
    "url",
    "title_japanese",
    "title_synonyms",
    "volumes",
    "chapters",
    "authors",
    "serializations",
    "jikan",
]
df_manga = df_manga.drop(columns=manga_unused_columns)

In [27]:
#Display the manga dataframe as it stands after the row and column drops above.
df_manga

Unnamed: 0,manga_id,title,type,score,scored_by,status,start_date,members,favorites,genres,themes,demographics,main_picture,title_english
0,2,Berserk,manga,9.45,268737,currently_publishing,1989-08-25,551266,103820,"['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",['Seinen'],https://cdn.myanimelist.net/images/manga/1/157...,Berserk
1,13,One Piece,manga,9.20,305917,currently_publishing,1997-07-22,501291,99526,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],https://cdn.myanimelist.net/images/manga/2/253...,One Piece
2,25,Fullmetal Alchemist,manga,9.05,143879,finished,2001-07-12,264571,28519,"['Action', 'Adventure', 'Award Winning', 'Dram...",['Military'],['Shounen'],https://cdn.myanimelist.net/images/manga/3/243...,Fullmetal Alchemist
3,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,manga,9.27,125482,finished,2004-01-19,209123,35456,"['Action', 'Adventure', 'Horror', 'Mystery', '...",['Historical'],"['Seinen', 'Shounen']",https://cdn.myanimelist.net/images/manga/3/179...,
4,4632,Oyasumi Punpun,manga,9.03,141146,finished,2007-03-15,354176,43168,"['Drama', 'Slice of Life']",['Psychological'],['Seinen'],https://cdn.myanimelist.net/images/manga/3/164...,Goodnight Punpun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59967,145183,Shikumareta Jouji,manga,,1,finished,2011-06-30,4,0,['Romance'],[],['Josei'],https://cdn.myanimelist.net/images/manga/1/263...,Bought: The Greek's Baby
59968,139311,A Compendium of Ghosts,manhwa,,0,finished,2013-05-22,3,0,"['Drama', 'Supernatural']",['Historical'],[],https://cdn.myanimelist.net/images/manga/1/265...,A Compendium of Ghosts
59969,149561,Red Velvet,manga,,0,finished,2018-10-22,3,0,['Drama'],[],['Seinen'],https://cdn.myanimelist.net/images/manga/1/265...,
59980,146734,Red Es,manga,,0,currently_publishing,2022-05-09,1,0,"['Mystery', 'Supernatural']",[],['Seinen'],https://cdn.myanimelist.net/images/manga/3/265...,


### Missing data

In [35]:
#Print whether each statistics table contains at least one missing value.
print(df_anime_stats.isna().to_numpy().any())
print(df_manga_stats.isna().to_numpy().any())

False
False


In [29]:
#Keep only rows whose score converts to a number; values that cannot be
#converted are coerced to NaN and are filtered out together with the
#scores that were already missing.
anime_score = pd.to_numeric(df_anime["score"], errors="coerce")
manga_score = pd.to_numeric(df_manga["score"], errors="coerce")
df_anime = df_anime.loc[anime_score.notna()]
df_manga = df_manga.loc[manga_score.notna()]

In [36]:
#Load the manga sales table (read as cp1252) and show whether it contains
#at least one missing value.
df_sales = pd.read_csv("Clean_Dataset/manga_sales.csv", encoding="cp1252")
df_sales.isna().to_numpy().any()

False

### Join files

In [33]:
#Inner-join each cleaned table with its statistics table on the id column;
#rows without a match on either side are not carried into the result.
anime_merged = df_anime.merge(df_anime_stats, how="inner", on="anime_id")
manga_merged = df_manga.merge(df_manga_stats, how="inner", on="manga_id")

In [37]:
#Print whether each merged table contains at least one missing value.
print(anime_merged.isna().to_numpy().any())
print(manga_merged.isna().to_numpy().any())

True
True


### Create new files

In [34]:
#Write the merged tables to CSV files, leaving out the dataframe index.
anime_merged.to_csv("Clean_Dataset/anime_clean.csv", index=False)
manga_merged.to_csv("Clean_Dataset/manga_clean.csv", index=False)