In [21]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
# Import Dataset
anime_df = pd.read_csv("data/anime.csv")
manga_df = pd.read_csv("data/manga.csv")

In [23]:
anime_df.shape

(10000, 15)

In [24]:
# Check for Missing Values
anime_df.isnull().sum()

Title            0
Score            0
Vote             0
Ranked           0
Popularity       0
Episodes         0
Status           0
Aired            0
Premiered     5122
Producers        0
Licensors        0
Studios          0
Source           0
Duration         0
Rating          20
dtype: int64

In [25]:
manga_df.isnull().sum()

Title               0
Score               0
Vote                0
Ranked              0
Popularity          0
Members             0
Favorite            0
Volumes             0
Chapters            0
Status              0
Published           0
Genres              0
Themes              0
Demographics        0
Serialization    1789
Author             15
dtype: int64

In [26]:
# Drop Column named Premiered from the anime_df dataset
anime_df.drop(columns=["Premiered"], inplace=True)

In [27]:
# Check for Missing Values
anime_df.isnull().sum()

Title          0
Score          0
Vote           0
Ranked         0
Popularity     0
Episodes       0
Status         0
Aired          0
Producers      0
Licensors      0
Studios        0
Source         0
Duration       0
Rating        20
dtype: int64

In [28]:
anime_df.shape

(10000, 14)

In [29]:
# Check for Duplicates
anime_df.duplicated().sum()

1631

In [31]:
# Drop Duplicates from the anime_df dataset
anime_df.drop_duplicates(inplace=True)

In [30]:
manga_df.duplicated().sum()

0

In [32]:
# Data Types
anime_df.dtypes

Title          object
Score         float64
Vote            int64
Ranked          int64
Popularity      int64
Episodes       object
Status         object
Aired          object
Producers      object
Licensors      object
Studios        object
Source         object
Duration       object
Rating         object
dtype: object

In [33]:
manga_df.dtypes

Title             object
Score            float64
Vote               int64
Ranked             int64
Popularity         int64
Members           object
Favorite          object
Volumes           object
Chapters          object
Status            object
Published         object
Genres            object
Themes            object
Demographics      object
Serialization     object
Author            object
dtype: object

In [35]:
# Convert numerical columns to int64

anime_df[["Episodes"]] = anime_df[["Episodes"]].apply(pd.to_numeric, errors='coerce')
manga_df[["Members", "Favorite", "Volumes", "Chapters"]] = manga_df[["Members", "Favorite", "Volumes", "Chapters"]].apply(pd.to_numeric, errors='coerce')

In [36]:
# Data Types
anime_df.dtypes

Title          object
Score         float64
Vote            int64
Ranked          int64
Popularity      int64
Episodes      float64
Status         object
Aired          object
Producers      object
Licensors      object
Studios        object
Source         object
Duration       object
Rating         object
dtype: object

In [37]:
manga_df.dtypes

Title             object
Score            float64
Vote               int64
Ranked             int64
Popularity         int64
Members          float64
Favorite         float64
Volumes          float64
Chapters         float64
Status            object
Published         object
Genres            object
Themes            object
Demographics      object
Serialization     object
Author            object
dtype: object

In [39]:
# Check for outliers
anime_df.describe()

Unnamed: 0,Score,Vote,Ranked,Popularity,Episodes
count,8369.0,8369.0,8369.0,8369.0,8327.0
mean,7.392458,79436.36,2579.940495,4014.317481,16.055722
std,0.423811,190722.7,1400.953201,3086.50444,35.572343
min,6.8,109.0,1.0,1.0,1.0
25%,7.08,4487.0,1433.0,1541.0,1.0
50%,7.31,17755.0,2592.0,3343.0,12.0
75%,7.61,65712.0,3766.0,5789.0,16.0
max,9.14,2745016.0,5026.0,16554.0,1787.0


In [40]:
manga_df.describe()

Unnamed: 0,Score,Vote,Ranked,Popularity,Members,Favorite,Volumes,Chapters
count,10000.0,10000.0,10000.0,10000.0,2091.0,9526.0,7382.0,7479.0
mean,7.368518,3505.6671,5000.7863,8737.9255,672.515065,70.472076,6.645354,47.929937
std,0.377491,14026.002244,2887.469732,6707.791147,193.726656,144.0293,8.215789,77.81746
min,6.9,100.0,1.0,1.0,203.0,0.0,1.0,1.0
25%,7.08,318.0,2500.75,3104.5,528.5,6.0,2.0,8.0
50%,7.28,792.0,4992.5,7351.5,688.0,17.0,4.0,23.0
75%,7.56,2248.0,7500.25,13281.25,829.0,58.0,9.0,56.0
max,9.47,401815.0,10060.0,32962.0,998.0,999.0,200.0,1957.0


In [43]:
# Export Cleaned Dataset
anime_df.to_csv("data/clean_anime.csv", index=False)
manga_df.to_csv("data/clean_manga.csv", index=False)