In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

df_anime = pd.read_csv('anime_data.csv')
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1000 non-null   int64  
 1   rank        1000 non-null   int64  
 2   score       1000 non-null   float64
 3   name        1000 non-null   object 
 4   link        1000 non-null   object 
 5   showType    1000 non-null   object 
 6   numOfEp     993 non-null    float64
 7   startDate   1000 non-null   object 
 8   endDate     980 non-null    object 
 9   members     1000 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 78.3+ KB


# What are the top 10 highest ranked anime series?

In [7]:
top_anime = df_anime.sort_values(by='rank', ascending=True)


top_10 = top_anime.head(10)
top_10[['rank', 'name']]


Unnamed: 0,rank,name
0,1,Sousou no Frieren
1,2,Fullmetal Alchemist: Brotherhood
2,3,Steins;Gate
3,4,Shingeki no Kyojin Season 3 Part 2
4,5,One Piece Fan Letter
5,6,Gintama°
6,7,Gintama: The Final
7,8,Hunter x Hunter (2011)
8,9,Gintama'
9,10,Gintama': Enchousen


# Which anime series have the largest member/viewer base (top 10)?

In [6]:
top_anime = df_anime.sort_values(by='members', ascending=True)


top_10 = top_anime.head(10)
top_10[['members', 'name']]


Unnamed: 0,members,name
877,1294,Fanren Xiu Xian Chuan: Waihai Fengyun
731,1424,Tunshi Xingkong Movie: Xueluo Dalu
941,1555,Wu Dong Qian Kun 5th Season
903,2395,Wu Dong Qian Kun 4th Season
728,2397,Zhu Xian 2nd Season
709,2573,Benghuai: Xing Qiong Tiedao - Gelamo de Yujin
786,2621,Aikatsu! 10th Story: Mirai e no Starway (2023)
976,2698,Yi Nian Yong Heng 3rd Season
388,2763,Fanren Xiu Xian Chuan: Xinghai Feichi
870,3457,Yuanshen: Weixing zhi Lu


# What are the top 10 anime series with the most episodes?

In [8]:
top_anime_by_episodes = df_anime.sort_values(by='numOfEp', ascending=False)
top_10_episodes = top_anime_by_episodes.head(10)
top_10_episodes[['numOfEp', 'name']]


Unnamed: 0,numOfEp,name
886,1787.0,Doraemon (1979)
306,500.0,Naruto: Shippuuden
710,366.0,Bleach
387,291.0,Dragon Ball Z
249,237.0,Douluo Dalu 2nd Season
902,234.0,Wanmei Shijie
662,220.0,Naruto
493,203.0,Katekyou Hitman Reborn!
16,201.0,Gintama
936,178.0,Tennis no Oujisama


# Is there a correlation between the number of members/viewers and the anime's score rating?

In [9]:
animeCorr = df_anime.dropna(subset=['members', 'score'])

# Calculate the correlation coefficient
correlation = animeCorr['members'].corr


# What are the lowest 10 ranked anime series?

In [10]:
bottom_anime = df_anime.sort_values(by='rank', ascending=True)


bottom_10 = bottom_anime.tail(10)
bottom_10[['rank', 'name']]


Unnamed: 0,rank,name
990,991,Kuroko no Basket Movie 3: Winter Cup - Tobira ...
991,992,Mairimashita! Iruma-kun 3rd Season
992,993,New Initial D Movie: Legend 2 - Tousou
993,994,One Piece Film: Red
994,995,Seitokai Yakuindomo Movie
995,996,Somali to Mori no Kamisama
996,997,Sonny Boy
997,998,Suzumiya Haruhi no Yuuutsu
998,999,Tokyo Revengers: Tenjiku-hen
999,1000,Uchouten Kazoku


# -----------------
# Data PreProcessing
# ------------------

# 1. **Inspect the Data**


In [None]:
df_anime.info()
df_anime[['members', 'score']].describe()

# 2. Handling Missing or Null Values

In [None]:
anime_dfd = df_anime.dropna(subset=['members', 'score'])
df_anime.info()

# fill missing Value

In [None]:
df_anime['members'].fillna(df_anime['members'].median(), inplace=True)
df_anime['score'].fillna(df_anime['score'].mean(), inplace=True)
df_anime.info()

# 3. Remove Duplicates

In [None]:
anime_dup = df_anime.drop_duplicates(subset=['members', 'score'])
anime_dup.info()