In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the anime dataset; the path is resolved relative to the working directory.
df_anime = pd.read_csv(filepath_or_buffer='anime_data.csv')

# Print the column names, non-null counts and dtypes of the loaded frame.
df_anime.info()

# What are the top 10 highest ranked anime series?

In [None]:
# Order every row by its 'rank' value, smallest value first
# (sort_values sorts ascending by default).
top_anime = df_anime.sort_values('rank')

# Keep the first ten rows of that ordering and display rank next to the title.
top_10 = top_anime.iloc[:10]
top_10.loc[:, ['rank', 'name']]


# Which anime series have the largest member/viewer base (top 10)?

In [None]:
# Sort by member count in DESCENDING order so the largest audiences come first.
# (An ascending sort would list the ten anime with the FEWEST members instead.)
top_anime = df_anime.sort_values(by='members', ascending=False)

# Keep the ten rows with the highest member counts and show count next to title.
top_10 = top_anime.head(10)
top_10[['members', 'name']]


# What are the top 10 anime series with the most episodes?

In [None]:
# Order the rows by 'numOfEp' from the largest value to the smallest.
top_anime_by_episodes = df_anime.sort_values('numOfEp', ascending=False)

# Take the first ten rows of that ordering and display episode count with title.
top_10_episodes = top_anime_by_episodes.iloc[:10]
top_10_episodes.loc[:, ['numOfEp', 'name']]


# Is there a correlation between the number of members/viewers and the anime's score rating?

In [None]:
# Keep only the rows where both 'members' and 'score' are present, so that
# every observation contributes a complete (members, score) pair.
animeCorr = df_anime.dropna(subset=['members', 'score'])

# Calculate the correlation coefficient between member count and score.
# Series.corr must be CALLED with the other series; it uses Pearson by default.
correlation = animeCorr['members'].corr(animeCorr['score'])

# Display the coefficient as the cell output.
correlation


# ------------------
# Data Preprocessing
# ------------------

# 1. **Inspect the Data**


In [None]:
# Structural overview of the frame: columns, non-null counts and dtypes.
df_anime.info()

# Summary statistics for the two columns used in the analysis above.
df_anime.loc[:, ['members', 'score']].describe()

# 2. Handling Missing or Null Values

In [None]:
# Build a copy of the data without rows that lack 'members' or 'score'.
# dropna returns a new frame; df_anime itself is left unchanged.
anime_dfd = df_anime.dropna(subset=['members', 'score'])

# Inspect the FILTERED frame so the effect of dropping rows is visible
# (calling info() on df_anime would just repeat the unfiltered summary).
anime_dfd.info()

# Fill Missing Values

In [None]:
# Impute missing values: median for 'members', mean for 'score'.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection. The in-place form operates on
# an intermediate object: it raises a FutureWarning under pandas 2.x and does
# not update df_anime at all once Copy-on-Write is enabled.
df_anime['members'] = df_anime['members'].fillna(df_anime['members'].median())
df_anime['score'] = df_anime['score'].fillna(df_anime['score'].mean())

# Confirm that the non-null counts of both columns now match the row count.
df_anime.info()

# 3. Handling Outliers

In [None]:
# Draw one box plot per column, each on its own axis. The two columns are
# plotted separately so that each gets a y-scale of its own; on a shared axis
# the column with the larger values would squash the other into a flat line.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, column in zip(axes, ['members', 'score']):
    # Drop NaNs so the plot is still drawn if imputation has not been run.
    ax.boxplot(df_anime[column].dropna())
    # Label the single box via the tick label instead of the `labels=` keyword,
    # which newer matplotlib releases deprecate in favour of `tick_labels=`.
    ax.set_xticklabels([column])
    ax.set_title(f'Distribution of {column}')
fig.tight_layout()
plt.show()

# 4. Remove Duplicates

In [None]:
# Remove rows that are exact duplicates across ALL columns.
# Comparing only 'members' and 'score' would treat different anime that happen
# to share those two numbers as duplicates, including every row whose missing
# values were filled with the same median/mean, and silently discard them.
anime_dup = df_anime.drop_duplicates()

# Inspect the de-duplicated frame to see how many rows remain.
anime_dup.info()