In [1]:
import numpy as np
import pandas as pd
import re
from ast import literal_eval as le
import warnings as w
w.filterwarnings("ignore")

In [2]:
df = pd.read_csv("anime_data.csv")
pd.set_option("display.max_columns", None)

In [3]:
df.anime_producer = df.anime_producer.replace("['NA']", df.anime_producer.mode()[0])
df.anime_mal_score = pd.to_numeric(df.anime_mal_score, errors="coerce")
df.anime_mal_score = df.anime_mal_score.replace(np.nan, df.anime_mal_score.median())

In [4]:
# Dropping the rows with NaN values

df.dropna(inplace=True)

In [5]:
df.anime_genres = df.anime_genres.apply(le)
df.anime_producer = df.anime_producer.apply(le)
df.anime_genres = df.anime_genres.apply(lambda x: [i.replace(" ", "") for i in x])
df.anime_studio = df.anime_studio.apply(lambda x: x.replace(" ", ""))
df.anime_studio = df.anime_studio.apply(lambda x: x.split())

In [6]:
df.head()

Unnamed: 0,index,anime_urls,anime_poster,anime_title,anime_overview,anime_mal_score,anime_views,anime_studio,anime_producer,anime_genres
0,0,https://sanji.to/bungaku-shoujo-kyou-no-oyatsu...,https://img.zorores.com/_r/300x400/100/59/ce/5...,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",\n Shor...,6.92,757,[ProductionI.G],"[Lantis, Pony Canyon, Enterbrain, Kadokawa Con...","[Comedy, Fantasy, School]"
1,1,https://sanji.to/bungaku-shoujo-memoire-2108,https://img.zorores.com/_r/300x400/100/ec/bd/e...,"""Bungaku Shoujo"" Memoire",\n Epis...,7.35,1284,[ProductionI.G],"[Lantis, Pony Canyon, Enterbrain, Kadokawa Con...","[Drama, Romance, School]"
2,2,https://sanji.to/bungaku-shoujo-movie-1802,https://img.zorores.com/_r/300x400/100/5c/85/5...,"""Bungaku Shoujo"" Movie",\n The ...,7.43,1726,[ProductionI.G],"[Lantis, Pony Canyon, Enterbrain, Kadokawa Con...","[Mystery, Drama, Romance, School]"
3,3,https://sanji.to/my-star-18330,https://img.zorores.com/_r/300x400/100/99/59/9...,My Star,\n Sixt...,9.33,4450861,[DogaKobo],"[Kadokawa, Shueisha]","[Drama, Music, Supernatural, Seinen]"
4,4,https://sanji.to/hackgu-returner-4688,https://img.zorores.com/_r/300x400/100/e1/ff/e...,.hack//G.U. Returner,\n The ...,6.73,270,[BeeTrain],"[Bandai Visual, CyberConnect2, Bee Train]","[Adventure, Drama, Fantasy, Game, Magic, Sci-Fi]"


In [7]:
#Fetch wordcount for each abstract

df['word_count'] = df['anime_overview'].apply(lambda x: len(str(x).split(" ")))
df[['anime_overview','word_count']].head()

Unnamed: 0,anime_overview,word_count
0,\n Shor...,90
1,\n Epis...,88
2,\n The ...,187
3,\n Sixt...,224
4,\n The ...,146


In [8]:
##Descriptive statistics of word counts

df.word_count.describe()

count    5447.000000
mean      173.550395
std        65.855962
min         1.000000
25%       125.000000
50%       172.000000
75%       225.000000
max       533.000000
Name: word_count, dtype: float64

In [9]:
#Identify common words
freq = pd.Series(' '.join(df['anime_overview']).split()).value_counts()[:20]
freq

the      33572
to       18293
of       15981
a        15686
and      15545
is        8026
in        7833
his       6511
with      5296
by        4928
her       4885
that      4345
as        4338
their     3934
he        3755
for       3680
on        2885
an        2879
who       2817
from      2724
Name: count, dtype: int64

In [10]:
# Identify uncommon words
freq1 = pd.Series(" ".join(df["anime_overview"]).split()).value_counts()[-20:]
freq1

frees          1
jungle's       1
Jimmy,         1
Cousin         1
Laura,         1
introduced.    1
"Quirk         1
Montgomery.    1
Maud           1
Wataru.        1
Sophia,        1
Francisco,     1
Doomsday       1
enhancement    1
underwent      1
Theory."       1
Quirk          1
genes,         1
Imp            1
Kakana,        1
Name: count, dtype: int64

# Text Preprocessing

In [11]:
# Removing the last part from the overview which is [Written by MAL Rewrite] 

df['anime_overview'] =  df['anime_overview'].apply(lambda x: x.split(r"[Written by MAL Rewrite]")[0])

# Taking Only Words
df['anime_overview'] = df['anime_overview'].apply(lambda x: " ".join(re.findall(r'[a-zA-Z]+', x)).lower())

# Removing all s and t from the overview which are not in the form of words

df['anime_overview'] = df['anime_overview'].apply(lambda x: re.sub(r'\bs\b|\bt\b','',x))


In [12]:
# Dropping the index column and resetting the index to new values
df.drop('index', axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)
df.reset_index(inplace=True)

In [13]:
# Changing the column names

df.rename(
    columns={
        "index": "anime_id",
        "anime_urls": "urls",
        "anime_overview": "overview",
        "anime_genres": "genres",
        "anime_producer": "producer",
        "anime_studio": "studio",
        "anime_mal_score": "score",
        "anime_poster": "poster",
        "anime_title": "title",
        "anime_views": "views",
    },
    inplace=True,
)

In [14]:
df.to_csv("anime_data_cleaned.csv", index=False)