# Category Preparation

# Setting Up Necessary Things

In [1]:
# Jupyter Notebook Magic Command - Auto Reloading
%reload_ext autoreload
%autoreload 2

# Jupyter Notebook Magic Command - Inline Plotting
%matplotlib inline

In [2]:
# Ignore All Warnings
# NOTE: called without a category or module filter, so every warning raised
# after this cell runs is suppressed, not just known-noisy ones.
import warnings
warnings.filterwarnings("ignore")

# Necessary Imports

In [3]:
import os
from IPython.display import display
import re

# Data
import pandas as pd

# Language
from langdetect import detect

# Data Cleaning

In [4]:
# Read each category's pre-processed CSV from the shared processed-data folder
folder = "../../data/processed/"

anime_df = pd.read_csv(f"{folder}anime/anime_rating.csv")
book_df = pd.read_csv(f"{folder}book/book_rating.csv")
movie_df = pd.read_csv(f"{folder}movie/movie_rating.csv")
music_df = pd.read_csv(f"{folder}music/music_views.csv")
tv_show_df = pd.read_csv(f"{folder}tv_show/tv_show_rating.csv")
video_game_df = pd.read_csv(f"{folder}video_game/video_game.csv")

In [5]:
# Report (rows, columns) for every loaded DataFrame, one line per category
print("\n".join(
    f"{label} Shape: {frame.shape}"
    for label, frame in (
        ("Anime", anime_df),
        ("Book", book_df),
        ("Movie", movie_df),
        ("Music", music_df),
        ("TV Show", tv_show_df),
        ("Video Game", video_game_df),
    )
))

Anime Shape: (9206, 6)
Book Shape: (44129, 6)
Movie Shape: (112119, 6)
Music Shape: (1896130, 6)
TV Show Shape: (3482, 6)
Video Game Shape: (73123, 5)


In [6]:
# Per-category sample size: 2.5x the TV show row count, truncated to an int.
# It is passed as `n` to `.sample()` for the anime, book, movie, music and
# video game frames below; the TV show frame itself is only shuffled.
slicing = int(tv_show_df.shape[0] * 2.5)
slicing

8705

## Anime DataFrame

In [7]:
# Anime DataFrame
anime_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...","Action, Award Winning, Sci-Fi",8.75,Action,anime
1,Cowboy Bebop: The Movie,"Another day, another bounty‚Äîsuch is the life o...","Action, Sci-Fi",8.38,Action,anime
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...","Action, Adventure, Sci-Fi",8.22,Action,anime
3,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,"Action, Drama, Mystery, Supernatural",7.25,Action,anime
4,Beet the Vandel Buster,It is the dark century and the people are suff...,"Adventure, Fantasy, Supernatural",6.94,Adventure,anime


In [8]:
print(f"Anime Shape: {anime_df.shape}")

Anime Shape: (9206, 6)


In [9]:
# Draw `slicing` random anime rows (seeded, so the draw is repeatable) and renumber them from 0
anime_df = (
    anime_df
    .sample(n=slicing, random_state=42)
    .reset_index(drop=True)
)

In [10]:
# Keep just the text and its label; every other column is dropped
anime_df = anime_df[["description", "category"]]

In [11]:
anime_df.shape

(8705, 2)

In [12]:
anime_df.head()

Unnamed: 0,description,category
0,Freelance photographer Kouhei Morioka is trave...,anime
1,Based on original story by Kinoshita Renzo.,anime
2,Fireball takes place during the war between hu...,anime
3,"Although her name means ""snow white,"" Shirayuk...",anime
4,Music video for the song Hayan Geojismal by SU...,anime


## Book DataFrame

In [13]:
# Book DataFrame
book_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,The Hunger Games,Winning will make you famous. Losing means cer...,"Young Adult, Fiction, Science Fiction, Dystopi...",8.66,Young Adult,book
1,Harry Potter and the Order of the Phoenix,There is a door at the end of a silent corrido...,"Fantasy, Young Adult, Fiction",8.96,Fantasy,book
2,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical, Historical Fict...",8.54,Classics,book
3,Pride and Prejudice,¬´√à cosa ormai risaputa che a uno scapolo in po...,"Classics, Fiction, Romance",8.5,Classics,book
4,Twilight,About three things I was absolutely positive.F...,"Young Adult, Fantasy, Romance, Paranormal, Vam...",7.16,Young Adult,book


In [14]:
print(f"Book Shape: {book_df.shape}")

Book Shape: (44129, 6)


In [15]:
# Draw `slicing` random book rows (seeded, so the draw is repeatable) and renumber them from 0
book_df = (
    book_df
    .sample(n=slicing, random_state=42)
    .reset_index(drop=True)
)

In [16]:
# Keep just the text and its label; every other column is dropped
book_df = book_df[["description", "category"]]

In [17]:
book_df.shape

(8705, 2)

In [18]:
book_df.head()

Unnamed: 0,description,category
0,"""More than any other public figure of the eigh...",book
1,"Gene Baur, the cofounder and president of Farm...",book
2,A panda walks into a caf√©. He orders a sandwic...,book
3,"A rogue, Damien‚Äôs kicked about the country for...",book
4,Bored and restless in London's Restoration Cou...,book


## Movie DataFrame

In [19]:
# Movie DataFrame
movie_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,Black Panther: Wakanda Forever,The people of Wakanda fight to protect their h...,"Action, Adventure, Drama",6.9,Action,movie
1,Avatar: The Way of Water,Jake Sully lives with his newfound family form...,"Action, Adventure, Fantasy",7.8,Action,movie
2,Plane,A pilot finds himself caught in a war zone aft...,"Action, Thriller",6.5,Action,movie
3,Everything Everywhere All at Once,A middle-aged Chinese immigrant is swept up in...,"Action, Adventure, Comedy",8.0,Action,movie
4,Ant-Man and the Wasp: Quantumania,"Scott Lang and Hope Van Dyne, along with Hank ...","Action, Adventure, Comedy",6.6,Action,movie


In [20]:
print(f"Movie Shape: {movie_df.shape}")

Movie Shape: (112119, 6)


In [21]:
# Draw `slicing` random movie rows (seeded, so the draw is repeatable) and renumber them from 0
movie_df = (
    movie_df
    .sample(n=slicing, random_state=42)
    .reset_index(drop=True)
)

In [22]:
# Keep just the text and its label; every other column is dropped
movie_df = movie_df[["description", "category"]]

In [23]:
movie_df.shape

(8705, 2)

In [24]:
movie_df.head()

Unnamed: 0,description,category
0,Miles of Love is a story of how an engaged and...,movie
1,Michael returns home to Chicago from Operation...,movie
2,"The feature length indie cyber thriller ""black...",movie
3,"A reclusive, introverted singer-songwriter fal...",movie
4,This film dramatizes one of the most famous le...,movie


## Music DataFrame

In [25]:
# Music DataFrame
music_df.head()

Unnamed: 0,title,description,genres,views,primary_genre,category
0,Killa Cam,"\nKilla Cam, Killa Cam, Cam\nKilla Cam, Killa ...",rap,173166,rap,music
1,Can I Live,"\n\n\nYeah, hah, yeah, Roc-A-Fella\nWe invite ...",rap,468624,rap,music
2,Forgive Me Father,Maybe cause I'm eatin\nAnd these bastards fien...,rap,4743,rap,music
3,Down and Out,"\n\n\nUgh, Killa!\nBaby!\nKanye, this that 197...",rap,144404,rap,music
4,Fly In,"\nSo they ask me\n""Young boy\nWhat you gon' do...",rap,78271,rap,music


In [26]:
print(f"Music Shape: {music_df.shape}")

Music Shape: (1896130, 6)


In [27]:
# Draw `slicing` random music rows (seeded, so the draw is repeatable) and renumber them from 0
music_df = (
    music_df
    .sample(n=slicing, random_state=42)
    .reset_index(drop=True)
)

In [28]:
# Keep just the text and its label; every other column is dropped
music_df = music_df[["description", "category"]]

In [29]:
music_df.shape

(8705, 2)

In [30]:
music_df.head()

Unnamed: 0,description,category
0,Einen gekr√∂nten reien\nSang ich der frouwen mi...,music
1,"\nYo, I'm the Anti-Circle!\nOn the mad train l...",music
2,From a distant horizon she wanders my way\nWit...,music
3,◊õ◊ú ◊î◊†◊©◊ô◊ù ◊î◊û◊ì◊ï◊õ◊ê◊ï◊™\n◊©◊§◊ï◊ò◊ï◊™ ◊¢◊ú ◊î◊°◊ô◊®◊ô◊ù\n◊©◊ï◊û◊¢◊ï◊™ ◊û◊ô...,music
4,Was hat die Zeit aus dir gemacht\nZu viele Tr√§...,music


## TV Show DataFrame

In [31]:
# TV Show DataFrame
tv_show_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,The Three Stooges,The Three Stooges were an American vaudeville ...,"comedy, family",8.5,comedy,tv show
1,The Jack Benny Program,Laugh along with funnyman Jack Benny as he bri...,comedy,8.6,comedy,tv show
2,What's My Line?,Four panelists must determine guests' occupati...,"reality, family",8.5,reality,tv show
3,The Avengers,The Avengers is a British television series cr...,"scifi, action, crime, romance, thriller, comedy",8.3,scifi,tv show
4,Mister Rogers' Neighborhood,Mister Rogers' Neighborhood is an American chi...,"family, fantasy, music",8.7,family,tv show


In [32]:
print(f"TV Show Shape: {tv_show_df.shape}")

TV Show Shape: (3482, 6)


In [33]:
# Shuffle all TV show rows (frac=1 keeps every row; seeded for repeatability) and renumber them from 0
tv_show_df = (
    tv_show_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [34]:
# Keep just the text and its label; every other column is dropped
tv_show_df = tv_show_df[["description", "category"]]

In [35]:
tv_show_df.shape

(3482, 2)

In [36]:
tv_show_df.head()

Unnamed: 0,description,category
0,"Philip J. Fry, a pizza delivery boy, is accide...",tv show
1,"In this series, Tripadvisor challenges travele...",tv show
2,Follows the revenge story of a group of people...,tv show
3,The Incredible Hulk is an American animated te...,tv show
4,"The series revolves around Gabo, a soccer-lovi...",tv show


## Video Game DataFrame

In [37]:
# Video Game DataFrame
video_game_df.head()

Unnamed: 0,title,description,genres,primary_genre,category
0,Galactic Bowling,Galactic Bowling is an exaggerated and stylize...,"Casual,Indie,Sports",Casual,game
1,Train Bandit,THE LAW!! Looks to be a showdown atop a train....,"Action,Indie",Action,game
2,Jolt Project,Jolt Project: The army now has a new robotics ...,"Action,Adventure,Indie,Strategy",Action,game
3,Henosis‚Ñ¢,HENOSIS‚Ñ¢ is a mysterious 2D Platform Puzzler w...,"Adventure,Casual,Indie",Adventure,game
4,Two Weeks in Painland,ABOUT THE GAME Play as a hacker who has arrang...,"Adventure,Indie",Adventure,game


In [38]:
print(f"Video Game Shape: {video_game_df.shape}")

Video Game Shape: (73123, 5)


In [39]:
# Draw `slicing` random video game rows (seeded, so the draw is repeatable) and renumber them from 0
video_game_df = (
    video_game_df
    .sample(n=slicing, random_state=42)
    .reset_index(drop=True)
)

In [40]:
# Keep just the text and its label; every other column is dropped
video_game_df = video_game_df[["description", "category"]]

In [41]:
video_game_df.shape

(8705, 2)

In [42]:
video_game_df.head()

Unnamed: 0,description,category
0,‚ÄúTreasure Girl‚Äù is our new game with 2 beautif...,game
1,Race for big money while running away from the...,game
2,'This Bum-bo game! it about time Bum-bo got co...,game
3,Portal&trade; is a new single player game from...,game
4,Collect allies and weapons in parallel worlds ...,game


# Combine All The Dataframes

In [43]:
def get_combined_df(df_list, random_state=42):
    """Stack several DataFrames row-wise and return the rows in shuffled order.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to concatenate along the row axis.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for the shuffle. Defaults to 42,
        the value that used to be hard-coded here, so existing calls give
        the same ordering as before.

    Returns
    -------
    pandas.DataFrame
        Every row of the inputs, in shuffled order, with a fresh 0..n-1 index.
    """
    combined_df = pd.concat(df_list, axis=0)
    # frac=1 draws every row exactly once, i.e. a full shuffle; the old
    # per-frame index labels are discarded afterwards.
    combined_df = combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return combined_df

In [44]:
entertainment_df_list = [anime_df, book_df, movie_df, music_df, tv_show_df, video_game_df]

In [45]:
entertainment_df = get_combined_df(entertainment_df_list)

In [46]:
# De-duplicate on the description text, retaining the earliest occurrence of each
entertainment_df = entertainment_df.drop_duplicates(
    subset=["description"],
    keep="first",
)

In [47]:
def get_df_info(df):
    """Print and display a quick overview of ``df``.

    Shows, in order: the shape, the first rows, the column dtypes, the
    per-column null counts, the number of repeated values in the
    ``description`` column, and ``describe(include="all")``.

    ``df`` must have a ``description`` column. Nothing is returned.
    """
    print("Shape of the dataframe: ", df.shape)

    # Each table is built lazily so its heading is printed before the table is computed.
    sections = (
        ("The DataFrame: ", lambda: df.head()),
        ("All Data Types: ", lambda: df.dtypes),
        ("Null Values:", lambda: df.isna().sum()),
    )
    for heading, build_table in sections:
        print(heading)
        display(build_table())

    duplicate_count = df["description"].duplicated().sum()
    print("Duplicate Description Count: ", duplicate_count)

    print("DataFrame Details:")
    display(df.describe(include="all"))

In [48]:
get_df_info(entertainment_df)

Shape of the dataframe:  (46997, 2)
The DataFrame: 


Unnamed: 0,description,category
0,What could be worse than letting billions die?...,book
1,Ahora s√©\nPerdemos tiempo\nEn peleas sin raz√≥n...,music
2,Jorge Luis Borges declared The Invention of Mo...,book
3,ÿßŸÑŸÉÿ™ÿßÿ® ŸÖÿ™ŸàŸÅÿ± ŸÑŸÑÿ™ÿ≠ŸÖŸäŸÑ ÿπŸÑŸâ ÿßŸÑŸÖŸàŸÇÿπ.Ÿàÿ≥ÿßŸàÿ≥ ÿßŸÑŸÜŸéŸëŸÅŸíÿ≥...,book
4,Thee devil shall roam the earth\nAnd plague an...,music


All Data Types: 


description    object
category       object
dtype: object

Null Values:


description    0
category       0
dtype: int64

Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,description,category
count,46997,46997
unique,46997,6
top,What could be worse than letting billions die?...,book
freq,1,8705


# Remove Non-English Descriptions

In [49]:
# Language Detect Method
def detect_language(text):
    """Return True when ``text`` is detected as English, otherwise False.

    Any input the detector cannot handle (non-string values, blank text, or
    text on which ``detect`` raises) counts as "not English", so this is safe
    to use with ``Series.apply`` as a row filter.

    NOTE(review): the langdetect README states that detection is
    non-deterministic unless ``DetectorFactory.seed`` is set; consider seeding
    it before filtering so the surviving row count is reproducible.
    """
    # Nothing to detect in a non-string (e.g. NaN) or whitespace-only value.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == "en"
    except Exception:
        # Detection failures mean "not English". Unlike a bare `except:`, this
        # lets KeyboardInterrupt/SystemExit through, so a long-running filter
        # can still be interrupted.
        return False

In [50]:
# Keep only the rows whose description is detected as English
entertainment_english_df = entertainment_df.loc[
    entertainment_df["description"].apply(detect_language)
]

In [51]:
# Renumber rows 0..n-1 so the filtered frame has a contiguous integer index
entertainment_english_df = entertainment_english_df.reset_index(drop=True)

In [52]:
get_df_info(entertainment_english_df)

Shape of the dataframe:  (42540, 2)
The DataFrame: 


Unnamed: 0,description,category
0,What could be worse than letting billions die?...,book
1,Jorge Luis Borges declared The Invention of Mo...,book
2,Thee devil shall roam the earth\nAnd plague an...,music
3,"(Hook)\nThem niggas over there, never play fai...",music
4,Jigsaw Puzzle: Beach Season ‚Äì the journey to t...,game


All Data Types: 


description    object
category       object
dtype: object

Null Values:


description    0
category       0
dtype: int64

Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,description,category
count,42540,42540
unique,42540,6
top,What could be worse than letting billions die?...,movie
freq,1,8676


In [53]:
entertainment_english_df.head()

Unnamed: 0,description,category
0,What could be worse than letting billions die?...,book
1,Jorge Luis Borges declared The Invention of Mo...,book
2,Thee devil shall roam the earth\nAnd plague an...,music
3,"(Hook)\nThem niggas over there, never play fai...",music
4,Jigsaw Puzzle: Beach Season ‚Äì the journey to t...,game


# Remove Unnecessary Characters

In [54]:
# Getting Clean Text Method
def get_clean_text(text):
    """Replace line breaks, non-breaking spaces and a few symbols with single spaces.

    Each unwanted token becomes a space, then every run of whitespace is
    collapsed to one space. Leading or trailing whitespace is reduced to a
    single space rather than stripped.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Written as escapes so the literals stay correct however the notebook
    # file itself is encoded or re-encoded.
    unwanted_tokens = (
        "\n",
        "\U0001F57A\U0001F3FE",  # man-dancing emoji followed by a skin-tone modifier
        "\r",
        "\u00a0",  # non-breaking space
        "\u2191",  # upwards arrow
        "\u2193",  # downwards arrow
        "\u2190",  # leftwards arrow
        "\u2192",  # rightwards arrow
    )
    clean_text = text
    for token in unwanted_tokens:
        clean_text = clean_text.replace(token, " ")
    # Squash each run of whitespace (including the spaces just inserted) into one space.
    clean_text = re.sub(r"\s+", " ", clean_text)
    return clean_text

In [55]:
entertainment_english_df["description"][0]

"What could be worse than letting billions die?\r\r\n\r\r\nIn the near future, to escape the crush and clutter of a packed and polluted Earth, the world's elite flock to Atopia, an enormous corporate-owned artificial island in the Pacific Ocean. It is there that Dr. Patricia Killiam rushes to perfect the ultimate in virtual reality: a program to save the ravaged Earth from mankind's insatiable appetite for natural resources.\xa0\r\r\nThe Atopia Chronicles (Book 1 of the Atopia series) is the tale of mankind's dark slide across the apocalypse as humans and machines merge in a world teetering on the brink of ecological ruin.\r\r\n"

In [56]:
get_clean_text(entertainment_english_df["description"][0])

"What could be worse than letting billions die? In the near future, to escape the crush and clutter of a packed and polluted Earth, the world's elite flock to Atopia, an enormous corporate-owned artificial island in the Pacific Ocean. It is there that Dr. Patricia Killiam rushes to perfect the ultimate in virtual reality: a program to save the ravaged Earth from mankind's insatiable appetite for natural resources. The Atopia Chronicles (Book 1 of the Atopia series) is the tale of mankind's dark slide across the apocalypse as humans and machines merge in a world teetering on the brink of ecological ruin. "

In [57]:
# Clean every description in a single `apply` pass instead of a Python loop of
# per-row `.loc` writes. `assign` returns a new frame and does not depend on
# the index being exactly 0..n-1.
entertainment_english_df = entertainment_english_df.assign(
    description=entertainment_english_df["description"].apply(get_clean_text)
)

In [61]:
entertainment_english_df.head()

Unnamed: 0,description,category
0,What could be worse than letting billions die?...,book
1,Jorge Luis Borges declared The Invention of Mo...,book
2,Thee devil shall roam the earth And plague and...,music
3,"(Hook) Them niggas over there, never play fair...",music
4,Jigsaw Puzzle: Beach Season ‚Äì the journey to t...,game


In [63]:
# De-duplicate again: cleaning can make descriptions that differed only in
# whitespace or symbols identical. The earliest occurrence of each is kept.
entertainment_english_df = entertainment_english_df.drop_duplicates(
    subset=["description"],
    keep="first",
)

In [64]:
get_df_info(entertainment_english_df)

Shape of the dataframe:  (42535, 2)
The DataFrame: 


Unnamed: 0,description,category
0,What could be worse than letting billions die?...,book
1,Jorge Luis Borges declared The Invention of Mo...,book
2,Thee devil shall roam the earth And plague and...,music
3,"(Hook) Them niggas over there, never play fair...",music
4,Jigsaw Puzzle: Beach Season ‚Äì the journey to t...,game


All Data Types: 


description    object
category       object
dtype: object

Null Values:


description    0
category       0
dtype: int64

Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,description,category
count,42535,42535
unique,42535,6
top,What could be worse than letting billions die?...,movie
freq,1,8676


# DataFrame To CSV

In [65]:
def write_dataframe_to_csv(path, dataframe):
    """Write ``dataframe`` to ``path`` as CSV unless something already exists there.

    An existing file is never overwritten; a message is printed instead.
    Missing parent directories are created first, so the first save into a
    new output folder does not fail.

    Parameters
    ----------
    path : str
        Destination CSV path.
    dataframe : pandas.DataFrame
        Frame to save; the index is not written.
    """
    if os.path.exists(path):
        print(f"The file already exists ...! [Find the file in the location '{path}']")
        return

    # `to_csv` does not create directories; a bare filename has no parent to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    dataframe.to_csv(path, index=False)
    print("Dataframe saved successfully: ", path)

In [68]:
file_path = "../../data/processed/category/entertainment_category.csv"

write_dataframe_to_csv(file_path, entertainment_english_df)

Dataframe saved successfully:  ../../data/processed/category/entertainment_category.csv


In [69]:
# Read the saved CSV back from disk and summarise it, as a check that the
# written file matches the frame that was saved.
ddf = pd.read_csv("../../data/processed/category/entertainment_category.csv")
get_df_info(ddf)

Shape of the dataframe:  (42535, 2)
The DataFrame: 


Unnamed: 0,description,category
0,What could be worse than letting billions die?...,book
1,Jorge Luis Borges declared The Invention of Mo...,book
2,Thee devil shall roam the earth And plague and...,music
3,"(Hook) Them niggas over there, never play fair...",music
4,Jigsaw Puzzle: Beach Season ‚Äì the journey to t...,game


All Data Types: 


description    object
category       object
dtype: object

Null Values:


description    0
category       0
dtype: int64

Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,description,category
count,42535,42535
unique,42535,6
top,What could be worse than letting billions die?...,movie
freq,1,8676
