# Title Preparation

# Setting Up Necessary Things

In [1]:
# Jupyter Notebook Magic Command - Auto Reloading
%reload_ext autoreload
%autoreload 2

# Jupyter Notebook Magic Command - Inline Plotting
%matplotlib inline

In [2]:
# Ignore All Warnings
import warnings
warnings.filterwarnings("ignore")

# Necessary Imports

In [3]:
import os
from IPython.display import display
import re

# Data
import pandas as pd

# Language
from langdetect import detect

# Data Cleaning

In [4]:
# Read each processed category CSV into its own DataFrame
folder = "../../data/processed/"

anime_df = pd.read_csv(f"{folder}anime/anime_rating.csv")
book_df = pd.read_csv(f"{folder}book/book_rating.csv")
movie_df = pd.read_csv(f"{folder}movie/movie_rating.csv")
music_df = pd.read_csv(f"{folder}music/music_views.csv")
tv_show_df = pd.read_csv(f"{folder}tv_show/tv_show_rating.csv")
video_game_df = pd.read_csv(f"{folder}video_game/video_game.csv")

In [5]:
# Report the (rows, columns) shape of every loaded DataFrame, one line each
print(
    "\n".join(
        f"{label} Shape: {frame.shape}"
        for label, frame in (
            ("Anime", anime_df),
            ("Book", book_df),
            ("Movie", movie_df),
            ("Music", music_df),
            ("TV Show", tv_show_df),
            ("Video Game", video_game_df),
        )
    )
)

Anime Shape: (9206, 6)
Book Shape: (44129, 6)
Movie Shape: (112119, 6)
Music Shape: (1896130, 6)
TV Show Shape: (3482, 6)
Video Game Shape: (73123, 5)


In [11]:
# Per-category sample size: 2.6x the TV show row count, truncated to an int.
# It is passed as `n` to DataFrame.sample for the anime, book, movie, music
# and video game frames; the TV show frame itself is only shuffled, not sampled.
# NOTE(review): 2.6 is an unexplained magic number — presumably chosen so the
# result stays below the smallest sampled frame (anime, 9206 rows); confirm
# and consider naming it as a constant.
slicing = int(tv_show_df.shape[0] * 2.6)
slicing

9053

## Utility Methods

In [12]:
def get_df_info(df):
    """Print and display a quick overview of a DataFrame.

    Shows, in order: the shape, the first rows, the column dtypes, the
    per-column null counts, the number of duplicated titles, the number of
    duplicated descriptions, and ``describe(include="all")``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It must contain ``"title"`` and ``"description"``
        columns; a missing column raises ``KeyError``.

    Returns
    -------
    None
        Everything is emitted through ``print`` / ``display``.
    """
    print("Shape of the dataframe: ", df.shape)
    print("The DataFrame: ")
    display(df.head())
    print("All Data Types: ")
    display(df.dtypes)
    print("Null Values:")
    display(df.isna().sum())
    # Each label names the column it actually counts; the title count was
    # previously printed under the "Description" label.
    print("Duplicate Title Count: ", df["title"].duplicated().sum())
    print("Duplicate Description Count: ", df["description"].duplicated().sum())
    print("DataFrame Details:")
    display(df.describe(include="all"))

### English Language Filter

In [13]:
# Language Detect Method
def detect_language(text):
    """Return True when ``text`` is detected as English, False otherwise.

    Parameters
    ----------
    text : str
        Text to classify. Non-string values (e.g. NaN from a missing cell)
        and empty or whitespace-only strings are reported as not English.

    Returns
    -------
    bool
        True only if ``detect`` labels the text ``"en"``; False for any other
        language and whenever detection fails.

    Notes
    -----
    NOTE(review): langdetect documents its algorithm as non-deterministic
    unless ``DetectorFactory.seed`` is set before the first detection, so
    repeated runs may keep slightly different rows — confirm whether a seed
    should be set in the notebook's configuration cell.
    """
    # Nothing to detect in a missing or blank value; answer without calling the detector.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == "en"
    except Exception:
        # Best-effort filter: text the detector cannot process is treated as
        # non-English. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit through, so a long .apply run can
        # still be interrupted.
        return False

In [14]:
# Get Only English Language
def get_only_english_corpus(dataframe):
    """Keep only the rows whose title and description are both detected as English.

    The title filter runs first, so descriptions are only checked for rows
    that already passed it. The index is renumbered from 0 after each filter.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``"title"`` and ``"description"`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh 0..n-1 index.
    """
    for column in ("title", "description"):
        is_english = dataframe[column].apply(detect_language)
        dataframe = dataframe[is_english].reset_index(drop=True)
    return dataframe

### Unnecessary Characters Filter

In [15]:
# Getting Clean Text Method
def get_clean_text(text):
    """Replace unwanted characters with spaces and collapse whitespace runs.

    Newlines, carriage returns, the listed emoji and the four arrow symbols
    are each turned into a space; every run of whitespace is then squeezed
    into a single space. Leading/trailing whitespace is collapsed to one
    space, not stripped.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    unwanted_tokens = ("\n", "🕺🏾", "\r", " ", "↑", "↓", "←", "→")
    for token in unwanted_tokens:
        text = text.replace(token, " ")
    return re.sub(r"\s+", " ", text)

In [16]:
# Get Clean Corpus
def get_clean_df(dataframe):
    """Return a copy of ``dataframe`` with cleaned title and description text.

    ``get_clean_text`` is applied to every value of the ``"title"`` and
    ``"description"`` columns. The work is done column-wise, so it does not
    depend on the frame having a unique 0..n-1 index (the previous row loop
    looked rows up by label and broke on filtered or concatenated frames).
    The caller's frame is left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with string ``"title"`` and ``"description"`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two text columns cleaned and the index
        renumbered from 0.
    """
    # Work on a copy so the input frame is not modified as a side effect.
    cleaned = dataframe.copy()
    for column in ("title", "description"):
        cleaned[column] = cleaned[column].map(get_clean_text)
    return cleaned.reset_index(drop=True)

## Anime DataFrame

In [17]:
# Anime DataFrame
anime_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...","Action, Award Winning, Sci-Fi",8.75,Action,anime
1,Cowboy Bebop: The Movie,"Another day, another bounty—such is the life o...","Action, Sci-Fi",8.38,Action,anime
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...","Action, Adventure, Sci-Fi",8.22,Action,anime
3,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,"Action, Drama, Mystery, Supernatural",7.25,Action,anime
4,Beet the Vandel Buster,It is the dark century and the people are suff...,"Adventure, Fantasy, Supernatural",6.94,Adventure,anime


In [18]:
print(f"Anime Shape: {anime_df.shape}")

Anime Shape: (9206, 6)


In [19]:
# Keep the title, description, genres and category columns
# (drops rating and primary_genre)
anime_df = anime_df.loc[:, ["title", "description", "genres", "category"]]

In [20]:
print(f"Anime Shape: {anime_df.shape}")

Anime Shape: (9206, 4)


In [21]:
# Get Random Slicing Data From Anime DataFrame
anime_df = anime_df.sample(n = slicing, random_state=42).reset_index(drop=True)

In [22]:
get_df_info(anime_df)

Shape of the dataframe:  (9053, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Tsukuyomi: Moon Phase,Freelance photographer Kouhei Morioka is trave...,"Comedy, Fantasy, Romance",anime
1,Geba Geba Show Time!,Based on original story by Kinoshita Renzo.,Comedy,anime
2,Fireball,Fireball takes place during the war between hu...,"Comedy, Sci-Fi",anime
3,Snow White with the Red Hair,"Although her name means ""snow white,"" Shirayuk...","Drama, Fantasy, Romance",anime
4,Tell Me Baby,Music video for the song Hayan Geojismal by SU...,UNKNOWN,anime


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title          0
description    0
genres         0
category       0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,9053,9053,9053,9053
unique,9053,9053,816,1
top,Tsukuyomi: Moon Phase,Freelance photographer Kouhei Morioka is trave...,UNKNOWN,anime
freq,1,1,930,9053


## Book DataFrame

In [23]:
# Book DataFrame
book_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,The Hunger Games,Winning will make you famous. Losing means cer...,"Young Adult, Fiction, Science Fiction, Dystopi...",8.66,Young Adult,book
1,Harry Potter and the Order of the Phoenix,There is a door at the end of a silent corrido...,"Fantasy, Young Adult, Fiction",8.96,Fantasy,book
2,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical, Historical Fict...",8.54,Classics,book
3,Pride and Prejudice,«È cosa ormai risaputa che a uno scapolo in po...,"Classics, Fiction, Romance",8.5,Classics,book
4,Twilight,About three things I was absolutely positive.F...,"Young Adult, Fantasy, Romance, Paranormal, Vam...",7.16,Young Adult,book


In [24]:
print(f"Book Shape: {book_df.shape}")

Book Shape: (44129, 6)


In [25]:
# Keep the title, description, genres and category columns
# (drops rating and primary_genre)
book_df = book_df.loc[:, ["title", "description", "genres", "category"]]

In [26]:
print(f"Book Shape: {book_df.shape}")

Book Shape: (44129, 4)


In [27]:
# Get Random Slicing Data From Book DataFrame
book_df = book_df.sample(n = slicing, random_state=42).reset_index(drop=True)

In [28]:
get_df_info(book_df)

Shape of the dataframe:  (9053, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Tom Paine: A Political Life,"""More than any other public figure of the eigh...","Biography, Politics, History, Nonfiction, Mili...",book
1,Living the Farm Sanctuary Life: How to Eat Hea...,"Gene Baur, the cofounder and president of Farm...","Nonfiction, Food and Drink, Food, Food and Dri...",book
2,"Eats, Shoots & Leaves: The Zero Tolerance Appr...",A panda walks into a café. He orders a sandwic...,"Nonfiction, Language, Writing, Humanities, Lan...",book
3,Betrayed: Book Two - The Road to Redemption,"A rogue, Damien’s kicked about the country for...","Fantasy, Paranormal, Romance, Paranormal Roman...",book
4,Frenchman's Creek,Bored and restless in London's Restoration Cou...,"Classics, Historical, Historical Fiction, Fict...",book


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title          0
description    0
genres         0
category       0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,9053,9053,9053,9053
unique,9053,9053,7201,1
top,Tom Paine: A Political Life,"""More than any other public figure of the eigh...",Fiction,book
freq,1,1,42,9053


## Movie DataFrame

In [29]:
# Movie DataFrame
movie_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,Black Panther: Wakanda Forever,The people of Wakanda fight to protect their h...,"Action, Adventure, Drama",6.9,Action,movie
1,Avatar: The Way of Water,Jake Sully lives with his newfound family form...,"Action, Adventure, Fantasy",7.8,Action,movie
2,Plane,A pilot finds himself caught in a war zone aft...,"Action, Thriller",6.5,Action,movie
3,Everything Everywhere All at Once,A middle-aged Chinese immigrant is swept up in...,"Action, Adventure, Comedy",8.0,Action,movie
4,Ant-Man and the Wasp: Quantumania,"Scott Lang and Hope Van Dyne, along with Hank ...","Action, Adventure, Comedy",6.6,Action,movie


In [30]:
print(f"Movie Shape: {movie_df.shape}")

Movie Shape: (112119, 6)


In [31]:
# Keep the title, description, genres and category columns
# (drops rating and primary_genre)
movie_df = movie_df.loc[:, ["title", "description", "genres", "category"]]

In [32]:
# Get Random Slicing Data From Movie DataFrame
movie_df = movie_df.sample(n = slicing, random_state=42).reset_index(drop=True)

In [33]:
movie_df.shape

(9053, 4)

In [34]:
get_df_info(movie_df)

Shape of the dataframe:  (9053, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Miles of Love,Miles of Love is a story of how an engaged and...,"Comedy, Drama, Music",movie
1,Farewell Darkness,Michael returns home to Chicago from Operation...,"Drama, War",movie
2,Blackhats,"The feature length indie cyber thriller ""black...","Action, Drama, Thriller",movie
3,Mitthye Premer Gaan,"A reclusive, introverted singer-songwriter fal...","Music, Romance",movie
4,El sombrerón,This film dramatizes one of the most famous le...,Horror,movie


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title          0
description    0
genres         0
category       0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,9053,9053,9053,9053
unique,9053,9053,611,1
top,Miles of Love,Miles of Love is a story of how an engaged and...,"Drama, Romance",movie
freq,1,1,482,9053


## Music DataFrame

In [35]:
# Music DataFrame
music_df.head()

Unnamed: 0,title,description,genres,views,primary_genre,category
0,Killa Cam,"\nKilla Cam, Killa Cam, Cam\nKilla Cam, Killa ...",rap,173166,rap,music
1,Can I Live,"\n\n\nYeah, hah, yeah, Roc-A-Fella\nWe invite ...",rap,468624,rap,music
2,Forgive Me Father,Maybe cause I'm eatin\nAnd these bastards fien...,rap,4743,rap,music
3,Down and Out,"\n\n\nUgh, Killa!\nBaby!\nKanye, this that 197...",rap,144404,rap,music
4,Fly In,"\nSo they ask me\n""Young boy\nWhat you gon' do...",rap,78271,rap,music


In [36]:
print(f"Music Shape: {music_df.shape}")

Music Shape: (1896130, 6)


In [37]:
# Keep the title, description, genres and category columns
# (drops views and primary_genre)
music_df = music_df.loc[:, ["title", "description", "genres", "category"]]

In [38]:
# Get Random Slicing Data From Music DataFrame
music_df = music_df.sample(n = slicing, random_state=42).reset_index(drop=True)

In [39]:
music_df.shape

(9053, 4)

In [40]:
get_df_info(music_df)

Shape of the dataframe:  (9053, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Gonna Scare You,Einen gekrönten reien\nSang ich der frouwen mi...,rap,music
1,Belly Dance,"\nYo, I'm the Anti-Circle!\nOn the mad train l...",rap,music
2,Basic Delusions,From a distant horizon she wanders my way\nWit...,rock,music
3,Fuzzy 95,כל הנשים המדוכאות\nשפוטות על הסירים\nשומעות מי...,rock,music
4,The North Stands for Nothing,Was hat die Zeit aus dir gemacht\nZu viele Trä...,rock,music


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title          0
description    0
genres         0
category       0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,9053,9053,9053,9053
unique,9053,9053,6,1
top,Gonna Scare You,Einen gekrönten reien\nSang ich der frouwen mi...,pop,music
freq,1,1,3675,9053


## TV Show DataFrame

In [41]:
# TV Show DataFrame
tv_show_df.head()

Unnamed: 0,title,description,genres,rating,primary_genre,category
0,The Three Stooges,The Three Stooges were an American vaudeville ...,"comedy, family",8.5,comedy,tv show
1,The Jack Benny Program,Laugh along with funnyman Jack Benny as he bri...,comedy,8.6,comedy,tv show
2,What's My Line?,Four panelists must determine guests' occupati...,"reality, family",8.5,reality,tv show
3,The Avengers,The Avengers is a British television series cr...,"scifi, action, crime, romance, thriller, comedy",8.3,scifi,tv show
4,Mister Rogers' Neighborhood,Mister Rogers' Neighborhood is an American chi...,"family, fantasy, music",8.7,family,tv show


In [42]:
print(f"TV Show Shape: {tv_show_df.shape}")

TV Show Shape: (3482, 6)


In [43]:
# Keep the title, description, genres and category columns
# (drops rating and primary_genre)
tv_show_df = tv_show_df.loc[:, ["title", "description", "genres", "category"]]

In [44]:
# Random Shuffle TV Show DataFrame
tv_show_df = tv_show_df.sample(frac = 1, random_state=42).reset_index(drop=True)

In [45]:
tv_show_df.shape

(3482, 4)

In [46]:
get_df_info(tv_show_df)

Shape of the dataframe:  (3482, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Futurama,"Philip J. Fry, a pizza delivery boy, is accide...","animation, comedy, scifi, action",tv show
1,The Wanderer,"In this series, Tripadvisor challenges travele...",documentation,tv show
2,Payback: Money and Power,Follows the revenge story of a group of people...,drama,tv show
3,The Incredible Hulk,The Incredible Hulk is an American animated te...,"action, drama, fantasy, scifi, animation",tv show
4,Once,"The series revolves around Gabo, a soccer-lovi...","comedy, sport, family",tv show


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title           0
description     0
genres         34
category        0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,3482,3482,3448,3482
unique,3482,3482,1346,1
top,Futurama,"Philip J. Fry, a pizza delivery boy, is accide...",documentation,tv show
freq,1,1,221,3482


## Video Game DataFrame

In [47]:
# Video Game DataFrame
video_game_df.head()

Unnamed: 0,title,description,genres,primary_genre,category
0,Galactic Bowling,Galactic Bowling is an exaggerated and stylize...,"Casual,Indie,Sports",Casual,game
1,Train Bandit,THE LAW!! Looks to be a showdown atop a train....,"Action,Indie",Action,game
2,Jolt Project,Jolt Project: The army now has a new robotics ...,"Action,Adventure,Indie,Strategy",Action,game
3,Henosis™,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"Adventure,Casual,Indie",Adventure,game
4,Two Weeks in Painland,ABOUT THE GAME Play as a hacker who has arrang...,"Adventure,Indie",Adventure,game


In [48]:
print(f"Video Game Shape: {video_game_df.shape}")

Video Game Shape: (73123, 5)


In [49]:
# Keep the title, description, genres and category columns
# (drops primary_genre)
video_game_df = video_game_df.loc[:, ["title", "description", "genres", "category"]]

In [50]:
# Get Random Slicing Data From Video Game DataFrame
video_game_df = video_game_df.sample(n = slicing, random_state=42).reset_index(drop=True)

In [51]:
video_game_df.shape

(9053, 4)

In [53]:
get_df_info(video_game_df)

Shape of the dataframe:  (9053, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Treasure Girl 3D,“Treasure Girl” is our new game with 2 beautif...,"Casual,Indie,RPG,Simulation",game
1,Mini Car Money Chase,Race for big money while running away from the...,"Action,Racing,Sports",game
2,The Legend of Bum-Bo,'This Bum-bo game! it about time Bum-bo got co...,"Adventure,Indie,Strategy",game
3,Portal,Portal&trade; is a new single player game from...,Action,game
4,Rescue To The PARALLEL,Collect allies and weapons in parallel worlds ...,Action,game


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title          0
description    0
genres         0
category       0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,9053,9053,9053,9053
unique,9053,9053,846,1
top,Treasure Girl 3D,“Treasure Girl” is our new game with 2 beautif...,"Casual,Indie",game
freq,1,1,529,9053


# Combine All The Dataframes

In [54]:
def get_combined_df(df_list):
    """Stack several DataFrames row-wise and shuffle the result.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to concatenate along the row axis.

    Returns
    -------
    pandas.DataFrame
        All rows of the inputs in a shuffled order (fixed seed 42) with the
        index renumbered from 0.
    """
    stacked = pd.concat(df_list, axis=0)
    # frac=1 samples every row, i.e. a full shuffle; the seed keeps it repeatable.
    shuffled = stacked.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

In [55]:
entertainment_df_list = [anime_df, book_df, movie_df, music_df, tv_show_df, video_game_df]

In [56]:
entertainment_df = get_combined_df(entertainment_df_list)

In [57]:
# Remove rows with a repeated title, then rows with a repeated description,
# keeping the first occurrence each time
entertainment_df = (
    entertainment_df
    .drop_duplicates(subset="title", keep="first")
    .drop_duplicates(subset="description", keep="first")
)

In [58]:
get_df_info(entertainment_df)

Shape of the dataframe:  (48120, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Season of Blood: A Rwandan Journey,When President Habyarimana’s jet was shot down...,"Nonfiction, Cultural, Africa, History, Eastern...",book
1,Echo Burning,Hitching rides is an unreliable mode of transp...,"Thriller, Fiction, Mystery, Mystery, Crime",book
2,Yu-Gi-Oh! Arc-V,Yu-Gi-Oh! Arc-V is the fourth main spin-off of...,"action, scifi, comedy, fantasy, animation",tv show
3,Krazanas bude,A social drama about the careless exploitation...,"Drama, Crime",movie
4,Cells at Work Special,"For the regular cells of the human body, life ...",Comedy,anime


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title           0
description     0
genres         34
category        0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,48120,48120,48086,48120
unique,48120,48120,10508,6
top,Season of Blood: A Rwandan Journey,When President Habyarimana’s jet was shot down...,pop,music
freq,1,1,3674,9048


# Remove Non-English Rows

In [59]:
# Keep only rows whose title is detected as English; detect_language returns
# False when detection fails, so such rows are dropped as well.
# NOTE(review): this cell and the three after it repeat the steps of
# get_only_english_corpus, which is defined above but never called.
entertainment_df = entertainment_df[entertainment_df["title"].apply(detect_language)]

In [60]:
entertainment_df = entertainment_df.reset_index(drop=True)

In [61]:
# Keep only rows whose description is detected as English; detect_language
# returns False when detection fails, so such rows are dropped as well.
entertainment_df = entertainment_df[entertainment_df["description"].apply(detect_language)]

In [62]:
entertainment_df = entertainment_df.reset_index(drop=True)

In [66]:
# Drop every row that has a missing value in any column
entertainment_df = entertainment_df.dropna(axis="index", how="any")

In [67]:
get_df_info(entertainment_df)

Shape of the dataframe:  (24877, 4)
The DataFrame: 


Unnamed: 0,title,description,genres,category
0,Season of Blood: A Rwandan Journey,When President Habyarimana’s jet was shot down...,"Nonfiction, Cultural, Africa, History, Eastern...",book
1,Echo Burning,Hitching rides is an unreliable mode of transp...,"Thriller, Fiction, Mystery, Mystery, Crime",book
2,Cells at Work Special,"For the regular cells of the human body, life ...",Comedy,anime
3,Make Your Move,A pair of star-crossed dancers in New York fin...,"Drama, Musical, Romance",movie
4,Guilty Parade,Guilty Parade is an interactive Visual Novel w...,Adventure,game


All Data Types: 


title          object
description    object
genres         object
category       object
dtype: object

Null Values:


title          0
description    0
genres         0
category       0
dtype: int64

Duplicate Description Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,title,description,genres,category
count,24877,24877,24877,24877
unique,24877,24877,7328,6
top,Season of Blood: A Rwandan Journey,When President Habyarimana’s jet was shot down...,pop,book
freq,1,1,1004,5877


In [68]:
entertainment_df["category"].value_counts()

category
book       5877
anime      5609
game       4628
movie      4215
music      2557
tv show    1991
Name: count, dtype: int64

# DataFrame To CSV

In [69]:
def write_dataframe_to_csv(path, dataframe):
    """Write ``dataframe`` to ``path`` as CSV unless a file is already there.

    An existing file is never overwritten; a message is printed instead.
    Missing parent directories are created before writing, so a fresh
    checkout without the output folder does not fail.

    Parameters
    ----------
    path : str or os.PathLike
        Destination CSV path.
    dataframe : pandas.DataFrame
        Frame to save. The index is not written.

    Returns
    -------
    None
    """
    if os.path.exists(path):
        print(f"The file already exists ...! [Find the file in the location '{path}']")
        return

    # to_csv does not create directories; make sure the target folder exists.
    # A bare filename has an empty dirname, which os.makedirs would reject.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    dataframe.to_csv(path, index=False)
    print("Dataframe saved successfully: ", path)

In [70]:
file_path = "../../data/processed/title/title_generation.csv"

In [71]:
write_dataframe_to_csv(file_path, entertainment_df)

Dataframe saved successfully:  ../../data/processed/title/title_generation.csv
