In [31]:
import pandas as pd

# Load the raw dataset.
df = pd.read_csv('anime_dataset.csv')

# Keep only the columns used by the rest of the notebook. The explicit .copy()
# makes `df` an independent DataFrame rather than a selection derived from the
# raw frame, so assigning new columns to it later cannot raise pandas'
# SettingWithCopyWarning.
columns_to_keep = ['mal_id', 'title', 'synopsis', 'genres', 'themes']
df = df[columns_to_keep].copy()

# Save the reduced frame to a new CSV file (without the index column).
df.to_csv('anime_dataset_filtered.csv', index=False)

In [32]:
# Merge the genre and theme labels of every row into one comma-separated
# string. Missing values are turned into '' first and empty pieces are skipped,
# so a row with neither genres nor themes ends up with an empty string.
label_columns = df[['genres', 'themes']].fillna('')
df['combined'] = label_columns.apply(
    lambda row: ', '.join(label for label in row if label),
    axis=1,
)

# The two source columns are redundant once 'combined' exists.
df = df.drop(columns=['genres', 'themes'])

# Show the first few rows to verify the result.
print(df[['title', 'combined']].head())

                                title                                           combined
0                   Sousou no Frieren                          Adventure, Drama, Fantasy
1    Fullmetal Alchemist: Brotherhood        Action, Adventure, Drama, Fantasy, Military
2                         Steins;Gate  Drama, Sci-Fi, Suspense, Psychological, Time T...
3  Shingeki no Kyojin Season 3 Part 2  Action, Drama, Suspense, Gore, Military, Survival
4                            Gintama°  Action, Comedy, Sci-Fi, Gag Humor, Historical,...


In [33]:
# Checkpoint: write the current frame to CSV (index column omitted),
# then preview its first rows as the cell output.
df.to_csv('anime_dataset_filtered_2.csv', index=False)
df.head()

Unnamed: 0,mal_id,title,synopsis,combined
0,52991,Sousou no Frieren,During their decade-long quest to defeat the D...,"Adventure, Drama, Fantasy"
1,5114,Fullmetal Alchemist: Brotherhood,After a horrific alchemy experiment goes wrong...,"Action, Adventure, Drama, Fantasy, Military"
2,9253,Steins;Gate,Eccentric scientist Rintarou Okabe has a never...,"Drama, Sci-Fi, Suspense, Psychological, Time T..."
3,38524,Shingeki no Kyojin Season 3 Part 2,Seeking to restore humanity's diminishing hope...,"Action, Drama, Suspense, Gore, Military, Survival"
4,28977,Gintama°,"Gintoki, Shinpachi, and Kagura return as the f...","Action, Comedy, Sci-Fi, Gag Humor, Historical,..."


In [34]:
# Count the missing values in each column.
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Flag every row that has at least one missing value; the mask is reused below
# instead of being recomputed.
has_missing = df.isnull().any(axis=1)

# Number of rows with at least one missing value. This was previously computed
# but never shown, so report it explicitly.
rows_with_missing_values = has_missing.sum()
print("Number of rows with missing values:", rows_with_missing_values)

# Show up to 5 of the rows with missing values.
rows_with_missing_values_list = df[has_missing].head(5)
print("Rows with missing values:\n", rows_with_missing_values_list)

# Remove rows with no synopsis (rebinding `df` rather than mutating in place).
df = df.dropna(subset=['synopsis'])

# Verify the result.
print("Number of rows after removing missing synopsis:", len(df))

Missing values in each column:
 mal_id         0
title          0
synopsis    4716
combined       0
dtype: int64
Rows with missing values:
      mal_id                                              title synopsis                                           combined
751   58143                          Aishang Ta de Liyou Extra      NaN                                Romance, Adult Cast
779   53499     Aikatsu! 10th Story: Mirai e no Starway (2023)      NaN                              Idols (Female), Music
860   57183                        Wu Dong Qian Kun 4th Season      NaN  Action, Adventure, Fantasy, Historical, Martia...
879   59939                 Tunshi Xingkong Movie: Xueluo Dalu      NaN   Action, Adventure, Fantasy, Sci-Fi, Martial Arts
987   55993  Fanren Xiu Xian Chuan: Xinghai Feichi Pian Xuz...      NaN  Action, Adventure, Fantasy, Historical, Martia...
Number of rows after removing missing synopsis: 16847


In [35]:
# Checkpoint: write the frame (rows without a synopsis already dropped) to CSV, index omitted.
df.to_csv('anime_dataset_filtered_3.csv', index=False)

In [36]:
import pandas as pd

# Define a function to determine if a synopsis is too short or uninformative
def is_informative(synopsis, min_length=20):
    """Return True when ``synopsis`` contains at least ``min_length`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``, so
    ``min_length`` is a word count, not a character count.

    Args:
        synopsis: The synopsis text. Anything that is not a ``str`` (for
            example ``None`` or a NaN float from a missing cell) is treated as
            uninformative instead of raising ``AttributeError``.
        min_length: Minimum number of words required. Defaults to 20.

    Returns:
        bool: ``True`` if the synopsis has ``min_length`` words or more.
    """
    if not isinstance(synopsis, str):
        return False
    return len(synopsis.split()) >= min_length

# Keep only the rows whose synopsis passes the informativeness check and
# renumber the index from 0 afterwards.
informative_rows = df['synopsis'].apply(is_informative)
df = df[informative_rows].reset_index(drop=True)

# Number of rows that survived the filter.
len(df)

12650

In [37]:

# Keywords whose presence in a synopsis marks the row for manual review.
keywords = ['special', 'OVA', 'season', 'sequel', 'DVD', 'volume', 'movie', 'additional', 'recap', 'prequel', 'prologue', 'epilogue','film']

# Alternation pattern that matches any of the keywords.
# NOTE(review): this is a plain substring match and is case-insensitive below,
# so a keyword also matches when it appears inside a longer word.
pattern = '|'.join(keywords)

# Rows whose synopsis contains at least one keyword (missing synopses never match).
mentions_keyword = df['synopsis'].str.contains(pattern, case=False, na=False)
filtered_df = df[mentions_keyword]

# Order the matches from shortest to longest synopsis (by character count)
# using a temporary helper column, then renumber the index from 0.
filtered_df = (
    filtered_df
    .assign(synopsis_length=filtered_df['synopsis'].str.len())
    .sort_values(by='synopsis_length', ascending=True)
    .drop(columns='synopsis_length')
    .reset_index(drop=True)
)


In [38]:

# Hand-picked sets of mal_id values (sets, so membership tests ignore order).
# keep_first_1000: the only ids that are KEPT among the first 1000 rows of
# filtered_df; every other row in that slice is discarded.
keep_first_1000 = {7482, 5648, 27455, 27457, 5925, 16780, 5308, 42296, 10074, 23523, 33176, 58491, 
                   55355, 59722, 53102, 4398, 7547, 30447, 4693, 5080, 34667, 32494, 35616, 51248, 
                   42268, 8670, 60485, 31866, 6380, 43556, 38004, 23637, 57388, 26101, 557, 52553, 
                   29910, 9749, 5096, 30437, 22613, 15883, 5279, 3362, 6713, 2214, 2583, 18429, 1289, 
                   57207, 35409, 51563, 2723, 5620, 13505, 3661, 1314, 40885, 7643, 1031, 5098, 16574, 
                   1938, 1158, 2699, 21101, 1003, 34289, 1772, 17823, 19985, 1078, 8547, 34550, 810, 
                   3077, 18177, 3640, 1155, 35320, 45654, 7575, 4073, 3900, 51782, 11299, 53045, 
                   45596, 30278, 34915, 15937, 1839, 39360, 42899, 675, 3624, 13409, 2948, 10790, 
                   2110, 16556, 8812, 760, 21957, 6023, 2969, 1988, 38117, 51125, 34834, 3247, 4418, 
                   4383, 20499, 31670, 33490, 28869}

# remove_after_1000: the ids that are REMOVED from the rows of filtered_df after
# the first 1000; every other row in that slice is kept.
remove_after_1000 = {6489, 34258, 41822, 24527, 52003, 20649, 3671, 1933, 42174, 5755, 5458, 37753, 
                     39416, 32026, 1112, 32376, 33148, 25305, 10979, 31678, 38739, 5036, 4094, 38895, 
                     39762, 26189, 5501, 21195, 35317, 30796, 30818, 5396}

# First 1000 rows of filtered_df: keep only the whitelisted mal_id values.
df_first_1000 = filtered_df.iloc[:1000].loc[
    lambda part: part['mal_id'].isin(keep_first_1000)
]

# Remaining rows: keep everything except the blacklisted mal_id values.
df_rest = filtered_df.iloc[1000:].loc[
    lambda part: ~part['mal_id'].isin(remove_after_1000)
]

# Stack both parts; ignore_index=True already yields a fresh 0..n-1 index.
df_filtered = pd.concat([df_first_1000, df_rest], ignore_index=True)

# Preview the reviewed subset.
df_filtered.head()


Unnamed: 0,mal_id,title,synopsis,combined
0,27455,Porong Porong Pororo 3,A direct continuation of the second season. Th...,Sci-Fi
1,7482,"Yume kara, Samenai",Student Takao is inexorably drawn toward Sao d...,"Romance, School"
2,5648,Bouken Shite mo Ii Koro,Follow a cast that makes pornographic movies. ...,"Comedy, Romance, Ecchi, Adult Cast, Showbiz"
3,27457,Porong Porong Pororo 4,A direct continuation of the previous season w...,Racing
4,5925,Neko Hiki no Oruorane,"During the holiday season, a young man encount...","Fantasy, Music, Pets"


In [39]:
# Rows of df that were never part of the keyword-matched subset (matched by mal_id).
was_reviewed = df['mal_id'].isin(filtered_df['mal_id'])
not_reviewed = df[~was_reviewed]

# Append the reviewed subset after them; ignore_index=True renumbers the
# combined frame from 0.
df = pd.concat([not_reviewed, df_filtered], ignore_index=True)

# Preview the updated frame.
df.head()

Unnamed: 0,mal_id,title,synopsis,combined
0,52991,Sousou no Frieren,During their decade-long quest to defeat the D...,"Adventure, Drama, Fantasy"
1,5114,Fullmetal Alchemist: Brotherhood,After a horrific alchemy experiment goes wrong...,"Action, Adventure, Drama, Fantasy, Military"
2,39486,Gintama: The Final,Two years have passed following the Tendoshuu'...,"Action, Comedy, Drama, Sci-Fi, Gag Humor, Hist..."
3,11061,Hunter x Hunter (2011),Hunters devote themselves to accomplishing haz...,"Action, Adventure, Fantasy"
4,9969,Gintama',"After a one-year hiatus, Shinpachi Shimura ret...","Action, Comedy, Sci-Fi, Gag Humor, Historical,..."


In [40]:
import re

# Strip credit notes from a synopsis
def clean_synopsis(text):
    """Remove ``(Source: ...)`` and ``[Written by ...]`` notes from ``text``.

    Every non-greedy match of either note is deleted and the result is stripped
    of leading/trailing whitespace. Missing values (anything ``pd.isna``
    reports as missing) are returned unchanged.
    """
    if not pd.isna(text):
        credit_note = re.compile(r'\(Source:.*?\)|\[Written by .*?\]')
        text = credit_note.sub('', text).strip()
    return text

# Run the credit-note cleanup over every synopsis.
df['synopsis'] = df['synopsis'].map(clean_synopsis)

# Preview the cleaned frame.
df.head()


Unnamed: 0,mal_id,title,synopsis,combined
0,52991,Sousou no Frieren,During their decade-long quest to defeat the D...,"Adventure, Drama, Fantasy"
1,5114,Fullmetal Alchemist: Brotherhood,After a horrific alchemy experiment goes wrong...,"Action, Adventure, Drama, Fantasy, Military"
2,39486,Gintama: The Final,Two years have passed following the Tendoshuu'...,"Action, Comedy, Drama, Sci-Fi, Gag Humor, Hist..."
3,11061,Hunter x Hunter (2011),Hunters devote themselves to accomplishing haz...,"Action, Adventure, Fantasy"
4,9969,Gintama',"After a one-year hiatus, Shinpachi Shimura ret...","Action, Comedy, Sci-Fi, Gag Humor, Historical,..."


In [41]:
# One row per individual label: split each 'combined' string on ', ' and flatten.
label_tokens = df['combined'].str.split(', ').explode()

# Distinct labels, sorted for readability.
unique_words = sorted(set(label_tokens.dropna()))

# How often each label occurs (value_counts skips missing values).
word_freq = label_tokens.value_counts()
print("Unique words and their frequency:\n", word_freq)

# Same counts as a two-column table; show the ten most frequent labels.
word_df = word_freq.to_frame().reset_index()
word_df.columns = ['word', 'frequency']
word_df.head(10)


Unique words and their frequency:
 combined
Comedy               3958
Action               3628
Fantasy              2913
Adventure            2593
Sci-Fi               2259
Drama                2078
Romance              1583
School               1481
Supernatural          950
Historical            891
Slice of Life         861
Mecha                 853
Mystery               689
Ecchi                 553
Music                 545
Military              496
Adult Cast            487
Super Power           479
Sports                469
Martial Arts          409
Mythology             404
Parody                401
Space                 395
Horror                384
Suspense              362
Psychological         351
Harem                 335
Avant Garde           317
Isekai                302
Anthropomorphic       296
Mahou Shoujo          234
                      230
Award Winning         229
Detective             226
Gore                  222
Strategy Game         213
Team Sports         

Unnamed: 0,word,frequency
0,Comedy,3958
1,Action,3628
2,Fantasy,2913
3,Adventure,2593
4,Sci-Fi,2259
5,Drama,2078
6,Romance,1583
7,School,1481
8,Supernatural,950
9,Historical,891


In [43]:
# Labels that are allowed to remain in the 'combined' column.
allowed_categories = {
    "Action", "Adventure", "Comedy", "Drama", "Ecchi",
    "Fantasy", "Historical",
    "Mecha", "Music", "Mystery",
    "Romance", "School", "Sci-Fi", "Slice of Life", "Supernatural",
}

# Reduce one 'combined' string to its allowed labels
def filter_genres(genre_string):
    """Keep only the labels of ``genre_string`` found in ``allowed_categories``.

    The input is split on ``', '``; the surviving labels are re-joined with the
    same separator in their original order. Missing values are returned
    unchanged, and ``None`` is returned when no label survives.
    """
    if pd.isna(genre_string):
        return genre_string
    kept = ', '.join(
        label for label in genre_string.split(', ') if label in allowed_categories
    )
    # Every allowed label is non-empty, so an empty join means nothing survived.
    return kept or None

# Reduce every 'combined' entry to its allowed categories.
df['combined'] = df['combined'].apply(filter_genres)

# filter_genres yields None when no label survives, so dropna removes the rows
# whose genres were all filtered out; the index is then renumbered from 0.
df = df.dropna(subset=['combined']).reset_index(drop=True)

# A bare `df.shape` in the middle of a cell is evaluated and discarded (only
# the last expression of a cell is displayed), so print it explicitly.
print(df.shape)

# Preview the updated frame.
df.head()


Unnamed: 0,mal_id,title,synopsis,combined
0,52991,Sousou no Frieren,During their decade-long quest to defeat the D...,"Adventure, Drama, Fantasy"
1,5114,Fullmetal Alchemist: Brotherhood,After a horrific alchemy experiment goes wrong...,"Action, Adventure, Drama, Fantasy"
2,39486,Gintama: The Final,Two years have passed following the Tendoshuu'...,"Action, Comedy, Drama, Sci-Fi, Historical"
3,11061,Hunter x Hunter (2011),Hunters devote themselves to accomplishing haz...,"Action, Adventure, Fantasy"
4,9969,Gintama',"After a one-year hiatus, Shinpachi Shimura ret...","Action, Comedy, Sci-Fi, Historical"


In [44]:
# Plain-English replacements for some genre names; labels not listed here are kept as they are.
genre_translation = {
    "Ecchi": "Lewd",
    "Mecha": "Fighting Robots",
    "Sci-Fi": "Science Fiction",
    "Slice of Life": "Everyday Life",
}

# Translate every genre in a list
def translate_genres(genre_list):
    """Return a new list with each genre replaced via ``genre_translation``.

    Genres without an entry in the mapping are passed through unchanged; the
    order and length of the input are preserved.
    """
    translated = []
    for genre in genre_list:
        translated.append(genre_translation.get(genre, genre))
    return translated

# Translate each 'combined' entry: split on ", ", translate the pieces, re-join.
df["combined"] = [
    ", ".join(translate_genres(entry.split(", ")))
    for entry in df["combined"]
]


In [46]:
# Checkpoint: write the cleaned frame to CSV (index column omitted).
df.to_csv('anime_dataset_cleaned.csv', index=False)

In [47]:
import pandas as pd

df = pd.read_csv("anime_dataset_cleaned.csv")

# Trim surrounding whitespace so otherwise identical synopses compare equal.
df['synopsis'] = df['synopsis'].str.strip()

# Check for empty synopses. read_csv parses empty fields as NaN by default, so
# a synopsis that was '' when the file was written comes back as NaN; testing
# only for '' would miss it, hence both conditions.
empty_synopsis = df[df['synopsis'].isna() | (df['synopsis'] == '')]
print("Rows with empty synopses:", empty_synopsis)

# Find rows where synopses are duplicated but titles are different:
# keep=False flags every member of a duplicate group, not just the repeats.
duplicated_synopses = df[df.duplicated(subset='synopsis', keep=False)]
print("Number of duplicated synopses:", len(duplicated_synopses))

# Number of distinct titles per repeated synopsis; more than one means the
# same text is attached to different titles.
non_matching = duplicated_synopses.groupby('synopsis')['title'].nunique()
print(non_matching[non_matching > 1])

Rows with empty synopses: Empty DataFrame
Columns: [mal_id, title, synopsis, combined]
Index: []
Number of duplicated synopses: 198
synopsis
"Unbelievable" Zorori is a prankster fox determined to become the Prince of Mischief, with his very own castle and beautiful bride. While is notorious for his prank-making schemes, his ill-meaning plots often backfire against him, and can usually end up even benefiting or cheering up his targets, much to his dismay. Nevertheless, Zorori has the wits and intelligence to wriggle out of the tightest of all spots, and journeys with a pair of twin boars, Ishishi and Noshishi, in order to fulfill his long wished-for goal and at last prove to his deceased mother, Mama Zorori that he truly holds the potential to achieve his dreams.                                                                                                                                                                                                                                    

In [48]:
# Normalise whitespace before comparing synopses.
df['synopsis'] = df['synopsis'].str.strip()

# Every row whose synopsis occurs more than once, grouped together by sorting.
shares_synopsis = df.duplicated(subset='synopsis', keep=False)
duplicated_synopses = df.loc[shares_synopsis]
print(duplicated_synopses.sort_values(by='synopsis'))


      mal_id                                              title                                           synopsis                                   combined
3560   59730  A-Rank Party wo Ridatsu shita Ore wa, Moto Osh...  "I can't do this anymore!" Yuke Feldio, a red ...                 Action, Adventure, Fantasy
3561   59730  A-Rank Party wo Ridatsu shita Ore wa, Moto Osh...  "I can't do this anymore!" Yuke Feldio, a red ...                 Action, Adventure, Fantasy
7953   60168                    Watashi wo Tabetai, Hitodenashi  "I've come to eat you," so softly utters the m...                        Drama, Supernatural
7954   60168                    Watashi wo Tabetai, Hitodenashi  "I've come to eat you," so softly utters the m...                        Drama, Supernatural
7009   59572                        Chogokin Gundam★Hello Kitty  "Mobile Suit Gundam" and "Hello Kitty" are "su...           Science Fiction, Fighting Robots
...      ...                                        

In [49]:
# Normalise whitespace, then drop rows that are identical in every column.
df = df.assign(synopsis=df['synopsis'].str.strip()).drop_duplicates()

# Summary counts after removing exact duplicates.
row_count = len(df)
print(f"New number of rows: {row_count}")
print(f"Number of unique titles: {df['title'].nunique()}")
print(f"Number of unique synopses: {df['synopsis'].nunique()}")

New number of rows: 10779
Number of unique titles: 10779
Number of unique synopses: 10708


In [50]:
# Normalise whitespace and drop fully identical rows.
df = df.assign(synopsis=df['synopsis'].str.strip()).drop_duplicates()

# Rows that still share a synopsis with another row, grouped together by sorting.
shares_synopsis = df.duplicated(subset='synopsis', keep=False)
duplicated_synopses = df.loc[shares_synopsis]
print(duplicated_synopses.sort_values(by='synopsis'))

      mal_id                                              title                                           synopsis                                    combined
5856    6965  Kaiketsu Zorori: Mahoutsukai no Deshi/Dai Kaiz...  "Unbelievable" Zorori is a prankster fox deter...                           Adventure, Comedy
4795    3659  Majime ni Fumajime Kaiketsu Zorori: Nazo no Ot...  "Unbelievable" Zorori is a prankster fox deter...                           Adventure, Comedy
8679   33753                          Tian Xin Ge Ge 2nd Season  A Chinese prince meets a regular civilian duri...                          Comedy, Historical
4847   33739                                     Tian Xin Ge Ge  A Chinese prince meets a regular civilian duri...                          Comedy, Historical
8416   44863                                   Mohuan Xian Zong  A baby princess named Ocean is sent away from ...                          Adventure, Fantasy
...      ...                                  

In [51]:
# Normalise whitespace, then keep only the first row for each distinct synopsis.
df = (
    df.assign(synopsis=df['synopsis'].str.strip())
      .drop_duplicates(subset='synopsis', keep='first')
)

# Summary counts after de-duplicating by synopsis.
print(f"New number of rows: {len(df)}")
print(f"Number of unique titles: {df['title'].nunique()}")
print(f"Number of unique synopses: {df['synopsis'].nunique()}")

# Verify that no synopsis occurs more than once any more.
still_repeated = df.duplicated(subset='synopsis', keep=False)
duplicated_synopses = df[still_repeated]
print(f"Number of remaining duplicated synopses: {len(duplicated_synopses)}")

New number of rows: 10708
Number of unique titles: 10708
Number of unique synopses: 10708
Number of remaining duplicated synopses: 0


In [52]:
import pandas as pd


# Prints an overview of the CSV at `file_path`: columns, dtypes, first rows,
# shape, missing values and likely-categorical columns.
#
# The file is loaded into `report_df`, NOT `df`. Reassigning `df` here replaced
# the in-memory working frame with the contents of the file on disk, silently
# discarding every change made to `df` since that file was written.
file_path = 'anime_dataset_cleaned.csv'


# Set display options for better output
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

print(f"Attempting to load CSV file: {file_path}\n")

try:
    # Load the dataset into a report-only variable
    report_df = pd.read_csv(file_path)

    print("--- Successfully loaded CSV file ---\n")

    # 1. Column Names (Features)
    print("--- 1. Column Names ---")
    column_names = report_df.columns.tolist()
    print(column_names)
    print("\n>>> Please identify which of the above columns is your TARGET variable <<<\n")

    # 2. Data Types of Each Column
    print("--- 2. Data Types ---")
    print(report_df.dtypes)
    print("\n")

    # 3. Data Content Snippet (First 10 Rows)
    print("--- 3. First 10 Rows (Potential Anonymization Needed!) ---")
    print(">>> IMPORTANT: Review this section for sensitive data before sharing! <<<")
    print(report_df.head(10))
    print("\n")

    # 4. Dataset Size (Shape)
    print("--- 4. Dataset Size ---")
    print(f"Number of Rows:    {report_df.shape[0]}")
    print(f"Number of Columns: {report_df.shape[1]}")
    print("\n")

    # 5. Missing Values
    print("--- 5. Missing Value Counts ---")
    missing_counts = report_df.isnull().sum()
    missing_counts = missing_counts[missing_counts > 0]  # Only columns with missing values
    if missing_counts.empty:
        print("No missing values found.")
    else:
        print("Columns with missing values and their counts:")
        print(missing_counts)
    print("\n")

    # 6. Categorical Feature Details (heuristic based on dtype and unique values)
    print("--- 6. Unique Value Counts for Potential Categorical Columns ---")
    print("(Includes 'object' dtype and numerical columns with <= 20 unique values)\n")
    potential_categorical_cols = []
    for col in report_df.columns:
        if report_df[col].dtype == 'object' or report_df[col].dtype.name == 'category':
            potential_categorical_cols.append(col)
        elif pd.api.types.is_numeric_dtype(report_df[col].dtype) and report_df[col].nunique() <= 20:
            potential_categorical_cols.append(col)
            print(f"(Note: Numerical column '{col}' included as potentially categorical due to low unique count)")

    if not potential_categorical_cols:
        print("No columns automatically identified as likely categorical (dtype 'object' or low-unique numerical).")
    else:
        print(f"{'Column Name':<30} | {'Num Unique Values':<20} | {'Unique Values (if <= 15)'}")
        print("-" * 80)
        for col in potential_categorical_cols:
            num_unique = report_df[col].nunique()
            unique_values_str = ""
            if num_unique <= 15:
                # str() makes NaN and other non-string values joinable for display
                unique_values = [str(item) for item in report_df[col].unique().tolist()]
                unique_values_str = ", ".join(unique_values)

            print(f"{col:<30} | {num_unique:<20} | {unique_values_str}")

    print("\n--- End of Analysis ---")

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please check the file path and name, then try again.")
except Exception as e:
    # Best-effort report: show the problem instead of aborting the notebook run.
    print(f"An unexpected error occurred while processing the file: {e}")
    print("There might be an issue with the file format or content.")

Attempting to load CSV file: anime_dataset_cleaned.csv

--- Successfully loaded CSV file ---

--- 1. Column Names ---
['mal_id', 'title', 'synopsis', 'combined']

>>> Please identify which of the above columns is your TARGET variable <<<

--- 2. Data Types ---
mal_id       int64
title       object
synopsis    object
combined    object
dtype: object


--- 3. First 10 Rows (Potential Anonymization Needed!) ---
>>> IMPORTANT: Review this section for sensitive data before sharing! <<<
   mal_id                                       title                                           synopsis                                           combined
0   52991                           Sousou no Frieren  During their decade-long quest to defeat the D...                          Adventure, Drama, Fantasy
1    5114            Fullmetal Alchemist: Brotherhood  After a horrific alchemy experiment goes wrong...                  Action, Adventure, Drama, Fantasy
2   39486                          Gintama: Th

In [53]:
# Make the save self-contained: de-duplicate right here instead of trusting that
# `df` still holds the de-duplicated frame. If `df` was reloaded from disk in
# between, writing it as-is would save the duplicates back while reporting them
# removed. On an already de-duplicated frame these steps change nothing.
df = (
    df.assign(synopsis=df['synopsis'].str.strip())
      .drop_duplicates(subset='synopsis', keep='first')
)
assert not df['synopsis'].duplicated().any(), "duplicate synopses remain"

df.to_csv("anime_dataset_cleaned.csv", index=False)
print("Duplicates removed and file saved as 'anime_dataset_cleaned.csv'")

Duplicates removed and file saved as 'anime_dataset_cleaned.csv'
