In [59]:
import pandas as pd
import io


def read_and_head(csv_path):
    """Load a CSV, report its shape, and return a preview of its first rows.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The first five rows of the loaded data (``DataFrame.head()``).
    """
    loaded = pd.read_csv(csv_path)
    # Report the file that was actually read: this helper is called on several
    # different CSVs, so a fixed "all_scifi_genres" label mislabelled the others.
    print(f"Loaded {csv_path} with shape: {loaded.shape}")
    return loaded.head()

In [60]:
import re
import unicodedata


def read_csv_robust(csv_path):
    """Read a CSV whose text encoding is not known in advance.

    Each candidate encoding is tried in turn with the pure-Python parser.
    If none succeeds, the file is re-opened with undecodable bytes replaced
    and parsed again.

    Note: ``on_bad_lines="skip"`` means malformed rows are dropped silently,
    so the returned frame may have fewer rows than the file has lines.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table from the first attempt that succeeds.

    Raises
    ------
    OSError
        Immediately, if the file cannot be opened (e.g. it does not exist);
        trying another encoding cannot fix that.
    Exception
        The last parsing/decoding error if every attempt fails.
    """
    # Strict decoders first. latin-1 maps every possible byte to a character and
    # therefore never fails to decode, so it has to be the last resort; listing
    # cp1252 after it would make cp1252 unreachable.
    encodings_to_try = ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
    last_error = None
    for enc in encodings_to_try:
        try:
            return pd.read_csv(csv_path, encoding=enc, engine="python", on_bad_lines="skip")
        except OSError:
            # Missing/unreadable file: not an encoding problem, fail fast.
            raise
        except Exception as e:
            last_error = e
    # Fallback with replacement: undecodable bytes become U+FFFD instead of raising.
    for enc in encodings_to_try:
        try:
            # newline="" leaves line-ending handling to the CSV parser, matching
            # how pandas opens paths itself.
            with open(csv_path, "r", encoding=enc, errors="replace", newline="") as f:
                return pd.read_csv(f, engine="python", on_bad_lines="skip")
        except OSError:
            raise
        except Exception as e:
            last_error = e
    raise last_error


def normalize_ascii(text: str) -> str:
    """Return ``text`` reduced to plain ASCII.

    Non-string input is coerced first: missing values (per ``pd.isna``) become
    the empty string, anything else goes through ``str()``. The text is then
    NFKD-decomposed and every character that has no ASCII encoding is dropped.
    """
    if isinstance(text, str):
        source = text
    elif pd.isna(text):
        source = ""
    else:
        source = str(text)

    decomposed = unicodedata.normalize("NFKD", source)
    # Encode/decode round trip discards whatever ASCII cannot represent.
    return decomposed.encode("ascii", "ignore").decode("ascii")


def clean_title(text: str) -> str:
    """Turn a title into a compact matching key.

    The title is ASCII-normalised via ``normalize_ascii``, lower-cased, and
    stripped of every character outside ``a-z`` / ``0-9`` (spaces and
    punctuation included).
    """
    lowered = normalize_ascii(text).lower()
    disallowed = re.compile(r"[^a-z0-9]")
    return disallowed.sub("", lowered)


# Load the CSV of all sci-fi titles that later cells match against.
csv_path = "/Users/michaelcai/Desktop/fall25/cs/scifitropes-dataset/altered_data/all_scifi_genres.csv"
df_all = read_csv_robust(csv_path)

# Fail early, listing the columns found, if the 'title' column is absent.
if "title" not in df_all.columns:
    raise KeyError(f"Expected a 'title' column, found: {list(df_all.columns)}")

# One cleaned key per row, wrapped in a single-column frame.
raw_titles = df_all["title"]
cleaned = raw_titles.apply(clean_title)
cleaned_df = pd.DataFrame({"title_clean": cleaned})

print(f"Created cleaned_df with {len(cleaned_df)} titles.")
cleaned_df.head()


Created cleaned_df with 6113 titles.


Unnamed: 0,title_clean
0,akb0048
1,doraemontherecordofnobitaspaceblazer
2,doraemonnobitasdriftsintheuniverse
3,thesaiyansaga
4,gankutsuou


In [61]:
base_dir = "/Users/michaelcai/Desktop/fall25/cs/scifitropes-dataset/altered_data"

# Medium name -> CSV holding that medium's "exclusive" title list.
files = {
    "games": f"{base_dir}/Games_exclusive.csv",
    "anime": f"{base_dir}/Anime_exclusive.csv",
    "films": f"{base_dir}/Films_exclusive.csv",
}

cleaned_results = {}
for key, path in files.items():
    df = read_csv_robust(path)
    # Titles are read from whichever column comes first in the file.
    title_type = df.columns[0]
    cleaned = df[title_type].apply(clean_title)
    cleaned_results[key] = pd.DataFrame({"title_clean": cleaned})

# Give each per-medium frame its own name for the cells that follow.
games_cleaned_df, anime_cleaned_df, films_cleaned_df = (
    cleaned_results[medium] for medium in ("games", "anime", "films")
)

frame_shapes = {
    "games_cleaned_df": games_cleaned_df.shape,
    "anime_cleaned_df": anime_cleaned_df.shape,
    "films_cleaned_df": films_cleaned_df.shape,
}
print("Created DataFrames:", frame_shapes)

# Previews
print("Games preview:")
games_cleaned_df.head()



Created DataFrames: {'games_cleaned_df': (1675, 1), 'anime_cleaned_df': (397, 1), 'films_cleaned_df': (6894, 1)}
Games preview:


Unnamed: 0,title_clean
0,falloutnewvegas
1,fallout4
2,starwarstheoldrepublic
3,masseffect2
4,masseffect3


In [62]:
# Preview the first rows of the cleaned anime title keys.
anime_cleaned_df.head()

Unnamed: 0,title_clean
0,gundam
1,8thman
2,activeraid
3,afterwargundamx
4,aicoincarnation


In [63]:
# Preview the first rows of the cleaned film title keys.
films_cleaned_df.head()

Unnamed: 0,title_clean
0,0091
1,10abootstomping20ahumanface30goto10
2,10cloverfieldlane
3,112263
4,12monkeys


In [64]:
# Alias (no copy): the filtering cell reads the cleaned title frame under this name.
scifimedia_titles = cleaned_df

In [65]:
# Build a cleaned set from scifimedia_titles, then filter the cleaned DataFrames
from collections.abc import Iterable


def to_clean_title_set(obj) -> set:
    """Collapse a collection of titles into a set of cleaned title keys.

    Supported inputs:

    * ``pandas.DataFrame`` - column names are matched case-insensitively.
      A ``title_clean`` column is used as-is (already cleaned); otherwise the
      ``title`` column is cleaned; otherwise the first column is cleaned.
      A frame with no columns yields an empty set.
    * ``pandas.Series`` - every value is cleaned.
    * any other non-string iterable - every item is cleaned.
    * a single ``str`` / ``bytes`` - yields a one-element set.

    Missing values (NaN/None) are skipped rather than stringified, so they do
    not add a spurious ``"nan"`` key. Titles that clean to the empty string are
    kept, as before.

    Raises
    ------
    TypeError
        If ``obj`` is none of the supported kinds.
    """
    # Single title. Checked first so the generic iterable branch never walks a
    # string character by character. bytes are decoded so the key is built from
    # the text, not from the ``b'...'`` repr.
    if isinstance(obj, bytes):
        return {clean_title(obj.decode("utf-8", errors="replace"))}
    if isinstance(obj, str):
        return {clean_title(obj)}

    values = obj
    if isinstance(obj, pd.DataFrame):
        if obj.shape[1] == 0:
            return set()
        # str() keeps non-string column labels (e.g. ints) from breaking .lower().
        lowered = [str(label).lower() for label in obj.columns]
        if "title_clean" in lowered:
            # Positional selection always gives one Series, even with duplicate labels.
            already_clean = obj.iloc[:, lowered.index("title_clean")]
            return set(already_clean.dropna().astype(str))
        position = lowered.index("title") if "title" in lowered else 0
        values = obj.iloc[:, position]

    if isinstance(values, pd.Series):
        # Drop missing values before astype(str), which would turn NaN into "nan".
        return set(values.dropna().astype(str).map(clean_title))

    if isinstance(values, Iterable):
        keys = set()
        for item in values:
            if isinstance(item, bytes):
                item = item.decode("utf-8", errors="replace")
            elif pd.api.types.is_scalar(item) and pd.isna(item):
                continue
            keys.add(clean_title(item))
        return keys

    raise TypeError("Unsupported type for scifimedia_titles")


# scifimedia_titles must already be defined by an earlier cell.
scifimedia_titles_clean = to_clean_title_set(scifimedia_titles)

# Row masks: which cleaned titles of each medium appear in the reference set.
anime_in_reference = anime_cleaned_df["title_clean"].isin(scifimedia_titles_clean)
games_in_reference = games_cleaned_df["title_clean"].isin(scifimedia_titles_clean)
films_in_reference = films_cleaned_df["title_clean"].isin(scifimedia_titles_clean)

# Keep the matching rows and renumber them from zero.
anime_filtered_df = anime_cleaned_df.loc[anime_in_reference].reset_index(drop=True)
games_filtered_df = games_cleaned_df.loc[games_in_reference].reset_index(drop=True)
films_filtered_df = films_cleaned_df.loc[films_in_reference].reset_index(drop=True)

filtered_shapes = {
    "anime_filtered_df": anime_filtered_df.shape,
    "games_filtered_df": games_filtered_df.shape,
    "films_filtered_df": films_filtered_df.shape,
}
print(filtered_shapes)

anime_filtered_df.head()


{'anime_filtered_df': (262, 1), 'games_filtered_df': (592, 1), 'films_filtered_df': (2226, 1)}


Unnamed: 0,title_clean
0,gundam
1,8thman
2,activeraid
3,afterwargundamx
4,akb0048


In [66]:
# games_filtered_df
# anime_filtered_df
# films_filtered_df

In [67]:
# Quick look at the anime "with years" CSV. The path here is relative, so it is
# resolved against the kernel's working directory, unlike the absolute paths
# used by the other loading cells.
read_and_head("scifianimedb_with_years.csv")

Loaded all_scifi_genres with shape: (397, 5)


Unnamed: 0,Title,Tropes Count,URL,Tropes,Release Year
0,Dragon Ball Super,1408,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Landline Eavesdropping':'https://tvtropes.or...,2017.0
1,Dragon Ball Z,1248,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Schrodinger’s Questions':'https://tvtropes.o...,1989.0
2,Mazinger Z,904,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Girly Skirt Twirl':'https://tvtropes.org/pmw...,1972.0
3,Darker than Black,703,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,'The Worf Effect':'https://tvtropes.org/pmwiki...,2007.0
4,Tengen Toppa Gurren Lagann,650,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,"{'2D Visuals, 3D Effects': 'https://www.tvtrop...",2007.0


In [68]:

# Read each "with_years" CSV and attach a cleaned-title key column to it.
base_dir = "/Users/michaelcai/Desktop/fall25/cs/scifitropes-dataset/altered_data"

files_with_years = {
    "anime": f"{base_dir}/scifianimedb_with_years.csv",
    "films": f"{base_dir}/scififilmdb_with_years.csv",
    "games": f"{base_dir}/scifivideogamesdb_with_years.csv",
}

dfs_with_years = {}
for key, path in files_with_years.items():
    df = read_csv_robust(path)
    # Derive the matching key from the human-readable 'Title' column.
    df = df.assign(title_clean=df["Title"].apply(clean_title))
    dfs_with_years[key] = df

# Expose each table under its own name for the following cells.
scifianimedb_with_years, scififilmdb_with_years, scifivideogamesdb_with_years = (
    dfs_with_years[medium] for medium in ("anime", "films", "games")
)

print("Loaded DataFrames with cleaned titles:")
for name, df in dfs_with_years.items():
    print(f"{name}: {df.shape}")

scifianimedb_with_years.head()

Loaded DataFrames with cleaned titles:
anime: (397, 6)
films: (1379, 6)
games: (1672, 6)


Unnamed: 0,Title,Tropes Count,URL,Tropes,Release Year,title_clean
0,Dragon Ball Super,1408,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Landline Eavesdropping':'https://tvtropes.or...,2017.0,dragonballsuper
1,Dragon Ball Z,1248,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Schrodinger’s Questions':'https://tvtropes.o...,1989.0,dragonballz
2,Mazinger Z,904,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Girly Skirt Twirl':'https://tvtropes.org/pmw...,1972.0,mazingerz
3,Darker than Black,703,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,'The Worf Effect':'https://tvtropes.org/pmwiki...,2007.0,darkerthanblack
4,Tengen Toppa Gurren Lagann,650,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,"{'2D Visuals, 3D Effects': 'https://www.tvtrop...",2007.0,tengentoppagurrenlagann


In [69]:
# Unique cleaned titles that survived the earlier filtering, one set per medium.
anime_clean_set = set(anime_filtered_df["title_clean"])
films_clean_set = set(films_filtered_df["title_clean"])
games_clean_set = set(games_filtered_df["title_clean"])

# Row masks marking the "with_years" entries whose key is in the matching set.
anime_keep_mask = scifianimedb_with_years["title_clean"].isin(anime_clean_set)
films_keep_mask = scififilmdb_with_years["title_clean"].isin(films_clean_set)
games_keep_mask = scifivideogamesdb_with_years["title_clean"].isin(games_clean_set)

# Apply the masks and renumber the surviving rows from zero.
scifianimedb_with_years_filtered = scifianimedb_with_years.loc[anime_keep_mask].reset_index(drop=True)
scififilmdb_with_years_filtered = scififilmdb_with_years.loc[films_keep_mask].reset_index(drop=True)
scifivideogamesdb_with_years_filtered = scifivideogamesdb_with_years.loc[games_keep_mask].reset_index(drop=True)

print("Filtered 'with_years' DataFrames:")
print(f"scifianimedb_with_years_filtered: {scifianimedb_with_years_filtered.shape}")
print(f"scififilmdb_with_years_filtered: {scififilmdb_with_years_filtered.shape}")
print(f"scifivideogamesdb_with_years_filtered: {scifivideogamesdb_with_years_filtered.shape}")

scifianimedb_with_years_filtered.head()

Filtered 'with_years' DataFrames:
scifianimedb_with_years_filtered: (262, 6)
scififilmdb_with_years_filtered: (723, 6)
scifivideogamesdb_with_years_filtered: (590, 6)


Unnamed: 0,Title,Tropes Count,URL,Tropes,Release Year,title_clean
0,Dragon Ball Z,1248,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Schrodinger’s Questions':'https://tvtropes.o...,1989.0,dragonballz
1,Mazinger Z,904,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Girly Skirt Twirl':'https://tvtropes.org/pmw...,1972.0,mazingerz
2,Darker than Black,703,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,'The Worf Effect':'https://tvtropes.org/pmwiki...,2007.0,darkerthanblack
3,Tengen Toppa Gurren Lagann,650,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,"{'2D Visuals, 3D Effects': 'https://www.tvtrop...",2007.0,tengentoppagurrenlagann
4,Cowboy Bebop,615,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,"{'2D Visuals, 3D Effects': 'https://www.tvtrop...",1998.0,cowboybebop


In [70]:
# Number of unique cleaned titles per medium, printed in games, films, anime order.
# These are set sizes, so they can be smaller than the row counts of the
# corresponding *_filtered_df frames when a cleaned title repeats.
print(len(games_clean_set))
print(len(films_clean_set))
print(len(anime_clean_set))

578
2225
262


In [71]:
# Display the unfiltered video-game table, including its title_clean column.
scifivideogamesdb_with_years

Unnamed: 0,Title,Tropes Count,URL,Tropes,Release Year,title_clean
0,Fallout: New Vegas,1364,https://www.tvtropes.org/pmwiki/pmwiki.php/Vid...,{'Food Chain of Evil':'https://tvtropes.org/pm...,2010.0,falloutnewvegas
1,Fallout 4,1214,https://www.tvtropes.org/pmwiki/pmwiki.php/Vid...,{'Mary Sue Classic':'https://tvtropes.org/pmwi...,2015.0,fallout4
2,Star Wars: The Old Republic,1190,https://www.tvtropes.org/pmwiki/pmwiki.php/Vid...,{'The Immodest Orgasm':'https://tvtropes.org/p...,2011.0,starwarstheoldrepublic
3,Mass Effect 2,1123,https://www.tvtropes.org/pmwiki/pmwiki.php/Vid...,{'I Can’t Hear You':'https://tvtropes.org/pmwi...,2010.0,masseffect2
4,Mass Effect 3,1113,https://www.tvtropes.org/pmwiki/pmwiki.php/Vid...,{'Decoy Convoy':'https://tvtropes.org/pmwiki/p...,2012.0,masseffect3
...,...,...,...,...,...,...
1667,Star Fox,Franchise redirect,https://tvtropes.org/pmwiki/pmwiki.php/Franchi...,,1993.0,starfox
1668,Doctor Who,complicated,https://www.tvtropes.org/pmwiki/pmwiki.php/Vid...,{},2021.0,doctorwho
1669,Gundam Battle: Gunpla Warfare,aka issue,https://tvtropes.org/pmwiki/pmwiki.php/VideoGa...,,2019.0,gundambattlegunplawarfare
1670,inFAMOUS: Festival of Blood,aka issue,https://tvtropes.org/pmwiki/pmwiki.php/VideoGa...,,2011.0,infamousfestivalofblood


In [72]:
# Inspect the set of cleaned game titles used as the filter key for the games table.
games_clean_set

{'1213',
 '13sentinelsaegisrim',
 'abuse',
 'achron',
 'adventrising',
 'adventuresoftron',
 'aithesomniumfiles',
 'aiwarfleetcommand',
 'albion',
 'alienblackout',
 'alienisolation',
 'alienresurrection',
 'alienscolonialmarines',
 'aliensdarkdescent',
 'aliensfireteamelite',
 'aliensinfestation',
 'alienswarm',
 'alientrilogy',
 'alltynexsecond',
 'amongus',
 'androidhuntera',
 'annomutationem',
 'antarctica88',
 'aresextinctionagenda',
 'armoredcore',
 'armorinesprojectswarm',
 'artemisspaceshipbridgesimulator',
 'assaultretribution',
 'astrobotrescuemission',
 'asuraswrath',
 'atomicrobokid',
 'attackoftheearthlings',
 'avirusnamedtom',
 'azurestrikergunvolt',
 'azurestrikergunvolt2',
 'azurestrikergunvoltseries',
 'barotrauma',
 'battlefield2142',
 'beneathasteelsky',
 'beyondsunset',
 'binarydomain',
 'biomutant',
 'bioshock',
 'bioshock2',
 'bioshockinfinite',
 'blacksnow',
 'bloodnet',
 'bloodyzombies',
 'blueplanet',
 'borderlands',
 'borderlands2',
 'borderlands3',
 'borderla

In [73]:
# Write the three filtered "with_years" tables to CSV. The file names are
# relative, so the files are created in the kernel's working directory;
# index=False keeps the row index out of the output.
scifianimedb_with_years_filtered.to_csv("scifianimedb_with_years_filtered.csv", index=False)
scififilmdb_with_years_filtered.to_csv("scififilmdb_with_years_filtered.csv", index=False)
scifivideogamesdb_with_years_filtered.to_csv("scifivideogamesdb_with_years_filtered.csv", index=False)


In [74]:
# Read the filtered anime CSV back from the working directory to check its shape and first rows.
read_and_head("scifianimedb_with_years_filtered.csv")

Loaded all_scifi_genres with shape: (262, 6)


Unnamed: 0,Title,Tropes Count,URL,Tropes,Release Year,title_clean
0,Dragon Ball Z,1248,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Schrodinger’s Questions':'https://tvtropes.o...,1989.0,dragonballz
1,Mazinger Z,904,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,{'Girly Skirt Twirl':'https://tvtropes.org/pmw...,1972.0,mazingerz
2,Darker than Black,703,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,'The Worf Effect':'https://tvtropes.org/pmwiki...,2007.0,darkerthanblack
3,Tengen Toppa Gurren Lagann,650,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,"{'2D Visuals, 3D Effects': 'https://www.tvtrop...",2007.0,tengentoppagurrenlagann
4,Cowboy Bebop,615,https://www.tvtropes.org/pmwiki/pmwiki.php/Ani...,"{'2D Visuals, 3D Effects': 'https://www.tvtrop...",1998.0,cowboybebop
