In [2]:
import numpy as np 
import os 
import pandas as pd 
import re
import string
import seaborn as sns
from rake_nltk import Rake
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack

In [3]:
df_anime = pd.read_csv("../data/df_anime_export.csv")

# View Resetter

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [5]:
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')

In [6]:
synopsis = df_anime['synopsis_processed']
genre = df_anime['genre']
title = df_anime['title']

# Cleaning Anime_Df before being used

In [7]:
for idx, genre in enumerate(df_anime['genre']):
    df_anime.at[idx, 'genre'] = genre.replace('Martial Arts', 'Martial_Arts')
    df_anime.at[idx, 'genre'] = genre.replace('Super Power', 'Super_Power')
    df_anime.at[idx, 'genre'] = genre.replace('Slice of Life', 'Slice_of_Life')

In [8]:
#clean genre so it can be used in the rec system
for idx, row_genre in enumerate(df_anime['genre']):
    row_genre = row_genre.replace("'",'')
    row_genre = row_genre.replace(",",'')
    row_genre = row_genre.replace("]","")
    row_genre = row_genre.replace("[","")
    df_anime['genre'].iloc[idx] = row_genre

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anime['genre'].iloc[idx] = row_genre


In [9]:
# Calculate the minimum and maximum values of the 'popularity' column
min_popularity = df_anime['popularity'].min()
max_popularity = df_anime['popularity'].max()

# Perform min-max scaling on the 'popularity' column
df_anime['popularity_normalized'] = (df_anime['popularity'] - min_popularity) / (max_popularity - min_popularity)

In [10]:
df_anime.dropna(inplace = True)
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16207 entries, 0 to 16213
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             16207 non-null  int64  
 1   uid                    16207 non-null  int64  
 2   title                  16207 non-null  object 
 3   synopsis               16207 non-null  object 
 4   genre                  16207 non-null  object 
 5   aired                  16207 non-null  object 
 6   episodes               16207 non-null  float64
 7   members                16207 non-null  int64  
 8   popularity             16207 non-null  int64  
 9   ranked                 16207 non-null  float64
 10  score                  16207 non-null  float64
 11  synopsis_processed     16207 non-null  object 
 12  popularity_normalized  16207 non-null  float64
dtypes: float64(4), int64(4), object(5)
memory usage: 1.7+ MB


# List of recommender functions

In [11]:
# Regular recommender, returns similar titles
def recommend_reg(show_title, n_recom, vectorized_bag_of_words, title, df):
    similarity_matrix = cosine_similarity(vectorized_bag_of_words, vectorized_bag_of_words[list(np.where(title == show_title)[0]), :])
    similarity_dataframe = pd.DataFrame(similarity_matrix)
    similarity_dataframe.index = title
    similarity_dataframe = similarity_dataframe.iloc[:, 0]

    # Calculate the popularity weights
    max_popularity = df['popularity_normalized'].max()
    popularity_weights = (max_popularity - df['popularity_normalized']) / max_popularity

    # Multiply the similarity scores by popularity weights
    weighted_similarity = similarity_dataframe.values * popularity_weights.values

    similarity_dataframe = pd.Series(weighted_similarity, index=similarity_dataframe.index)
    similarity_dataframe = similarity_dataframe.sort_values(ascending=False)

    # Exclude the show_title from the recommended titles
    recommended_titles = list(similarity_dataframe.index)
    recommended_titles = [title for title in recommended_titles if title != show_title]

    return recommended_titles[:n_recom]

In [12]:
# Removes titles similar
def recommend_remove_sim(show_title, n_recom, vectorized_bag_of_words, title, df):
    show_indices = np.where(title == show_title)[0]
    if len(show_indices) == 0:
        print(f"Show title '{show_title}' not found in the DataFrame.")
        return []  # Return an empty list if the show title is not found

    similarity_matrix = cosine_similarity(vectorized_bag_of_words, vectorized_bag_of_words[list(np.where(title == show_title)[0]), :])
    similarity_dataframe = pd.DataFrame(similarity_matrix)
    similarity_dataframe.index = title
    similarity_dataframe = similarity_dataframe.iloc[:, [0]]

    # Calculate the popularity weights
    max_popularity = df['popularity_normalized'].max()
    popularity_weights = (1 - df['popularity_normalized']) / max_popularity

    # Multiply the similarity scores by popularity weights
    similarity_dataframe *= popularity_weights

    similarity_dataframe = similarity_dataframe.sort_values(by=0, ascending=False)
    similarity_dataframe = similarity_dataframe.drop_duplicates()

    recommended_titles = list(similarity_dataframe.index)
    recommended_titles = [title for title in recommended_titles if show_title not in title]  # Remove titles containing the show_title input
    #recommended_titles = [title for title in recommended_titles if show_title not in title or show_title == title]

    return recommended_titles[:n_recom]

In [13]:
# Considers ALL titles similar to input + popularity
def recommend_all_sim(show_title, n_recom, vectorized_bag_of_words, title, df):
    show_indices = np.where(title == show_title)[0]
    if len(show_indices) == 0:
        print(f"Show title '{show_title}' not found in the DataFrame.")
        return []  # Return an empty list if the show title is not found

    similarity_matrix = cosine_similarity(vectorized_bag_of_words, vectorized_bag_of_words[list(np.where(title == show_title)[0]), :])
    similarity_dataframe = pd.DataFrame(similarity_matrix)
    similarity_dataframe.index = title

    similarity_dataframe = similarity_dataframe.iloc[:, [0]]

    # Calculate the popularity weights
    max_popularity = df['popularity_normalized'].max()
    
    popularity_weights = (1 - df['popularity_normalized']) / max_popularity

    #print(popularity_weights)

    # Multiply the similarity scores by popularity weights
    similarity_dataframe *= popularity_weights

    similarity_dataframe = similarity_dataframe.sort_values(by = 0, ascending=False)
    similarity_dataframe = similarity_dataframe.drop_duplicates()

    # Find similar titles using fuzzy matching
    similar_titles = []
    for title in similarity_dataframe.index:
        if fuzz.partial_ratio(show_title, title) >= 30:  # Set a threshold for similarity
            if show_title not in title:
                similar_titles.append(title)

    return similar_titles[:n_recom]

In [14]:
def vectorize_genre_and_words(genre_column, synopsis, genre_weight=2.0, words_weight=1.0):
    #extracting keywords for recommender
    rake = Rake()
    words = []
    for plot in synopsis:
        rake.extract_keywords_from_text(str(plot))
        keywords_i = rake.get_ranked_phrases()
        keywords_i_string = ""
        for keyword in keywords_i:
            keywords_i_string = keywords_i_string + " " + keyword
        words.append(keywords_i_string)
    #temp_df['words'] = words
    # Ended up not needing above
    
    #Adjust weight as needed
    # Combine genre and words into a single column
    #combined_column = genre_column + words
    # This ^ ultimately ended up not being used, will leave in for legacy

    # Create a TF-IDF vectorizer
    vectorizer_words = TfidfVectorizer()
    vectorized_words = vectorizer_words.fit_transform(words)
                                                    #try temp_df['words'] if words dont work

    # Create a TF-IDF vectorizer
    vectorizer_genre = TfidfVectorizer()
    vectorized_genre = vectorizer_genre.fit_transform(genre_column)

    # Get the number of genres
    num_genre_features = vectorized_genre.shape[1]

    # Apply the weights to the genre and words vectors
    weighted_vectorized_genre = vectorized_genre.multiply(genre_weight)
    weighted_vectorized_words = vectorized_words.multiply(words_weight)

    # Combine the genre and words vectors
    vectorized_combined = hstack([weighted_vectorized_genre, weighted_vectorized_words])

    # Convert to csr_matrix
    vectorized_combined = csr_matrix(vectorized_combined)

    # Convert to array
    vectorized_combined = vectorized_combined.toarray()

    return vectorized_combined

# First Recc Model just using CountVect (conclusion and why this sucks at end)

## Preparation

In [15]:
#prepping dfs to use
df_touse = df_anime[["synopsis_processed","genre","title","popularity", "popularity_normalized"]]
synopsis = df_touse['synopsis_processed']
genre = df_touse['genre']
title = df_touse['title']

In [16]:
#extracting keywords for recommender
rake = Rake()
words = []
for plot in synopsis:
    rake.extract_keywords_from_text(str(plot))
    keywords_i = rake.get_ranked_phrases()
    keywords_i_string = ""
    for keyword in keywords_i:
        keywords_i_string = keywords_i_string + " " + keyword
    words.append(keywords_i_string)
df_touse['words'] = words

#combining to use in vectorizer
df_touse['all_words'] = df_touse['genre'] + df_touse['words']
vectorizer = CountVectorizer()
vectorized_bag_of_words = vectorizer.fit_transform(df_touse['all_words'])
vectorized_bag_of_words = vectorized_bag_of_words.toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_touse['words'] = words
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_touse['all_words'] = df_touse['genre'] + df_touse['words']


## Running recc models on CV'd words

In [17]:
recommend_reg("Fullmetal Alchemist",10, vectorized_bag_of_words, title, df_touse)

['Fullmetal Alchemist: Brotherhood',
 'Fullmetal Alchemist: The Sacred Star of Milos',
 'Fullmetal Alchemist: The Conqueror of Shamballa',
 'Fullmetal Alchemist: Brotherhood Specials',
 'Oniichan dakedo Ai sae Areba Kankeinai yo ne!',
 'Loveless',
 'Fullmetal Alchemist: The Sacred Star of Milos Specials',
 'Fairy Tail Movie 1: Houou no Miko',
 'Fullmetal Alchemist: Reflections',
 'Soukou no Strain']

In [18]:
recommend_remove_sim("Fullmetal Alchemist",10, vectorized_bag_of_words, title, df_touse)

['Oniichan dakedo Ai sae Areba Kankeinai yo ne!',
 'Oniichan Zurui',
 'Wana: Hakudaku Mamire no Houkago',
 'Loveless',
 'OZ',
 'Aniki no Yome-san nara, Ore ni Hamerarete Hiihii Itteru Tokoro Da yo',
 'Street Fighter Zero The Animation',
 'Soukou no Strain',
 'Mirai Shounen Conan 2: Taiga Daibouken',
 'Sengoku Musou']

In [19]:
recommend_all_sim("Fullmetal Alchemist",10, vectorized_bag_of_words, title, df_touse)

['Loveless',
 'Aniki no Yome-san nara, Ore ni Hamerarete Hiihii Itteru Tokoro Da yo',
 'Street Fighter Zero The Animation',
 'Oni',
 'Fushigi na Melmo',
 'Ta ga Tame no Alchemist',
 'Futari wa Nakayoshi: Goo to Sue',
 'Miboujin: Numeriau Nikuyoku to Midara ni Nureru Mitsusubo',
 'Xing Chen Bian',
 'Futago no Monchhichi']

## Conclusion

- Will consider the 3rd one the most optimal one, since it considers similar titles as one. (If you seen FMA:B, then you've prob seen all the movies associated with it)
- Will also leave the first one (recc_reg) too incase user wants similarities including similar titles

- But these recomendations overall for FMA:B suck ass. Loveless is Yaoi (hella gay)
    - nothing wrong with gay, but homosexual male romance as 'similar' to FMA:B is A WILD STRETCH.
- Seems like Loveless got recommended because the whole 'brother died or almost died' tragedy is in the synopsis, very similar to FMA, BUT THATS ABOUT IT and is considering that the most. 
- Other recomendations also seem brother related. 
    - Can safely conclude this is the result of synopsis being unweighted.



The next iterations will use a different vectorizer method instead, Tfidf. It will also weigh Genre as double and synopsis as 1. Then this whole processed will be put into a function so we can do this vectorization to feature engineered shows

# Using Tfidf + weights for synp and genre

In [20]:
# Will need 'title' 'genre' and 'synop processed' for this vectorizer

title = df_touse['title']
vectorized_weighed = vectorize_genre_and_words(df_touse['genre'], df_touse['synopsis_processed'], genre_weight=1.5, words_weight=0.9)

In [21]:
recommend_reg("Fullmetal Alchemist",10, vectorized_weighed, title, df_touse)

['Fullmetal Alchemist: Brotherhood',
 'Fullmetal Alchemist: The Sacred Star of Milos',
 'Fullmetal Alchemist: Brotherhood Specials',
 'Fullmetal Alchemist: The Conqueror of Shamballa',
 'Fairy Tail',
 'Fairy Tail (2014)',
 'Fairy Tail Movie 1: Houou no Miko',
 'Fairy Tail: Final Series',
 'Magi: The Labyrinth of Magic',
 'Magi: The Kingdom of Magic']

In [22]:
recommend_remove_sim("Fullmetal Alchemist",10, vectorized_weighed, title, df_touse)

['Tales of Vesperia: The First Strike',
 'Fairy Tail Movie 1: Houou no Miko',
 'Tide-Line Blue',
 'Katsute Kami Datta Kemono-tachi e',
 'Fairy Tail',
 'Fairy Tail x Rave',
 'Fairy Tail: Final Series',
 'Fairy Tail Movie 2: Dragon Cry',
 'Densetsu no Yuusha no Densetsu',
 'Fire Emblem']

In [23]:
recommend_all_sim("Fullmetal Alchemist",10, vectorized_weighed, title, df_touse)

['Tales of Vesperia: The First Strike',
 'Katsute Kami Datta Kemono-tachi e',
 'Fairy Tail',
 'Fairy Tail: Final Series',
 'Densetsu no Yuusha no Densetsu',
 'Fire Emblem',
 'Dragon Quest: Dai no Daibouken Buchiyabure!! Shinsei 6 Daishougun',
 'Slayers Great',
 'One Piece Film: Strong World',
 'Slayers: The Motion Picture']

## Conclusion:
- Seems like using tfidf and weighing genre more has helped.

# Feature Engineered Df

## Creating engineered df 1

In [24]:
# Creating top 1000 in popularity
df_anime_sorted = df_anime.sort_values('popularity', ascending = True)

df_anime_sorted.head(5)
df_top_1000 = df_anime_sorted.head(1000)

# Below will be process for feature engineered df's.
# Will need 'title' 'genre' and 'synop processed' for this vectorizer

title_t1000 = df_top_1000['title']
vectorized_weighed_t1000 = vectorize_genre_and_words(df_top_1000['genre'], df_top_1000['synopsis_processed'], genre_weight=1.5, words_weight=0.9)

## Running recc models of featured engineerd 1

In [25]:
recommend_reg("Fullmetal Alchemist",10, vectorized_weighed_t1000, title_t1000, df_top_1000)

['Fullmetal Alchemist: Brotherhood',
 'Fairy Tail',
 'Fairy Tail (2014)',
 'Magi: The Labyrinth of Magic',
 'Akame ga Kill!',
 'Magi: The Kingdom of Magic',
 'Gate: Jieitai Kanochi nite, Kaku Tatakaeri',
 'Nanatsu no Taizai',
 'Nanatsu no Taizai: Imashime no Fukkatsu',
 'Black Clover']

In [26]:
recommend_remove_sim("Fullmetal Alchemist",10, vectorized_weighed_t1000, title_t1000, df_top_1000)

['Fairy Tail Movie 1: Houou no Miko',
 'Fairy Tail',
 'Fairy Tail: Final Series',
 'Fairy Tail (2014)',
 'Densetsu no Yuusha no Densetsu',
 'Magi: Sinbad no Bouken (TV)',
 'Magi: The Labyrinth of Magic',
 'Magi: The Kingdom of Magic',
 'Nejimaki Seirei Senki: Tenkyou no Alderamin',
 'Gate: Jieitai Kanochi nite, Kaku Tatakaeri']

In [27]:
recommend_all_sim("Fullmetal Alchemist",10, vectorized_weighed_t1000, title_t1000, df_top_1000)

['Fairy Tail',
 'Fairy Tail: Final Series',
 'Densetsu no Yuusha no Densetsu',
 'Nejimaki Seirei Senki: Tenkyou no Alderamin',
 'Gate: Jieitai Kanochi nite, Kaku Tatakaeri',
 'Gate: Jieitai Kanochi nite, Kaku Tatakaeri 2nd Season',
 'One Piece Film: Strong World',
 'Black Clover',
 'Akame ga Kill!',
 'Tales of Zestiria the Cross']

## Feat Engineered 2: New df imported from EDA_3
- This df has removed most shows which are 2 or less episodes. These are movies and do not need to show up in the recc for shows.

In [28]:
df_anime_nomov = pd.read_csv("../data/df_anime_nomovie.csv")

In [33]:
df_anime_nomov.dropna(inplace = True)

In [34]:
#Process before running rec:
nomovie_title = df_anime_nomov['title']
vectorized_no_movie = vectorize_genre_and_words(df_anime_nomov['genre'], df_anime_nomov['synopsis_processed'], genre_weight=1.5, words_weight=0.9)

## Running recc on feat engineered 2

In [35]:
recommend_reg("Fullmetal Alchemist",10, vectorized_no_movie, nomovie_title, df_anime_nomov)

['Fullmetal Alchemist: Brotherhood',
 'Fullmetal Alchemist: The Sacred Star of Milos',
 'Fullmetal Alchemist: Brotherhood Specials',
 'Fullmetal Alchemist: The Conqueror of Shamballa',
 'Fairy Tail',
 'Fairy Tail (2014)',
 'Fairy Tail Movie 1: Houou no Miko',
 'Fairy Tail: Final Series',
 'Magi: The Labyrinth of Magic',
 'Magi: The Kingdom of Magic']

In [36]:
recommend_remove_sim("Fullmetal Alchemist",10, vectorized_no_movie, nomovie_title, df_anime_nomov)

['Tide-Line Blue',
 'Katsute Kami Datta Kemono-tachi e',
 'Fairy Tail Movie 1: Houou no Miko',
 'Fairy Tail',
 'Fairy Tail: Final Series',
 'Fairy Tail (2014)',
 'Densetsu no Yuusha no Densetsu',
 'Magi: Sinbad no Bouken (TV)',
 'Magi: Sinbad no Bouken',
 'Magi: The Labyrinth of Magic']

In [40]:
recommend_all_sim("Fullmetal Alchemist",10, vectorized_no_movie, nomovie_title, df_anime_nomov)

['Katsute Kami Datta Kemono-tachi e',
 'Fairy Tail',
 'Fairy Tail: Final Series',
 'Densetsu no Yuusha no Densetsu',
 'Chain Chronicle: Short Animation',
 'One Piece Film: Strong World',
 'Nejimaki Seirei Senki: Tenkyou no Alderamin',
 'Gate: Jieitai Kanochi nite, Kaku Tatakaeri',
 'Gate: Jieitai Kanochi nite, Kaku Tatakaeri 2nd Season',
 'Black Clover']

## Conclusion so far:
- I believe this feat engineered is the best I can get.

- The final model to push to streamlit will be:
    - df: df_anime_nomovie.csv
    - rec model: recommend_all_sim
        - will also put in reg incase user wants to see recommended within the same series
    