In [262]:
pip install pandas matplotlib seaborn scikit-learn numpy fuzzywuzzy

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [263]:
# All imports go here
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [264]:
animes_dataset = pd.read_csv("AnimeList.csv")  #my input file 
print(animes_dataset.columns)
num_rows = animes_dataset.shape[0]
print(num_rows)
animes_dataset["type"]
animes_dataset[['title', 'genre']]

Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme'],
      dtype='object')
14478


Unnamed: 0,title,genre
0,Inu x Boku SS,"Comedy, Supernatural, Romance, Shounen"
1,Seto no Hanayome,"Comedy, Parody, Romance, School, Shounen"
2,Shugo Chara!! Doki,"Comedy, Magic, School, Shoujo"
3,Princess Tutu,"Comedy, Drama, Magic, Romance, Fantasy"
4,Bakuman. 3rd Season,"Comedy, Drama, Romance, Shounen"
...,...,...
14473,Gutchonpa Omoshiro Hanashi,Kids
14474,Geba Geba Shou Time!,Comedy
14475,Godzilla: Hoshi wo Kuu Mono,"Action, Sci-Fi, Adventure, Fantasy"
14476,Nippon Mukashibanashi: Sannen Netarou,"Fantasy, Kids"


In [265]:
# exploring data set 

#Step 1:  remove unecessary columns :  image_url, broadcast, licensor, background, favorites, premiered, producer
animes_dataset_clean = animes_dataset.drop(columns=['image_url', 'broadcast','licensor','background','favorites','premiered','producer'])

missing_values = animes_dataset_clean.isnull().sum()

# Drop title_english - since we cannot do anything about it
# Drop title_synonyms - we cannot do anything about it
# rating and rank misisng values can be filled after clustering?
animes_dataset_clean = animes_dataset_clean.drop(columns=['title_english','title_synonyms'])
animes_dataset_clean = animes_dataset_clean[animes_dataset_clean['genre'].notna()]
animes_dataset_clean['genre'] = animes_dataset_clean['genre'].fillna('generic')  # Handle missing values


animes_dataset_clean['genres'] = animes_dataset_clean['genre'].str.split(', ')
all_genres = set()
animes_dataset_clean['genres'].apply(all_genres.update)

genre_data_frame = pd.DataFrame()

# Step 4: Add a binary column for each genre
for genre in all_genres:
    genre_data_frame[genre] = animes_dataset_clean['genres'].apply(lambda x: 1 if genre in x else 0)

genre_data_frame['anime_id'] = animes_dataset_clean['anime_id']
genre_data_frame['title'] = animes_dataset_clean['title']
genre_data_frame = genre_data_frame[['anime_id', 'title'] + list(all_genres)]

# Display the genre_data_frame
pd.set_option('display.max_columns', None)  # This ensures all columns are displayed

display(genre_data_frame)


Unnamed: 0,anime_id,title,Samurai,Thriller,Seinen,Game,Kids,Yuri,Action,Cars,Historical,Adventure,Magic,Mecha,Mystery,Yaoi,Shoujo,Josei,Shoujo Ai,School,Ecchi,Space,Drama,Romance,Demons,Supernatural,Slice of Life,Sports,Harem,Comedy,Parody,Psychological,Vampire,Military,Horror,Fantasy,Police,Super Power,Shounen,Hentai,Music,Sci-Fi,Shounen Ai,Martial Arts,Dementia
0,11013,Inu x Boku SS,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,2104,Seto no Hanayome,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,5262,Shugo Chara!! Doki,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,721,Princess Tutu,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,12365,Bakuman. 3rd Season,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14473,26089,Gutchonpa Omoshiro Hanashi,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14474,21525,Geba Geba Shou Time!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14475,37897,Godzilla: Hoshi wo Kuu Mono,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
14476,34193,Nippon Mukashibanashi: Sannen Netarou,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [266]:
def recommend_animes_Cosine(genre_df, input_data, top_n):

    # Extract only genre columns (excluding anime_id and title)
    genre_columns = genre_df.columns.difference(['anime_id', 'title'])

    # Compute cosine similarity between input and dataset
    similarity_scores = cosine_similarity(input_data[genre_columns], genre_df[genre_columns])

    # Convert to DataFrame for easy lookup
    similarity_df = pd.DataFrame(similarity_scores, columns=genre_df['title'])

    # Get top N recommendations (sorted by similarity score)
    recommendations = similarity_df.T.sort_values(by=0, ascending=False).reset_index()
    recommendations.columns = ['Anime Title', 'Similarity Score']

    return recommendations.head(top_n)


In [267]:
def recommend_animes(genre_df, anime_scores_df, input_data, top_n, weight_similarity=0.2, weight_score=0.8):

    
    # Ensure genre_df and anime_scores_df have the same anime_id and merge the two DataFrames
    combined_df = genre_df.merge(anime_scores_df[['anime_id', 'score']], on='anime_id', how='left')
    
    # Extract only genre columns (excluding anime_id, title, and score)
    genre_columns = genre_df.columns.difference(['anime_id', 'title', 'score'])

    # Compute cosine similarity between input and dataset
    similarity_scores = cosine_similarity(input_data[genre_columns], combined_df[genre_columns])

    # Convert to DataFrame for easy lookup
    similarity_df = pd.DataFrame(similarity_scores[0], index=combined_df['anime_id'], columns=['Similarity'])

    # Merge similarity scores with the combined dataset (including the 'Score' column)
    combined_df = combined_df.merge(similarity_df, on='anime_id')

    # Normalize scores (0 to 1 scale)
    scaler = MinMaxScaler()
    combined_df[['score', 'Similarity']] = scaler.fit_transform(combined_df[['score', 'Similarity']])

    # Compute final ranking score (weighted)
    combined_df['Final Score'] = (combined_df['Similarity'] * weight_similarity) + (combined_df['score'] * weight_score)

    # Get top N recommendations
    recommendations = combined_df.sort_values(by='Final Score', ascending=False)[['anime_id', 'title', 'Final Score']].head(top_n)

    return recommendations

In [268]:
# def get_closest_genres(input_genres, genre_columns, threshold=80):
#     matched_genres = []
    
#     for input_genre in input_genres:
#         # Perform fuzzy matching for each input genre
#         best_match, score = process.extractOne(input_genre, genre_columns, scorer=fuzz.token_sort_ratio)

#         # Add the best match to the list if score is above the threshold
#         if score >= threshold:
#             matched_genres.append(best_match)
#         else:
#             matched_genres.append('No match')
    
#     return matched_genres

In [269]:
input_genres = ['Romance','School','Comedy']
input_genres_transformed = pd.DataFrame([{genre: 1 if genre in input_genres else 0 for genre in all_genres}])
display(input_genres_transformed)
print(recommend_animes(genre_data_frame, animes_dataset_clean, input_genres_transformed,10))


Unnamed: 0,Samurai,Thriller,Seinen,Game,Kids,Yuri,Action,Cars,Historical,Adventure,Magic,Mecha,Mystery,Yaoi,Shoujo,Josei,Shoujo Ai,School,Ecchi,Space,Drama,Romance,Demons,Supernatural,Slice of Life,Sports,Harem,Comedy,Parody,Psychological,Vampire,Military,Horror,Fantasy,Police,Super Power,Shounen,Hentai,Music,Sci-Fi,Shounen Ai,Martial Arts,Dementia
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


       anime_id                                              title  \
8097      33161  Yahari Ore no Seishun Love Comedy wa Machigatt...   
3171      23289                           Gekkan Shoujo Nozaki-kun   
1025      32281                                     Kimi no Na wa.   
19         4224                                          Toradora!   
1091       7311                      Suzumiya Haruhi no Shoushitsu   
1077       5941                                         Cross Game   
1984      27633                  Gekkan Shoujo Nozaki-kun Specials   
10990     34822                                     Tsuki ga Kirei   
1453      34537                    Yoru wa Mijikashi Arukeyo Otome   
11         7054                              Kaichou wa Maid-sama!   

       Final Score  
8097      0.855200  
3171      0.852000  
1025      0.850670  
19        0.844405  
1091      0.839821  
1077      0.834919  
1984      0.832800  
10990     0.831299  
1453      0.830499  
11        0.829205 