In [1]:
pip install pandas matplotlib seaborn scikit-learn numpy fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [2]:
# All imports go here
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [3]:
animes_dataset = pd.read_csv("/kaggle/input/animelist/AnimeList.csv")  #my input file 
print(animes_dataset.columns)
num_rows = animes_dataset.shape[0]
print(num_rows)
animes_dataset["type"]
animes_dataset[['title', 'genre']]

Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme'],
      dtype='object')
14478


Unnamed: 0,title,genre
0,Inu x Boku SS,"Comedy, Supernatural, Romance, Shounen"
1,Seto no Hanayome,"Comedy, Parody, Romance, School, Shounen"
2,Shugo Chara!! Doki,"Comedy, Magic, School, Shoujo"
3,Princess Tutu,"Comedy, Drama, Magic, Romance, Fantasy"
4,Bakuman. 3rd Season,"Comedy, Drama, Romance, Shounen"
...,...,...
14473,Gutchonpa Omoshiro Hanashi,Kids
14474,Geba Geba Shou Time!,Comedy
14475,Godzilla: Hoshi wo Kuu Mono,"Action, Sci-Fi, Adventure, Fantasy"
14476,Nippon Mukashibanashi: Sannen Netarou,"Fantasy, Kids"


In [4]:
# exploring data set 

#Step 1:  remove unecessary columns :  image_url, broadcast, licensor, background, favorites, premiered, producer
animes_dataset_clean = animes_dataset.drop(columns=['image_url', 'broadcast','licensor','background','favorites','premiered','producer'])

missing_values = animes_dataset_clean.isnull().sum()

# Drop title_english - since we cannot do anything about it
# Drop title_synonyms - we cannot do anything about it
# rating and rank misisng values can be filled after clustering?
animes_dataset_clean = animes_dataset_clean.drop(columns=['title_english','title_synonyms'])
animes_dataset_clean = animes_dataset_clean[animes_dataset_clean['genre'].notna()]
animes_dataset_clean['genre'] = animes_dataset_clean['genre'].fillna('generic')  # Handle missing values


animes_dataset_clean['genres'] = animes_dataset_clean['genre'].str.split(', ')
all_genres = set()
animes_dataset_clean['genres'].apply(all_genres.update)

genre_data_frame = pd.DataFrame()

# Step 4: Add a binary column for each genre
for genre in all_genres:
    genre_data_frame[genre] = animes_dataset_clean['genres'].apply(lambda x: 1 if genre in x else 0)

genre_data_frame['anime_id'] = animes_dataset_clean['anime_id']
genre_data_frame['title'] = animes_dataset_clean['title']
genre_data_frame = genre_data_frame[['anime_id', 'title'] + list(all_genres)]

# Display the genre_data_frame
pd.set_option('display.max_columns', None)  # This ensures all columns are displayed

display(genre_data_frame)


Unnamed: 0,anime_id,title,Magic,Romance,Sci-Fi,Sports,Psychological,Yuri,Comedy,Shoujo Ai,Slice of Life,Fantasy,Shounen Ai,Hentai,Cars,Josei,Supernatural,Horror,Demons,Yaoi,Police,Shounen,Shoujo,Vampire,Ecchi,Military,Adventure,Seinen,Historical,Space,Game,Mystery,Dementia,Drama,Harem,Samurai,Mecha,Music,Kids,Martial Arts,Thriller,Parody,Super Power,School,Action
0,11013,Inu x Boku SS,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2104,Seto no Hanayome,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,5262,Shugo Chara!! Doki,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,721,Princess Tutu,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12365,Bakuman. 3rd Season,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14473,26089,Gutchonpa Omoshiro Hanashi,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
14474,21525,Geba Geba Shou Time!,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14475,37897,Godzilla: Hoshi wo Kuu Mono,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
14476,34193,Nippon Mukashibanashi: Sannen Netarou,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [5]:
def recommend_animes_Cosine(genre_df, input_data, top_n):

    # Extract only genre columns (excluding anime_id and title)
    genre_columns = genre_df.columns.difference(['anime_id', 'title'])

    # Compute cosine similarity between input and dataset
    similarity_scores = cosine_similarity(input_data[genre_columns], genre_df[genre_columns])

    # Convert to DataFrame for easy lookup
    similarity_df = pd.DataFrame(similarity_scores, columns=genre_df['title'])

    # Get top N recommendations (sorted by similarity score)
    recommendations = similarity_df.T.sort_values(by=0, ascending=False).reset_index()
    recommendations.columns = ['Anime Title', 'Similarity Score']

    return recommendations.head(top_n)


In [6]:
def recommend_animes_cosine(genre_df, anime_scores_df, input_data, top_n, weight_similarity=0.2, weight_score=0.8,weight_scoredby= 0):

    
    # Ensure genre_df and anime_scores_df have the same anime_id and merge the two DataFrames
    combined_df = genre_df.merge(anime_scores_df[['anime_id', 'score','scored_by']], on='anime_id', how='left')
    
    # Extract only genre columns (excluding anime_id, title, and score)
    genre_columns = genre_df.columns.difference(['anime_id', 'title', 'score','scored_by'])

    # Compute cosine similarity between input and dataset
    similarity_scores = cosine_similarity(input_data[genre_columns], combined_df[genre_columns])

    # Convert to DataFrame for easy lookup
    similarity_df = pd.DataFrame(similarity_scores[0], index=combined_df['anime_id'], columns=['Similarity'])

    # Merge similarity scores with the combined dataset (including the 'Score' column)
    combined_df = combined_df.merge(similarity_df, on='anime_id')

    # Normalize scores (0 to 1 scale)
    scaler = MinMaxScaler()
    combined_df[['score', 'Similarity','scored_by']] = scaler.fit_transform(combined_df[['score', 'Similarity','scored_by']])

    # Compute final ranking score (weighted)
    combined_df['Final Score'] = (combined_df['Similarity'] * weight_similarity) + (combined_df['score'] * weight_score) + (combined_df['scored_by'] * weight_scoredby)

    # Get top N recommendations
    recommendations = combined_df.sort_values(by='Final Score', ascending=False)[['anime_id', 'title', 'Final Score']].head(top_n)

    return recommendations

In [7]:
def get_closest_genres(input_genres, genre_columns, threshold=80):
    matched_genres = []
    
    for input_genre in input_genres:
        # Clean input by removing extra spaces and making it lowercase
        cleaned_input_genre = ' '.join(input_genre.split()).lower()

        # Use different fuzzy matching methods and choose the best match
        best_match = None
        highest_score = 0
        
        for genre in genre_columns:
            score = fuzz.partial_ratio(cleaned_input_genre, genre.lower())
            if score > highest_score:
                highest_score = score
                best_match = genre

        # Add the best match to the list if score is above the threshold
        if highest_score >= threshold:
            matched_genres.append(best_match)
        else:
            matched_genres.append('No match')
    
    return matched_genres

In [8]:
input_genres = ['Shounen','Adventure','Action']
input_genres = get_closest_genres(input_genres,all_genres)
print(input_genres)
input_genres_transformed = pd.DataFrame([{genre: 1 if genre in input_genres else 0 for genre in all_genres}])
# display(input_genres_transformed)
display(recommend_animes_cosine(genre_data_frame, animes_dataset_clean, input_genres_transformed,5))

['Shounen Ai', 'Adventure', 'Action']


Unnamed: 0,anime_id,title,Final Score
416,11061,Hunter x Hunter (2011),0.870221
2059,164,Mononoke Hime,0.867299
2545,5114,Fullmetal Alchemist: Brotherhood,0.84
8269,33383,Mask Masters,0.84
12089,4087,Michiko to Hatchin,0.8304


In [9]:
def merge_datasets(genre_df, anime_scores_df):
    return genre_df.merge(anime_scores_df[['anime_id', 'score', 'scored_by']], on='anime_id', how='left')

def get_genre_columns(genre_df):
    return genre_df.columns.difference(['anime_id', 'title', 'score', 'scored_by'])

def compute_similarity(input_anime, combined_df, genre_columns):
    similarity_scores = cosine_similarity(input_anime, combined_df[genre_columns])
    return pd.DataFrame(similarity_scores[0], index=combined_df['anime_id'], columns=['Similarity'])

def normalize_features(combined_df):
    scaler = MinMaxScaler()
    combined_df[['score', 'Similarity', 'scored_by']] = scaler.fit_transform(combined_df[['score', 'Similarity', 'scored_by']])
    return combined_df

def compute_final_score(combined_df, weight_similarity=0.7, weight_score=0.2, weight_scoredby=0):
    combined_df['Final Score'] = (
        combined_df['Similarity'] * weight_similarity + 
        combined_df['score'] * weight_score + 
        combined_df['scored_by'] * weight_scoredby
    )
    return combined_df

In [10]:

def find_closest_title(anime_title, combined_df):    
    titles = combined_df['title'].tolist()
    
    if 'title_english' in combined_df.columns:
        titles += combined_df['title_english'].dropna().tolist()
    
    if 'title_synonyms' in combined_df.columns:
        titles += combined_df['title_synonyms'].dropna().tolist()
    
    if 'title_japanese' in combined_df.columns:
        titles += combined_df['title_japanese'].dropna().tolist()
    
    titles = list(set(titles))
    
    match, score = process.extractOne(anime_title, titles)
    
    if score > 85:  
        return match
    return None


In [11]:
def recommend_anime_by_title(genre_df, anime_scores_df, anime_title, top_n=5, weight_similarity=0.2, weight_score=0.8, weight_scoredby=0):
    
    # Merge datasets
    combined_df = merge_datasets(genre_df, anime_scores_df)

    # Check if the anime exists
    # Check if the anime exists, if not, find the closest match
    if anime_title not in combined_df['title'].values:
        closest_match = find_closest_title(anime_title, combined_df)
        if closest_match:
            print(f"Anime '{anime_title}' not found. Did you mean '{closest_match}'?")
            anime_title = closest_match
        else:
            return f"Anime '{anime_title}' not found in dataset. No close matches found."


    # Get genre columns and extract input anime's genre vector
    genre_columns = get_genre_columns(genre_df)
    input_anime = combined_df[combined_df['title'] == anime_title][genre_columns]

    # Compute similarity
    similarity_df = compute_similarity(input_anime, combined_df, genre_columns)

    # Merge similarity scores with dataset
    combined_df = combined_df.merge(similarity_df, on='anime_id')

    # Normalize features
    combined_df = normalize_features(combined_df)

    # Compute final scores
    combined_df = compute_final_score(combined_df, weight_similarity, weight_score, weight_scoredby)

    # Get top N recommendations
    recommendations = combined_df[combined_df['title'] != anime_title] \
        .sort_values(by='Final Score', ascending=False)[['anime_id', 'title', 'Final Score']] \
        .head(top_n)

    return recommendations

In [12]:
recommend_anime_by_title(genre_data_frame, animes_dataset_clean, "Dragon Ball", top_n=5)

Unnamed: 0,anime_id,title,Final Score
2545,5114,Fullmetal Alchemist: Brotherhood,0.85547
416,11061,Hunter x Hunter (2011),0.851274
914,813,Dragon Ball Z,0.849964
34,21,One Piece,0.837503
17,1735,Naruto: Shippuuden,0.821067
