# **Recommandetion System**

# **Data Preprocessing**

In [160]:
# Load the dataset into a suitable data structure (e.g., pandas DataFrame).
import pandas as pd

# Load the dataset into a pandas DataFrame
anime_data = pd.read_csv('anime.csv')

# Show the first few rows of the dataset
anime_data.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [161]:
anime_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [162]:
# Check for missing values
anime_data.isnull().mean()

# Drop rows with missing ratings or essential columns (if any)
anime_data.dropna(subset=['rating', 'genre'], inplace=True)


In [163]:
anime_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [164]:
# Explore basic info and types of columns
anime_data.info()



<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


In [165]:
# Check for unique genres or any anomalies
anime_data['genre'].unique()


array(['Drama, Romance, School, Supernatural',
       'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen',
       'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen',
       ..., 'Action, Comedy, Hentai, Romance, Supernatural',
       'Hentai, Sports', 'Hentai, Slice of Life'], dtype=object)

In [166]:
anime_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175



# **Feature Extraction**

In [167]:
from sklearn.metrics.pairwise import cosine_similarity

# Creating a matrix where each row is an anime and each column is the rating (since we have only one attribute, the matrix will be 1-dimensional in features)
ratings_matrix = anime_data[['rating']]

# Compute the cosine similarity matrix from the ratings matrix
similarity_matrix = cosine_similarity(ratings_matrix)

# Convert the similarity matrix to a DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=anime_data['name'], columns=anime_data['name'])


# **Recommendation System**

In [168]:
def get_similar_anime(anime_name, similarity_data, top_n=10):
    if anime_name not in similarity_data.index:
        return "Anime not found in the dataset."

    # Get similarity scores for the given anime with all others
    similar_scores = similarity_data.loc[anime_name]

    # Sort the scores in descending order
    similar_scores = similar_scores.sort_values(ascending=False)

    # Return the top n most similar anime
    return similar_scores.head(top_n+1)[1:]  # plus one because the first entry will be the anime itself with a similarity of 1


In [169]:
#only with rating not much information
recommended_animes_fma = get_similar_anime('Toushindai My Lover: Minami tai Mecha-Minami', similarity_df)
print(recommended_animes_fma)


name
Pop                                 1.0
The Embryo Develops into a Fetus    1.0
The Wash Bird of the Wash Island    1.0
Dead Girl Trailer                   1.0
Born by Myself                      1.0
Pinky                               1.0
Micro Teukgongdae Diatron 5         1.0
G-senjou no Higeki                  1.0
CCW: Crazy Clay Wrestling           1.0
Within the Bloody Woods             1.0
Name: Toushindai My Lover: Minami tai Mecha-Minami, dtype: float64


In [170]:
#only with rating not much information
recommended_animes_fma = get_similar_anime('Naruto', similarity_df)
print(recommended_animes_fma)


name
Pop                                 1.0
The Embryo Develops into a Fetus    1.0
The Wash Bird of the Wash Island    1.0
Dead Girl Trailer                   1.0
Born by Myself                      1.0
Pinky                               1.0
Micro Teukgongdae Diatron 5         1.0
G-senjou no Higeki                  1.0
CCW: Crazy Clay Wrestling           1.0
Within the Bloody Woods             1.0
Name: Naruto, dtype: float64


In [171]:
#only with rating not much information
recommended_animes_fma = get_similar_anime('shar', similarity_df)
print(recommended_animes_fma)


Anime not found in the dataset.


In [172]:
from sklearn.model_selection import train_test_split

# Example train-test split (if necessary for evaluation)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [173]:
#use genre also
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Assuming 'genre' column is cleaned and ready to use
tfidf = TfidfVectorizer(stop_words='english')
anime_data['genre'] = anime_data['genre'].fillna('')  # Fill missing values with empty string
tfidf_matrix = tfidf.fit_transform(anime_data['genre'])

# Compute the cosine similarity matrix from the TF-IDF vectors
genre_similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

# Convert to DataFrame for better handling
genre_similarity_df = pd.DataFrame(genre_similarity_matrix, index=anime_data['name'], columns=anime_data['name'])

# Use this new genre-based similarity along with the original ratings-based similarity
combined_similarity = (similarity_df + genre_similarity_df) / 2

# Recommendation function can now use this combined similarity
recommended_animes_fma = get_similar_anime('Fullmetal Alchemist: Brotherhood', combined_similarity)
print(recommended_animes_fma)


name
Fullmetal Alchemist                                                  0.986747
Fullmetal Alchemist: The Sacred Star of Milos                        0.986747
Fullmetal Alchemist: Brotherhood Specials                            0.977751
Tales of Vesperia: The First Strike                                  0.937341
Tide-Line Blue                                                       0.916445
Fullmetal Alchemist: Reflections                                     0.905581
Magi: The Kingdom of Magic                                           0.892404
Log Horizon Recap                                                    0.892404
Dragon Quest: Dai no Daibouken Buchiyabure!! Shinsei 6 Daishougun    0.892404
Magi: The Labyrinth of Magic                                         0.892404
Name: Fullmetal Alchemist: Brotherhood, dtype: float64


# **Evaluation**

In [174]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets (80% train, 20% test)
train_df, test_df = train_test_split(anime_data, test_size=0.2, random_state=42)

# Create a set of anime names from the test set
test_anime_titles = set(test_df['name'])


In [175]:
# Feature extraction for the training set (similar to what you've done before)
ratings_matrix_train = train_df[['rating']]
similarity_matrix_train = cosine_similarity(ratings_matrix_train)

# Convert similarity matrix to DataFrame
similarity_df_train = pd.DataFrame(similarity_matrix_train, index=train_df['name'], columns=train_df['name'])

# For genre similarity (using TF-IDF)
tfidf = TfidfVectorizer(stop_words='english')
train_df['genre'] = train_df['genre'].fillna('')  # Fill missing values with empty string
tfidf_matrix_train = tfidf.fit_transform(train_df['genre'])

# Compute the cosine similarity matrix from the TF-IDF vectors for genres
genre_similarity_matrix_train = linear_kernel(tfidf_matrix_train, tfidf_matrix_train)

# Convert to DataFrame for easier handling
genre_similarity_df_train = pd.DataFrame(genre_similarity_matrix_train, index=train_df['name'], columns=train_df['name'])

# Combine the ratings similarity and genre similarity
combined_similarity_train = (similarity_df_train + genre_similarity_df_train) / 2


In [176]:
def get_similar_anime_for_test(anime_name, similarity_data, top_n=10):
    """Get top N similar anime for a given anime in the test set."""
    if anime_name not in similarity_data.index:
        return f"Anime '{anime_name}' not found in the dataset."

    # Get similarity scores for the given anime with all others
    similar_scores = similarity_data.loc[anime_name]

    # Sort the scores in descending order
    similar_scores = similar_scores.sort_values(ascending=False)

    # Exclude the anime itself (first entry is always 1.0 similarity with itself)
    similar_scores = similar_scores[1:]

    # Return the top N most similar anime (excluding the original anime itself)
    return similar_scores.head(top_n)

# Example: Get similar anime for a test anime from the test set
test_anime_name = test_df['name'].iloc[0]  # Pick the first anime from the test set as an example
recommended_animes_test = get_similar_anime_for_test(test_anime_name, combined_similarity_train, top_n=3)
print(f"Recommended anime for '{test_anime_name}':\n{recommended_animes_test}")


Recommended anime for 'Koutetsu Tenshi Kurumi Zero':
Anime 'Koutetsu Tenshi Kurumi Zero' not found in the dataset.


In [177]:
# Check cosine similarity for a specific anime
test_anime_name = 'Fullmetal Alchemist: Brotherhood'  # Replace with a valid anime name from your dataset
print(f"Similarity scores for '{test_anime_name}':")
print(combined_similarity_train[test_anime_name].sort_values(ascending=False).head(10))


Similarity scores for 'Fullmetal Alchemist: Brotherhood':
name
Fullmetal Alchemist: Brotherhood                                     1.000000
Fullmetal Alchemist: The Sacred Star of Milos                        0.986628
Fullmetal Alchemist: Brotherhood Specials                            0.977723
Tales of Vesperia: The First Strike                                  0.937139
Fullmetal Alchemist: Reflections                                     0.905800
Log Horizon Recap                                                    0.893328
Dragon Quest: Dai no Daibouken Buchiyabure!! Shinsei 6 Daishougun    0.893328
Meoteoldosawa Ttomae                                                 0.893328
Magi: The Labyrinth of Magic                                         0.893328
Chain Chronicle: Haecceitas no Hikari Part 1                         0.893328
Name: Fullmetal Alchemist: Brotherhood, dtype: float64


In [178]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Function to calculate precision, recall, and F1 score
def evaluate_recommendation_system(test_df, similarity_data, top_n=10):
    precisions = []
    recalls = []
    f1_scores = []

    # Iterate through each anime in the test set
    for test_title in test_df['name']:
        # Get top N recommended anime for the test anime from the training set
        recommended_titles = get_similar_anime_for_test(test_title, similarity_data, top_n=top_n)

        # Check if recommended_titles is a string (anime not found)
        if isinstance(recommended_titles, str):
            # Handle the case where the anime is not found
            recommended_set = set() # or any other appropriate handling
        else:
            # Convert recommended titles to a set
            recommended_set = set(recommended_titles.index)

        # Treat the relevant anime as those that appear in the test set
        relevant_set = set([test_title])

        # Calculate Precision: How many of the recommended anime are relevant
        precision = len(recommended_set.intersection(relevant_set)) / len(recommended_set) if len(recommended_set) > 0 else 0

        # Calculate Recall: How many relevant anime are recommended
        recall = len(recommended_set.intersection(relevant_set)) / len(relevant_set) if len(relevant_set) > 0 else 0

        # Calculate F1 Score
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0

        # Append the results for each test anime
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    # Calculate average metrics
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1 = np.mean(f1_scores)

    return avg_precision, avg_recall, avg_f1

# Evaluate the system
avg_precision, avg_recall, avg_f1 = evaluate_recommendation_system(test_df, combined_similarity_train, top_n=3)
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")

Average Precision: 0.0000
Average Recall: 0.0000
Average F1-Score: 0.0000


# **Interview Questions**

## **1. Can you explain the difference between user-based and item-based collaborative filtering?**

**User-based collaborative filtering:** This method recommends items based on the preferences of similar users. It identifies users who have similar tastes and recommends items that those similar users have liked.

**Item-based collaborative filtering:** This method recommends items that are similar to items a user has liked or interacted with. It identifies similarities between items based on user behavior, and recommends items similar to those the user has already shown interest in.

## **2. What is collaborative filtering, and how does it work?**

**Collaborative filtering** is a technique used in recommendation systems to predict a user's interests by collecting preferences from many users. The key idea is that if users agree on one issue, they are likely to agree on others. It works by analyzing either user-user or item-item interactions, usually based on a user-item rating matrix.

**User-based:** It finds users who are similar to the target user and recommends items based on what those similar users have rated highly.

**Item-based:** It finds items similar to those the user has liked and recommends those items.