In [207]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [208]:
# Load the raw anime table; the relative path 'anime.csv' is resolved against the
# kernel's current working directory.
data = pd.read_csv('anime.csv')

In [209]:
data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [210]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [211]:
data.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

### Handling the missing values

In [212]:
data['genre'].value_counts()

genre
Hentai                                                  823
Comedy                                                  523
Music                                                   301
Kids                                                    199
Comedy, Slice of Life                                   179
                                                       ... 
Hentai, Parody, Sci-Fi                                    1
Hentai, Historical, Mystery                               1
Action, Comedy, Hentai                                    1
Comedy, Ecchi, Fantasy, Parody, Yuri                      1
Action, Mecha, Military, School, Sci-Fi, Super Power      1
Name: count, Length: 3264, dtype: int64

In [213]:
data['type'].value_counts()

type
TV         3787
OVA        3311
Movie      2348
Special    1676
ONA         659
Music       488
Name: count, dtype: int64

In [214]:
data['genre'].mode()

0    Hentai
Name: genre, dtype: object

In [215]:
data['type'].mode()

0    TV
Name: type, dtype: object

- We will replace the null values of genre and type with the mode of the respective features.

In [216]:
# Fill missing 'genre' / 'type' values with each column's most frequent value.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form is chained assignment, which leaves `data`
# unchanged when pandas Copy-on-Write is active.
data['genre'] = data['genre'].fillna(data['genre'].mode()[0])
data['type'] = data['type'].fillna(data['type'].mode()[0])

In [217]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [218]:
# Fill missing ratings with the median rating of the row's (type, genre) group.
# The built-in 'median' transform avoids running a Python lambda once per group,
# and fillna only touches rows whose rating is missing. Groups in which every
# rating is missing have a NaN median, so those rows remain NaN after this step.
data['rating'] = data['rating'].fillna(
    data.groupby(['type', 'genre'])['rating'].transform('median')
)

In [219]:
data.isnull().sum()

anime_id     0
name         0
genre        0
type         0
episodes     0
rating      67
members      0
dtype: int64

In [220]:
# Fallback imputation: fill any rating that is still missing with the median
# rating of the row's 'type' group (built-in transform instead of a per-group lambda).
data['rating'] = data['rating'].fillna(
    data.groupby('type')['rating'].transform('median')
)

In [221]:
data.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

### Feature Extraction

In [222]:
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

def preprocess_genre(data):
    """
    Transforms the 'genre' column into a Multi-Hot encoded format.
    Each unique genre gets a separate binary column, where '1' indicates
    the presence of the genre and '0' indicates its absence for each anime.

    The genre string is split on ', '; any non-string value (e.g. NaN) is
    treated as "no genres" and produces an all-zero row.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing anime data with a 'genre' column.
      It is not modified.

    Returns:
    - pd.DataFrame: A new DataFrame with the 'genre' column removed and the
      multi-hot encoded genre columns appended at the end.
    """
    # Work on a separate Series so the caller's 'genre' column is left intact.
    genre_lists = data['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

    genre_encoder = MultiLabelBinarizer()
    genre_encoded = pd.DataFrame(
        genre_encoder.fit_transform(genre_lists),
        columns=genre_encoder.classes_,
        # Reuse the input index: with a default RangeIndex here, concat would align
        # on labels and scatter NaN rows whenever `data` is filtered or shuffled.
        index=data.index,
    )
    return pd.concat([data.drop(columns=['genre']), genre_encoded], axis=1)

In [223]:
def preprocess_type(data):
    """
    One-hot encodes the 'type' column, transforming categorical values into binary columns.
    Each type value found in the data gets its own 'type_<value>' column holding 1.0
    where the anime has that type and 0.0 otherwise.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing anime data with a 'type' column.
      It is not modified.

    Returns:
    - pd.DataFrame: A new DataFrame with the 'type' column removed and the
      one-hot encoded columns appended at the end.
    """
    type_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    type_encoded = pd.DataFrame(
        type_encoder.fit_transform(data[['type']]),
        columns=type_encoder.get_feature_names_out(['type']),
        # Keep the input's index so concat pairs each encoded row with its source
        # row even when `data` does not carry a default 0..n-1 index.
        index=data.index,
    )
    return pd.concat([data.drop(columns=['type']), type_encoded], axis=1)

In [224]:
def preprocess_episodes(data):
    """
    Converts the 'episodes' column to a numeric format and fills any missing values
    with the median value of the 'episodes' column.

    Values that cannot be parsed as numbers are coerced to NaN first and are
    therefore filled with the median as well.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing anime data with an 'episodes' column.
      The column is replaced on this DataFrame.

    Returns:
    - pd.DataFrame: The same DataFrame object, with the 'episodes' column converted to
      numeric values and missing values replaced by the median.
    """
    episodes = pd.to_numeric(data['episodes'], errors='coerce')
    # Assign the filled Series back explicitly. Calling fillna(inplace=True) on
    # data['episodes'] is chained assignment and has no effect on `data` when
    # pandas Copy-on-Write is active.
    data['episodes'] = episodes.fillna(episodes.median())
    return data

In [225]:
def normalize_features(data):
    """
    Rescales the 'rating', 'members' and 'episodes' columns to the [0, 1] range
    with Min-Max scaling so the three numeric features share a comparable scale.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing numeric 'rating', 'members'
      and 'episodes' columns. The scaled values are written back into this DataFrame.

    Returns:
    - pd.DataFrame: The same DataFrame object with the three columns scaled to [0, 1].
    """
    scaled_columns = ['rating', 'members', 'episodes']
    # A fresh scaler is fitted on every call, so the min/max come from `data` itself.
    scaled_values = MinMaxScaler().fit_transform(data[scaled_columns])
    data[scaled_columns] = scaled_values
    return data

In [226]:
def extract_features(data):
    """
    Runs the full feature extraction pipeline, applying these steps in order:
    preprocess_genre, preprocess_type, preprocess_episodes, normalize_features.

    The 'anime_id' and 'name' columns are NOT removed here; no step in this
    function drops them, so they are still present in the returned frame.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing anime data.

    Returns:
    - pd.DataFrame: The DataFrame produced by the last pipeline step.
    """
    pipeline_steps = (
        preprocess_genre,
        preprocess_type,
        preprocess_episodes,
        normalize_features,
    )
    # Feed each step's output into the next one.
    for step in pipeline_steps:
        data = step(data)
    return data

In [227]:
# Replace `data` with the encoded / scaled feature frame.
# NOTE(review): this rebinds `data` to a frame without 'genre' and 'type' columns, so
# re-running this cell without re-running the load and cleaning cells above will fail.
data = extract_features(data= data)

In [228]:
data

Unnamed: 0,anime_id,name,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,...,Thriller,Vampire,Yaoi,Yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,32281,Kimi no Na wa.,0.000000,0.924370,0.197872,0,0,0,0,0,...,0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0
1,5114,Fullmetal Alchemist: Brotherhood,0.034673,0.911164,0.782770,1,1,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
2,28977,Gintama°,0.027518,0.909964,0.112689,1,0,0,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,9253,Steins;Gate,0.012658,0.900360,0.664325,0,0,0,0,0,...,1,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,9969,Gintama&#039;,0.027518,0.899160,0.149186,1,0,0,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,0.000000,0.297719,0.000203,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
12290,5543,Under World,0.000000,0.313325,0.000176,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
12291,5621,Violence Gekiga David no Hoshi,0.001651,0.385354,0.000211,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,0.000000,0.397359,0.000168,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0


### Recommendation System

In [229]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime_with_threshold(data, target_anime_id, threshold_values=[0.1,0.2,0.3,0.4,0.5,0.6, 0.7, 0.8], top_n=10):
    """
    Recommends similar anime based on cosine similarity for different threshold values.

    Every column except 'anime_id' and 'name' is used as a feature. The target row is
    compared against all rows, and for each threshold the most similar rows whose
    score is >= that threshold are returned, best match first. Rows carrying the
    target's own anime_id are never recommended.

    Parameters:
    - data (pd.DataFrame): The DataFrame with processed anime data. Must contain
      'anime_id' and 'name'; all remaining columns must be numeric. Any index is
      accepted (rows are addressed by position, not by index label).
    - target_anime_id (int): The anime ID of the target anime to base the recommendation on.
    - threshold_values (list): A list of threshold values for similarity scores to experiment with.
      The default list is only read, never modified.
    - top_n (int): The maximum number of recommendations to return per threshold.
      Values <= 0 yield empty recommendation lists.

    Returns:
    - dict: A dictionary with threshold values as keys and, as values, lists of
      {'anime_id', 'name', 'similarity_score'} dictionaries.

    Raises:
    - ValueError: If `data` has no 'anime_id' column.
    - IndexError: If `target_anime_id` does not occur in `data`.
    """
    if 'anime_id' not in data.columns:
        raise ValueError("The DataFrame must contain an 'anime_id' column.")

    anime_ids = data['anime_id'].to_numpy()
    names = data['name'].to_numpy()

    # Locate the target by POSITION. Looking up the index label and feeding it to
    # .iloc selects the wrong row (or runs out of bounds) whenever the index is not
    # 0..n-1, e.g. on the shuffled frame returned by a train/test split.
    target_positions = np.flatnonzero(anime_ids == target_anime_id)
    if target_positions.size == 0:
        raise IndexError(f"anime_id {target_anime_id!r} was not found in the DataFrame.")
    target_position = target_positions[0]

    features = data.drop(columns=['anime_id', 'name'], errors='ignore')
    target_anime_features = features.iloc[target_position].values.reshape(1, -1)
    similarity_scores = cosine_similarity(target_anime_features, features).flatten()

    # Rank all rows once (most similar first) and remove the target itself up front,
    # so each threshold only needs a boolean mask and a slice.
    ranked_positions = np.argsort(similarity_scores)[::-1]
    ranked_positions = ranked_positions[anime_ids[ranked_positions] != target_anime_id]
    ranked_scores = similarity_scores[ranked_positions]

    limit = max(int(top_n), 0)
    recommendations_by_threshold = {}
    for threshold in threshold_values:
        selected_positions = ranked_positions[ranked_scores >= threshold][:limit]
        recommendations_by_threshold[threshold] = [
            {
                'anime_id': anime_ids[position],
                'name': names[position],
                'similarity_score': similarity_scores[position],
            }
            for position in selected_positions
        ]

    return recommendations_by_threshold


In [230]:
# Recommendations for the anime with anime_id 129, using the function's default
# similarity thresholds and top_n.
recommendations_by_threshold = recommend_anime_with_threshold(data= data, target_anime_id= 129)

In [231]:
recommendations_by_threshold

{0.1: [{'anime_id': np.int64(479),
   'name': 'The Law of Ueki',
   'similarity_score': np.float64(0.9999094002169804)},
  {'anime_id': np.int64(31933),
   'name': 'JoJo no Kimyou na Bouken: Diamond wa Kudakenai',
   'similarity_score': np.float64(0.9319254741837084)},
  {'anime_id': np.int64(154),
   'name': 'Shaman King',
   'similarity_score': np.float64(0.9303313247380971)},
  {'anime_id': np.int64(236),
   'name': 'E&#039;s Otherwise',
   'similarity_score': np.float64(0.8715471687442468)},
  {'anime_id': np.int64(26055),
   'name': 'JoJo no Kimyou na Bouken: Stardust Crusaders 2nd Season',
   'similarity_score': np.float64(0.8477155511200375)},
  {'anime_id': np.int64(20899),
   'name': 'JoJo no Kimyou na Bouken: Stardust Crusaders',
   'similarity_score': np.float64(0.84719991072963)},
  {'anime_id': np.int64(250),
   'name': 'Konjiki no Gash Bell!!',
   'similarity_score': np.float64(0.8465227489310565)},
  {'anime_id': np.int64(874),
   'name': 'Digimon Tamers',
   'similarity

In [232]:
# Print one section per threshold: a header line, one bullet per recommendation
# (score rounded to 3 decimals), then a blank separator line.
for threshold, recommendations in recommendations_by_threshold.items():
    bullet_lines = [
        f"  - {recommendation['name']} (Similarity Score: {round(recommendation['similarity_score'],3)})"
        for recommendation in recommendations
    ]
    print(f"Recommendations for threshold {threshold}:")
    if bullet_lines:
        print("\n".join(bullet_lines))
    print()

Recommendations for threshold 0.1:
  - The Law of Ueki (Similarity Score: 1.0)
  - JoJo no Kimyou na Bouken: Diamond wa Kudakenai (Similarity Score: 0.932)
  - Shaman King (Similarity Score: 0.93)
  - E&#039;s Otherwise (Similarity Score: 0.872)
  - JoJo no Kimyou na Bouken: Stardust Crusaders 2nd Season (Similarity Score: 0.848)
  - JoJo no Kimyou na Bouken: Stardust Crusaders (Similarity Score: 0.847)
  - Konjiki no Gash Bell!! (Similarity Score: 0.847)
  - Digimon Tamers (Similarity Score: 0.846)
  - Digimon Savers (Similarity Score: 0.845)
  - Samurai Deeper Kyou (Similarity Score: 0.845)

Recommendations for threshold 0.2:
  - The Law of Ueki (Similarity Score: 1.0)
  - JoJo no Kimyou na Bouken: Diamond wa Kudakenai (Similarity Score: 0.932)
  - Shaman King (Similarity Score: 0.93)
  - E&#039;s Otherwise (Similarity Score: 0.872)
  - JoJo no Kimyou na Bouken: Stardust Crusaders 2nd Season (Similarity Score: 0.848)
  - JoJo no Kimyou na Bouken: Stardust Crusaders (Similarity Score:

### Evaluation

In [233]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

def split_data(data, test_size=0.2, random_state=42):
    """
    Split the dataset into training and testing sets.

    Parameters:
    - data (pd.DataFrame): The full dataset.
    - test_size (float): The proportion of the dataset to include in the test split.
    - random_state (int): Seed passed to train_test_split. The default of 42 keeps
      the split identical to the previously hard-coded behaviour.

    Returns:
    - pd.DataFrame, pd.DataFrame: The training and testing datasets.
    """
    return train_test_split(data, test_size=test_size, random_state=random_state)

In [234]:
# Split with split_data's defaults (test_size=0.2). These two frames are only
# inspected below; evaluate_recommendation_system performs its own split internally.
train,  test = split_data(data= data)

In [235]:
train

Unnamed: 0,anime_id,name,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,...,Thriller,Vampire,Yaoi,Yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
3013,5342,Asura Cryin&#039;,0.006604,0.651861,0.067662,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4253,9581,MM! Specials,0.004403,0.612245,0.021163,0,0,0,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
9791,9810,Nyani ga Nyandaa Nyandaa Kamen,0.045129,0.609844,0.000162,0,0,0,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
2629,1539,Touch: Cross Road - Kaze no Yukue,0.000000,0.665066,0.001487,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
4608,4439,Kurenai Sanshirou,0.013759,0.601441,0.000590,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,4638,Milkyway,0.000550,0.498199,0.000681,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
5191,5272,Tondemo Nezumi Daikatsuyaku,0.000000,0.583433,0.000244,0,1,0,0,0,...,0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0
5390,1262,Macross II: Lovers Again,0.002752,0.576230,0.006662,0,1,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
860,22819,Aikatsu! Movie,0.000000,0.734694,0.002769,0,0,0,0,0,...,0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0


In [236]:
def evaluate_recommendation_system(data, target_anime_id, threshold_values=[0.1,0.2,0.3,0.4,0.5,0.6, 0.7, 0.8], top_n=10, test_size=0.2):
    """
    Computes precision, recall and F1-score per similarity threshold.

    The data is split with split_data, recommendations are generated from the
    training part only, and the "relevant" items are the test-part rows whose
    anime_id appears among those recommendations.

    NOTE(review): recommendations come from the training rows while relevance is
    looked up in the test rows. Unless an anime_id occurs in both parts, no
    recommended ID can be found in the test part, so every metric comes out as 0.
    A meaningful evaluation needs an independent ground-truth definition of relevance.

    Parameters:
    - data (pd.DataFrame): The DataFrame with processed anime data.
    - target_anime_id (int): The anime ID of the target anime to base the recommendation on.
    - threshold_values (list): A list of threshold values for similarity scores to experiment with.
    - top_n (int): The number of top recommendations to consider.
    - test_size (float): The proportion of the dataset to use as the test set.

    Returns:
    - dict: A dictionary with threshold values as keys and, as values, dictionaries
      with the keys 'precision', 'recall' and 'f1_score'.
    """
    train_data, test_data = split_data(data, test_size)
    recs_per_threshold = recommend_anime_with_threshold(train_data, target_anime_id, threshold_values, top_n)

    def _ratio_or_zero(numerator, denominator):
        # Guard against division by zero; an undefined ratio is reported as 0.
        return numerator / denominator if denominator > 0 else 0

    metrics_by_threshold = {}
    for threshold, recs in recs_per_threshold.items():
        # Consider at most top_n recommendations.
        recommended_ids = [rec['anime_id'] for rec in recs[:top_n]]

        # Test rows whose ID is among the recommended IDs count as relevant.
        in_recommended = test_data['anime_id'].isin(recommended_ids)
        relevant_ids = test_data.loc[in_recommended, 'anime_id'].values

        true_positives = sum(1 for anime_id in recommended_ids if anime_id in relevant_ids)
        false_positives = len(recommended_ids) - true_positives
        false_negatives = sum(1 for anime_id in relevant_ids if anime_id not in recommended_ids)

        precision = _ratio_or_zero(true_positives, true_positives + false_positives)
        recall = _ratio_or_zero(true_positives, true_positives + false_negatives)
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0

        metrics_by_threshold[threshold] = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }

    return metrics_by_threshold


In [240]:
# Evaluate the recommender for a single target anime and print the metrics
# for every threshold, two decimals each, with a blank line between thresholds.
target_anime_id = 9969

metrics_by_threshold = evaluate_recommendation_system(data, target_anime_id)

for threshold, metrics in metrics_by_threshold.items():
    report_lines = [
        f"Metrics for threshold {threshold}:",
        f"  - Precision: {metrics['precision']:.2f}",
        f"  - Recall: {metrics['recall']:.2f}",
        f"  - F1-Score: {metrics['f1_score']:.2f}",
        "",
    ]
    print(*report_lines, sep="\n")

Metrics for threshold 0.1:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

Metrics for threshold 0.2:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

Metrics for threshold 0.3:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

Metrics for threshold 0.4:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

Metrics for threshold 0.5:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

Metrics for threshold 0.6:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

Metrics for threshold 0.7:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

Metrics for threshold 0.8:
  - Precision: 0.00
  - Recall: 0.00
  - F1-Score: 0.00

