In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the datasets are read from it below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive: the joblib/CSV artifacts written later
# use relative file names and therefore land in this directory.
%cd '/content/drive/MyDrive/NLP Bhaiya'

/content/drive/MyDrive/NLP Bhaiya


In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import joblib
from sklearn.preprocessing import OneHotEncoder

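## Collaborative filter training

Build a user × anime score matrix from the cleaned reviews, fit a nearest-neighbours model on it, and save the model together with its lookup tables.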

In [None]:
def read_data(data_dir='/content/drive/MyDrive/NLP Bhaiya'):
    """Load the review, anime and user-profile CSV files.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``review_cleaned.csv``, ``animes.csv`` and
        ``profiles.csv``. Defaults to the project folder on Google Drive, so
        calling ``read_data()`` with no argument behaves as before.

    Returns
    -------
    tuple
        ``(rating_data, anime_data, user_data)`` as DataFrames. Exact duplicate
        rows are removed from the anime and user frames; the ratings are
        returned exactly as read.
    """
    import os

    rating_data = pd.read_csv(os.path.join(data_dir, 'review_cleaned.csv'))

    anime_data = pd.read_csv(os.path.join(data_dir, 'animes.csv'))
    anime_data.drop_duplicates(inplace=True)

    user_data = pd.read_csv(os.path.join(data_dir, 'profiles.csv'))
    user_data.drop_duplicates(inplace=True)

    return rating_data, anime_data, user_data

In [None]:
def create_item_matrix(rating_data):
    """Build a dense profile × anime score matrix from the ratings table.

    Parameters
    ----------
    rating_data : DataFrame
        Must provide the columns ``profile``, ``anime_uid`` and ``score``.

    Returns
    -------
    index_to_profile : dict
        Row index -> profile.
    matrix : numpy.ndarray
        Shape ``(n_profiles, n_anime)``; cell ``[row, col]`` holds the score the
        profile gave that anime and 0 where no rating exists. Rows follow the
        order of the grouped profiles, columns follow the first appearance of
        each ``anime_uid`` in ``rating_data``.
    profile_to_index : dict
        Profile -> row index (inverse of ``index_to_profile``).
    """
    # One {anime_uid: score} dict per profile; if a profile rated the same anime
    # more than once, the last score in the group wins.
    ratings_by_profile = (
        rating_data
        .groupby('profile')
        .apply(lambda grp: dict(zip(grp.anime_uid, grp.score)))
        .to_dict()
    )

    anime_ids = rating_data['anime_uid'].unique()
    column_of = dict(zip(anime_ids, range(len(anime_ids))))

    index_to_profile = dict(enumerate(ratings_by_profile))
    profile_to_index = {profile: row for row, profile in index_to_profile.items()}

    matrix = np.zeros((len(ratings_by_profile), len(anime_ids)))
    for row, profile in index_to_profile.items():
        for anime_id, score in ratings_by_profile[profile].items():
            # Ids that cannot be looked up are skipped, leaving the cell at 0.
            if anime_id in column_of:
                matrix[row, column_of[anime_id]] = score

    return index_to_profile, matrix, profile_to_index

In [None]:
def train_model(matrix):
    """Fit a brute-force nearest-neighbours model on ``matrix``.

    Parameters
    ----------
    matrix : array-like
        One row per sample to index.

    Returns
    -------
    NearestNeighbors
        The fitted estimator (created with ``algorithm='brute'`` and otherwise
        default settings).
    """
    from sklearn.neighbors import NearestNeighbors

    # fit() returns the estimator itself, so construct and fit in one expression.
    return NearestNeighbors(algorithm='brute').fit(matrix)

In [None]:
def Save_details(rating_data, anime_data, user_data, index_to_profile, matrix, profile_to_index):
    """Train the collaborative KNN model and persist it with its companion objects.

    The model is fitted on ``matrix`` via ``train_model`` and every object is
    written with ``joblib.dump`` under a fixed, relative file name, i.e. into the
    current working directory.

    Parameters
    ----------
    rating_data, anime_data, user_data
        Objects saved as ``rating_data.joblib``, ``anime_data.joblib`` and
        ``user_data.joblib``.
    index_to_profile, profile_to_index
        Lookup tables saved as ``index_to_profile.joblib`` and
        ``profile_to_index.joblib``.
    matrix
        Training matrix for the model; also saved as ``user_item_matrix.joblib``.
    """
    # File name -> object, in the order the files are written.
    artifacts = {
        'knn_model_Collaborative.joblib': train_model(matrix),
        'profile_to_index.joblib': profile_to_index,
        'index_to_profile.joblib': index_to_profile,
        'user_item_matrix.joblib': matrix,
        'user_data.joblib': user_data,
        'rating_data.joblib': rating_data,
        'anime_data.joblib': anime_data,
    }
    for filename, obj in artifacts.items():
        joblib.dump(obj, filename)

In [None]:
rating_data, anime_data, user_data = read_data()
# Cleaning Data
# Fill missing values on the frame itself. The previous form,
# `anime_data[col].fillna(..., inplace=True)`, is a chained in-place call that
# operates on a temporary Series: it warns on recent pandas and no longer updates
# `anime_data` at all under Copy-on-Write, which would leave NaNs in these columns.
anime_data = anime_data.fillna({'synopsis': 'Not Available', 'score': 0, 'episodes': 0})
# Re-binding (instead of inplace=True) keeps the cell safe to re-run.
user_data = user_data.drop_duplicates()

In [None]:
# Build the profile × anime matrix with its lookup tables, then train the
# collaborative model and write everything to disk.
index_to_profile, matrix, profile_to_index = create_item_matrix(rating_data)
Save_details(
    rating_data=rating_data,
    anime_data=anime_data,
    user_data=user_data,
    index_to_profile=index_to_profile,
    matrix=matrix,
    profile_to_index=profile_to_index,
)

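## Content filter training

Embed each anime synopsis with BERT, combine the embeddings with the scaled score, the scaled episode count and the one-hot encoded genre, and fit a nearest-neighbours model on the combined features.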

In [None]:
import re

# Attribution tag that may be appended to a synopsis.
_MAL_REWRITE_TAG = '[Written by MAL Rewrite]'


def clean_text(text):
    """Normalise whitespace in a synopsis and drop a trailing MAL attribution tag.

    Every run of whitespace (including ``\\r`` and ``\\n``) collapses to a single
    space, trailing whitespace is removed, and a final
    ``[Written by MAL Rewrite]`` tag is cut off if present.

    Parameters
    ----------
    text : str
        Raw synopsis text.

    Returns
    -------
    str
        The cleaned synopsis.
    """
    cleaned_text = re.sub(r'\s+', ' ', text).rstrip()
    # str.rstrip(chars) strips any trailing characters from the given *set*, so
    # rstrip('[Written by MAL Rewrite]') also ate real letters at the end of a
    # synopsis (e.g. "return" -> "retu"). Remove the exact suffix instead.
    if cleaned_text.endswith(_MAL_REWRITE_TAG):
        cleaned_text = cleaned_text[:-len(_MAL_REWRITE_TAG)].rstrip()
    return cleaned_text

In [None]:
# Work on a copy so the content-feature steps below leave `anime_data` untouched.
anime_data_temp = anime_data.copy()

In [None]:
# Clean every synopsis (whitespace normalisation + attribution tag removal).
raw_synopses = anime_data_temp['synopsis']
anime_data_temp['cleaned_synopsis'] = raw_synopses.map(clean_text)

In [None]:
# Columns that are not used as content features; the raw synopsis is dropped
# because `cleaned_synopsis` replaces it.
unused_columns = [
    'synopsis', 'aired', 'members', 'popularity',
    'ranked', 'img_url', 'link', 'title',
]
anime_data_temp = anime_data_temp.drop(columns=unused_columns)

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer and model are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import torch


def _embed_synopsis(sentence):
    """Return the BERT pooled-output vector of one synopsis as a NumPy array."""
    encoded = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    # Inference only: no gradients are needed.
    with torch.no_grad():
        bert_out = model(**encoded)
    # One sentence per call, so take row 0 of the pooled output.
    return bert_out.pooler_output[0].numpy()


# One embedding per row, in the same order as the DataFrame.
syn_emb_list = [
    _embed_synopsis(synopsis)
    for synopsis in anime_data_temp['cleaned_synopsis'].values
]

anime_data_temp['cleaned_synopsis_embedding'] = syn_emb_list

In [None]:
# Snapshot of the frame including the embedding column. That column holds NumPy
# arrays, which a CSV stores as their text representation rather than as numbers.
embedding_csv_path = 'anime_cleaned_synopsis_embedding.csv'
anime_data_temp.to_csv(embedding_csv_path, index=True)

In [None]:
# Scale the numeric features to MinMaxScaler's default [0, 1] range, one column
# at a time (each column is reshaped to the 2-D (n, 1) layout the scaler expects).
scaler = MinMaxScaler()
score_normalized = scaler.fit_transform(anime_data_temp['score'].values.reshape(-1, 1))
# NOTE: despite its name, `rank_normalized` holds the scaled *episodes* column;
# the name is kept because a later cell refers to it. The scaler is refit here,
# so afterwards `scaler` only remembers the episodes range.
rank_normalized = scaler.fit_transform(anime_data_temp['episodes'].values.reshape(-1, 1))

# `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and removed in
# 1.4, where `OneHotEncoder(sparse=False)` raises TypeError. Try the current
# spelling first and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
# NOTE(review): every distinct value of `genre` becomes its own category. If a
# cell holds a whole list of genres as one string, each combination is encoded
# as a separate category rather than one column per genre - confirm this is
# the intended encoding.
genres_encoded = encoder.fit_transform(anime_data_temp['genre'].values.reshape(-1, 1))



In [None]:
from scipy.sparse import hstack
# Combine embeddings, normalized scores, ranks, and one-hot encoded genres.
# Stack the per-row embedding vectors into a single 2-D array first, then join
# all feature blocks column-wise (NumPy's hstack is used, not the scipy import).
embedding_matrix = np.asarray(anime_data_temp['cleaned_synopsis_embedding'].to_list())
feature_blocks = (embedding_matrix, score_normalized, rank_normalized, genres_encoded)
combined_features = np.hstack(feature_blocks)
joblib.dump(combined_features, 'Combined_Embedding')

['Combined_Embedding']

In [None]:
from sklearn.neighbors import NearestNeighbors

# Fit KNN model
# Ball-tree index over the combined content features; queries return 10
# neighbours by default. fit() returns the estimator itself.
knn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(combined_features)
joblib.dump(knn, 'knn_model_Content.joblib')

['knn_model_Content.joblib']