In [None]:
import ast

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the user ratings and discard the rows whose score is 0.
user_data = pd.read_csv('users.csv')
user_data = user_data[user_data["score"] != 0]

# Load the manga catalogue. The "genres" column is expected to hold the text of a
# Python list literal (it is later joined as a sequence of strings).
# ast.literal_eval parses literals only, so a crafted CSV cell cannot run
# arbitrary code the way eval() would.
manga_data = pd.read_csv('mangas.csv')
manga_data["genres"] = manga_data["genres"].apply(ast.literal_eval)
# Drop the rows whose "mean" is NaN.
manga_data = manga_data.dropna(subset=["mean"])

In [None]:
# Print the deep memory footprint of each DataFrame in MiB, one value per line
# (user_data first, then manga_data).
print(
    user_data.memory_usage(deep=True).sum() / 1024**2,
    manga_data.memory_usage(deep=True).sum() / 1024**2,
    sep="\n",
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Print the DataFrame.info() summary of the catalogue frame.
manga_data.info()

In [None]:
from recomendations.content_based_recommender import ContentBasedRecommender

In [None]:
# Recommender built from the catalogue frame and its 'synopsis' column.
# NOTE(review): ContentBasedRecommender is project code not visible here; presumably
# fit_transform() vectorises the text passed as the second argument — confirm.
rec_synopsis = ContentBasedRecommender(manga_data, manga_data['synopsis'])
rec_synopsis.fit_transform()


In [None]:
# Ask the synopsis-based recommender about 'Planetes'. The second argument is
# presumably the number of results (it is passed as n= elsewhere in this notebook).
rec_synopsis.get_recommendations('Planetes', 10)

In [None]:
# Second recommender over the same catalogue: each manga is described by its
# genre labels joined into a single space-separated string.
genre_synopsis = ContentBasedRecommender(
    manga_data,
    manga_data["genres"].apply(" ".join),
)
genre_synopsis.fit_transform()


In [None]:
# Same query as for the synopsis recommender, this time against the genre-based one.
genre_synopsis.get_recommendations('Planetes', 10)

In [None]:
# Reload the catalogue for the collaborative-filtering part of the notebook.
manga_df = pd.read_csv('mangas.csv')
# "genres" and "authors" are stored as the text of Python literals.
# ast.literal_eval parses literals only, so CSV content cannot execute code
# the way eval() would.
manga_df["genres"] = manga_df["genres"].apply(ast.literal_eval)
manga_df["authors"] = manga_df["authors"].apply(ast.literal_eval)
# errors='coerce' turns unparseable dates into NaT instead of raising.
manga_df["start_date"] = pd.to_datetime(manga_df["start_date"], errors='coerce')

# Reload the user ratings with a parsed "updated_at" timestamp.
# read_csv already yields a fresh 0..n-1 index, so no re-indexing is needed here.
user_df = pd.read_csv('users.csv')
user_df["updated_at"] = pd.to_datetime(user_df["updated_at"], errors='coerce')

In [None]:
# Attach the catalogue columns to every rating row. Overlapping column names get
# the "_user" / "_manga" suffixes; the inner join drops ratings whose manga_id is
# missing from the catalogue.
users = (
    user_df
    .join(
        manga_df.set_index('manga_id'),
        on='manga_id',
        how='inner',
        lsuffix='_user',
        rsuffix='_manga',
    )
    .dropna(subset=["mean"])                    # rows without a mean are dropped
    .loc[lambda ratings: ratings["score"] > 0]  # keep only positive scores
    # drop=True discards the old index instead of adding 'index' / 'level_0'
    # columns, which also keeps this cell safe to re-run. A clean 0..n-1 index
    # matters because the genre indicator frame is joined on the index later.
    .reset_index(drop=True)
)

# Keep the catalogue restricted to the same "has a mean" rows, with a clean
# 0..n-1 index (later cells look rows up by that index).
manga_df = manga_df.dropna(subset=["mean"]).reset_index(drop=True)


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Expand the list-valued 'genres' column into one 0/1 indicator column per genre.
# Every entry of users['genres'] is expected to already be a list; a value that
# was still the string form of a list would have to be parsed into a list first.
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(users['genres'])

# mlb.classes_ supplies the genre labels used as the column names.
genres_df = pd.DataFrame(data=genres_encoded, columns=mlb.classes_)

# Index-aligned join: genres_df carries a default 0..n-1 index, so `users`
# needs the same index for the rows to line up.
users = users.join(genres_df, how='left')


In [None]:
from surprise import SVD
from surprise import Dataset,Reader
from surprise.model_selection import cross_validate


In [None]:
# Wrap the (user, manga_id, score) columns in a Surprise Dataset; the Reader
# declares the rating scale as 1 to 10.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(users[['user', 'manga_id', 'score']], reader)

In [None]:
from surprise.model_selection import GridSearchCV

In [None]:
# SVD model with default parameters; it is fitted on the full trainset in a later cell.
svd = SVD()


In [None]:

# Hyper-parameter search for SVD: every combination of the values below is
# evaluated with 3-fold cross-validation on RMSE and MAE, using all cores.
param_grid = {
    'n_factors': [50, 100, 150],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.05],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
gs.fit(data)

# First line: best RMSE score. Second line: the parameters that produced it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep="\n")

# Refit the RMSE-best estimator on the complete dataset.
trainset = data.build_full_trainset()
algo = gs.best_estimator['rmse']
algo.fit(trainset)

In [None]:
# Display the estimator that the grid search selected for RMSE.
gs.best_estimator['rmse']

In [None]:
# Fit the default-parameter `svd` model on every available rating.
# NOTE(review): the grid-searched model lives in `algo`; the later cells that call
# svd.predict and read svd.qi use this untuned `svd` instead — confirm that is intended.
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
# Lookup tables for the svd.predict() call sites. Surprise's predict() expects the
# *raw* ids the model was trained on — here the values of the `user` and `manga_id`
# columns passed to Dataset.load_from_df — not enumerate() positions, which it
# would treat as unknown ids (or as some unrelated manga_id).

# Users are already identified by their raw id, so this is an identity mapping;
# it keeps the `.get()` call sites unchanged and unknown users still map to None.
user_to_uid = {user: user for user in user_df['user'].unique()}

# Title -> raw manga_id. When a title occurs more than once, the first catalogue
# row wins.
manga_to_iid = (
    manga_df
    .drop_duplicates(subset='title')
    .set_index('title')['manga_id']
    .to_dict()
)

In [None]:
# Estimated score of "Bleach" for one user; `.est` is read from the object predict() returns.
# NOTE(review): Surprise's predict() takes raw ids (the values that were in the `user` /
# `manga_id` columns at training time) — verify that both lookups return those.
svd.predict(uid=user_to_uid.get("TheMissingTrex"), iid=manga_to_iid.get("Bleach")).est

In [None]:
from sklearn.manifold import TSNE

# Project the learned item factors to 2-D with t-SNE (seeded for reproducibility).
# NOTE(review): recent scikit-learn releases rename TSNE's n_iter to max_iter —
# confirm against the installed version.
tsne = TSNE(n_components=2, n_iter=500, verbose=3, random_state=1)
books_embedding = tsne.fit_transform(svd.qi)

# Rows of svd.qi follow Surprise's inner item ids, not the row order of manga_df.
# Translate each row back to its raw manga_id, then lay the coordinates out in
# manga_df's row order so that the title column — and positional lookups that use
# manga_df's index — point at the right embedding. Catalogue rows that were never
# rated have no embedding and get NaN coordinates.
raw_item_ids = [
    svd.trainset.to_raw_iid(inner_id)
    for inner_id in range(svd.trainset.n_items)
]
projection = (
    pd.DataFrame(books_embedding, columns=['x', 'y'], index=raw_item_ids)
    .reindex(manga_df['manga_id'].to_numpy())
    .reset_index(drop=True)
)
projection['title'] = manga_df['title'].to_numpy()


In [None]:
import plotly.express as px
import datapane as dp

# Scatter plot of every row of the 2-D projection.
fig = px.scatter(projection, x='x', y='y')
fig.show()

# Wrap the figure in a datapane report; publishing stays disabled.
report = dp.Report(dp.Plot(fig))
# report.publish(name='books_scatter_plot', open=True, visibility='PUBLIC') #Publish the report

In [None]:
import datapane as dp
import difflib
import random
def get_book_id(book_title, metadata:pd.DataFrame):
    """Return the index label of the first row whose 'title' equals ``book_title``.

    Args:
        book_title: Exact title to look up (no fuzzy matching).
        metadata: Frame with a 'title' column.

    Returns:
        The index label of the first matching row. It is only usable as a
        position (e.g. with ``.iloc``) when ``metadata`` has a default
        0..n-1 index.

    Raises:
        IndexError: If no row has that title.
    """
    matching_labels = metadata.index[metadata['title'] == book_title]
    if len(matching_labels) == 0:
        # Same exception type as indexing an empty result, but with a message
        # that names the title that was not found.
        raise IndexError(f"no row with title {book_title!r} in metadata")
    return matching_labels.values[0]

def plot_books(titles, plot_name):
    """Show a labelled scatter plot of the projected points for ``titles``.

    Each title is resolved to a row of ``manga_df`` with ``get_book_id`` and the
    rows at those positions are taken from ``projection``. ``plot_name`` is only
    referenced by the disabled publish call.
    """
    row_positions = [get_book_id(title, manga_df) for title in titles]
    selected_points = projection.iloc[row_positions]

    fig = px.scatter(selected_points, x='x', y='y', text='title')
    fig.show()

    # Wrap the figure in a datapane report; publishing stays disabled.
    report = dp.Report(dp.Plot(fig))
#     report.publish(name=plot_name, open=True, visibility='PUBLIC') #Publish the report

# Plot the 30 titles with the highest "mean" value.
books = manga_df.sort_values("mean", ascending=False)['title'].head(30).tolist()
plot_books(books, plot_name='books_embedding')

In [None]:
import duckdb



# Open a writable in-memory DuckDB connection.
conn = duckdb.connect(database=':memory:', read_only=False)

# Register the `users` DataFrame under the name 'users_view' so it can be queried with SQL.
conn.register('users_view', users)

In [None]:
def get_recommendations(user, n=10):
    """Recommend up to ``n`` unread titles for ``user``, ranked by predicted score.

    Candidates come from the genre-based and synopsis-based content recommenders,
    queried once for every title the user has in ``users_view``. Titles the user
    already has are discarded and the rest are scored with ``svd.predict``.

    Args:
        user: User name as stored in the ``user`` column of ``users_view``.
        n: Number of similar titles requested per read title, and the maximum
            number of recommendations returned.

    Returns:
        A list of ``(title, estimated_score)`` tuples sorted by score, highest
        first, with at most ``n`` entries.
    """
    # The user name is bound as a query parameter rather than formatted into the
    # SQL text, so names containing quotes cannot break or alter the query.
    # The column name is quoted so it always refers to the column.
    query = """
    SELECT title
    FROM users_view
    WHERE "user" = ?
    """
    user_read_mangas = set(conn.execute(query, [user]).fetchdf()["title"])

    # Collect similar titles from both content-based recommenders.
    recommendations = set()
    for manga_title in user_read_mangas:
        for recommender in (genre_synopsis, rec_synopsis):
            similar = recommender.get_recommendations(manga_title, n=n, include_series=False)
            recommendations.update(similar['title'])

    # Score only the candidates the user has not read yet.
    uid = user_to_uid.get(user)
    scores = {
        title: svd.predict(uid=uid, iid=manga_to_iid.get(title)).est
        for title in recommendations
        if title not in user_read_mangas
    }

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:n]

# Example call: up to 20 recommendations for one user.
get_recommendations("TheMissingTrex", n=20)
        


In [None]:
# Load the line_profiler IPython extension, which provides the %lprun magic used below.
%load_ext line_profiler

In [None]:
# Line-by-line profile of get_recommendations for a single user.
%lprun -f get_recommendations get_recommendations("Jseph22", n=10)

In [None]:
# Line-by-line profile of the genre recommender's get_recommendations for one title.
%lprun -f genre_synopsis.get_recommendations genre_synopsis.get_recommendations("Bleach", n=10, include_series=False)