In [3]:
import numpy as np 
import os 
import pandas as pd 
import re
import string
import seaborn as sns
import sklearn
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import surprise
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV

In [4]:
df_anime = pd.read_csv("../data/df_anime_export.csv")
df_profile = pd.read_csv("../data/df_profile_export.csv")
df_review = pd.read_csv("../data/df_review_export.csv")
df_an_pro_rev = pd.read_csv("../data/merged_rev_df_export.csv")
df_an_pro = pd.read_csv("../data/merged_df_export.csv")
df_main = pd.read_csv("../data/df_prof_rev_anime_clean.csv")

In [5]:
df_main.head(1)

Unnamed: 0.2,Unnamed: 0.1,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,synopsis_processed,profile,review_score,review_score_indepth,text_preprocessed,Unnamed: 0,gender,birthday,favorites_anime
0,0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,"following participation inter-high , karasuno ...",skrn,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...","art : great , especially action critical momen...",2,,,"['918', '2904', '11741', '17074', '23273', '32..."


## Collab Filter a bit cleaner

In [6]:
df_touse = df_main[["profile", "uid", "review_score"]]

In [7]:
df_anime.head(1)

Unnamed: 0.1,Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,synopsis_processed
0,0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,"following participation inter-high , karasuno ..."


In [8]:
df_anime_touse = df_anime[["uid", "title"]]

In [9]:
df = Dataset.load_from_df(df_touse, Reader(rating_scale=(1,10)))
trainset = df.build_full_trainset()

In [10]:
model = SVD()
cross_validate(model, df, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9744  1.9809  1.9815  1.9761  1.9767  1.9779  0.0028  
MAE (testset)     1.5180  1.5278  1.5262  1.5258  1.5292  1.5254  0.0039  
Fit time          1.24    1.28    1.45    1.40    1.50    1.38    0.10    
Test time         0.16    0.16    0.18    0.17    0.17    0.17    0.01    


{'test_rmse': array([1.97437273, 1.98091181, 1.98152921, 1.976051  , 1.97668805]),
 'test_mae': array([1.51798043, 1.52777195, 1.52618831, 1.52577896, 1.52923074]),
 'fit_time': (1.2443084716796875,
  1.2752859592437744,
  1.4513254165649414,
  1.4033153057098389,
  1.5043370723724365),
 'test_time': (0.16103625297546387,
  0.1580350399017334,
  0.1780407428741455,
  0.1670377254486084,
  0.1670379638671875)}

In [11]:
#single prediction
model.predict('skrn', 30)

Prediction(uid='skrn', iid=30, r_ui=None, est=8.01315845147726, details={'was_impossible': False})

In [12]:
#overall highest predicted not seen shows based on whats seen
user_id = 'skrn'
all_anime = df_touse['uid'].unique()
watched = df_touse[df_touse['profile']==user_id].uid
not_watched = [anime for anime in all_anime if anime not in watched]

# predict
score = [model.predict(user_id, anime_id) for anime_id in not_watched]
anime_id = []
pred_score = []
for i in range(0, len(score)):
    anime_id.append(score[i].iid)
    pred_score.append(score[i].est)
df_pred = pd.DataFrame({'uid':anime_id, 'pred_score':pred_score})

df_pred_real = df_pred.sort_values('pred_score', ascending=False).head(10)

df_pred_real.merge(df_anime_touse, how='left', on='uid')

Unnamed: 0,uid,pred_score,title
0,263,8.989224,Hajime no Ippo
1,7311,8.951124,Suzumiya Haruhi no Shoushitsu
2,5365,8.848104,Tsumiki no Ie
3,37510,8.837533,Mob Psycho 100 II
4,13759,8.825057,Sakura-sou no Pet na Kanojo
5,44,8.799102,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
6,1210,8.792369,NHK ni Youkoso!
7,918,8.747168,Gintama
8,32281,8.741115,Kimi no Na wa.
9,28977,8.728587,Gintama°


In [13]:
df_touse

Unnamed: 0,profile,uid,review_score
0,skrn,28891,7
1,skrn,32935,8
2,skrn,30276,7
3,skrn,4107,7
4,skrn,4081,4
...,...,...,...
130514,sensei69,33082,5
130515,ssjtk,33082,8
130516,Ground_zero,33082,9
130517,ClawViper,33082,7


In [14]:
df_touse['uid'][0]

28891

## Testing Accuracy

In [20]:
# Split the data into chunks
chunk_size = 1000  # Adjust the chunk size as per your system's memory capacity
num_chunks = int(np.ceil(len(df_touse) / chunk_size))
chunks = np.array_split(df_touse, num_chunks)

# Initialize the model
model = SVD()

# Iterate over the chunks and make predictions
predictions = []
for chunk in chunks:
    # Prepare the data for the chunk
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(chunk, reader)
    trainset = data.build_full_trainset()

    # Fit the model
    model.fit(trainset)

    # Generate predictions for the chunk
    testset = trainset.build_anti_testset()
    chunk_predictions = model.test(testset)

    # Append the predictions to the overall list
    predictions.extend(chunk_predictions)

# Convert the predictions to a DataFrame
df_predictions = pd.DataFrame(predictions)

In [None]:
actual_ratings = df_touse['review_score'].values  # True ratings from the DataFrame

# Calculate accuracy percentage
threshold = 0.5
accurate_predictions = 0

for i in range(len(predicted_ratings)):
    diff = abs(predicted_ratings[i] - actual_ratings[i])
    if diff <= threshold:
        accurate_predictions += 1

accuracy_percentage = (accurate_predictions / len(predicted_ratings)) * 100

print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")