In [1]:
import numpy as np 
import os 
import pandas as pd 
import re
import string
import seaborn as sns
import sklearn
from surprise import Reader, Dataset, SVD, KNNBasic

from surprise.accuracy import rmse


from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
import surprise
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [2]:
df_anime = pd.read_csv("../data/df_anime_export.csv")
df_profile = pd.read_csv("../data/df_profile_export.csv")
df_review = pd.read_csv("../data/df_review_export.csv")
df_an_pro_rev = pd.read_csv("../data/merged_rev_df_export.csv")
df_an_pro = pd.read_csv("../data/merged_df_export.csv")
df_main = pd.read_csv("../data/df_prof_rev_anime_clean.csv")

In [3]:
df_main.head(1)

Unnamed: 0.2,Unnamed: 0.1,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,synopsis_processed,profile,review_score,review_score_indepth,text_preprocessed,Unnamed: 0,gender,birthday,favorites_anime
0,0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,"following participation inter-high , karasuno ...",skrn,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...","art : great , especially action critical momen...",2,,,"['918', '2904', '11741', '17074', '23273', '32..."


## Collab Filter a bit cleaner

In [4]:
df_touse = df_main[["profile", "uid", "review_score"]]

In [5]:
df_anime.head(1)

Unnamed: 0.1,Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,synopsis_processed
0,0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,"following participation inter-high , karasuno ..."


In [6]:
df_anime_touse = df_anime[["uid", "title"]]

In [7]:
df = Dataset.load_from_df(df_touse, Reader(rating_scale=(1,10)))
trainset = df.build_full_trainset()

In [8]:
model = SVD()
cross_validate(model, df, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9846  1.9689  1.9738  1.9845  1.9787  1.9781  0.0061  
MAE (testset)     1.5254  1.5209  1.5223  1.5326  1.5244  1.5251  0.0041  
Fit time          1.30    1.35    1.39    1.41    1.43    1.38    0.05    
Test time         0.17    0.33    0.32    0.35    0.16    0.27    0.08    


{'test_rmse': array([1.98463759, 1.96894694, 1.97384693, 1.98450158, 1.97871039]),
 'test_mae': array([1.52541688, 1.52086623, 1.52228663, 1.5325932 , 1.52440154]),
 'fit_time': (1.3022940158843994,
  1.3493025302886963,
  1.392310619354248,
  1.414315938949585,
  1.4273185729980469),
 'test_time': (0.16603636741638184,
  0.33007240295410156,
  0.32407236099243164,
  0.34807896614074707,
  0.16403770446777344)}

In [9]:
#single prediction
model.predict('skrn', 30)

Prediction(uid='skrn', iid=30, r_ui=None, est=6.460253938058285, details={'was_impossible': False})

In [10]:
#overall highest predicted not seen shows based on whats seen
user_id = 'skrn'
all_anime = df_touse['uid'].unique()
watched = df_touse[df_touse['profile']==user_id].uid
not_watched = [anime for anime in all_anime if anime not in watched]

# predict
score = [model.predict(user_id, anime_id) for anime_id in not_watched]
anime_id = []
pred_score = []
for i in range(0, len(score)):
    anime_id.append(score[i].iid)
    pred_score.append(score[i].est)
df_pred = pd.DataFrame({'uid':anime_id, 'pred_score':pred_score})

df_pred_real = df_pred.sort_values('pred_score', ascending=False).head(10)

df_pred_real.merge(df_anime_touse, how='left', on='uid')

Unnamed: 0,uid,pred_score,title
0,3297,8.699369,Aria The Origination
1,28977,8.68623,Gintama°
2,2251,8.681022,Baccano!
3,33,8.581825,Kenpuu Denki Berserk
4,10162,8.57037,Usagi Drop
5,199,8.566371,Sen to Chihiro no Kamikakushi
6,30654,8.550124,Ansatsu Kyoushitsu 2nd Season
7,35180,8.545835,3-gatsu no Lion 2nd Season
8,35839,8.531832,Sora yori mo Tooi Basho
9,5114,8.529381,Fullmetal Alchemist: Brotherhood


In [11]:
df_touse

Unnamed: 0,profile,uid,review_score
0,skrn,28891,7
1,skrn,32935,8
2,skrn,30276,7
3,skrn,4107,7
4,skrn,4081,4
...,...,...,...
130514,sensei69,33082,5
130515,ssjtk,33082,8
130516,Ground_zero,33082,9
130517,ClawViper,33082,7


In [12]:
df_touse['uid'][0]

28891

## Testing Accuracy

In [13]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_touse, reader)

# Build the full training set
trainset = data.build_full_trainset()

# Train the SVD model
model = SVD()
model.fit(trainset)

# Split the dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2)

# Chunk the testset into groups of 1000
chunk_size = 1000
num_chunks = len(testset) // chunk_size
if len(testset) % chunk_size != 0:
    num_chunks += 1

# Calculate accuracy for each chunk
accuracies = []
tolerance = 1  # Tolerance for correctness

for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = start_idx + chunk_size
    chunk = testset[start_idx:end_idx]
    predictions = model.test(chunk)

    # Calculate accuracy for the chunk
    num_correct = 0
    for prediction in predictions:
        uid = prediction.uid
        iid = prediction.iid
        actual_rating = prediction.r_ui
        predicted_rating = round(prediction.est)
        if abs(actual_rating - predicted_rating) <= tolerance:
            num_correct += 1

    # Calculate accuracy for the chunk
    accuracy = num_correct / len(chunk)
    accuracies.append(accuracy)

# Calculate the average accuracy across all chunks
average_accuracy = np.mean(accuracies)

print('Average Accuracy:', average_accuracy)

Average Accuracy: 0.888059829059829
