<a href="https://colab.research.google.com/github/KennethV322/unsupervised-predict-streamlit-template/blob/master/Saved_Movie%20datav1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
from wordcloud import WordCloud, STOPWORDS
import re
import string
import nltk
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficientcy.
import matplotlib.pyplot as plt
import seaborn as sns
# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

In [2]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162730 sha256=bbb98fdcf56407bccffb70177e7029bfeaff7950e351a76554a4325f46cbd6fc
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [3]:
final_df = pd.read_csv('final_df.csv')
df_trainv2 = pd.read_csv('df_trainv2.csv')

In [4]:
df_test = pd.read_csv('test.csv')

In [5]:
df_trainv2.head()

Unnamed: 0,userId,movieId,title,rating,timestamp
0,5163,57669,In Bruges (2008),4.0,1518350000.0
1,87388,57669,In Bruges (2008),3.5,1237455000.0
2,137050,57669,In Bruges (2008),4.0,1425632000.0
3,120490,57669,In Bruges (2008),4.5,1408229000.0
4,50616,57669,In Bruges (2008),4.5,1446942000.0


In [6]:
final_df['director_titlecast'] = (pd.Series(final_df[['director', 'title_cast']]
                      .fillna('')
                      .values.tolist()).str.join(' '))

titles = final_df['title']
indices = pd.Series(final_df.index, index=final_df['title'])

In [7]:
indices.head()

title
Toy Story (1995)                      0
Jumanji (1995)                        1
Grumpier Old Men (1995)               2
Waiting to Exhale (1995)              3
Father of the Bride Part II (1995)    4
dtype: int64

In [8]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2),
                     min_df=0, stop_words='english')

# Produce a feature matrix, where each row corresponds to a book,
# with TF-IDF features as columns
tf_movietags_matrix = tf.fit_transform(final_df['director_titlecast'])

In [9]:
cosine_sim_movietags = cosine_similarity(tf_movietags_matrix,
                                        tf_movietags_matrix)
print (cosine_sim_movietags.shape)

(24866, 24866)


In [10]:
cosine_sim_movietags[:5]

array([[1.        , 0.        , 0.00481673, ..., 0.00606498, 0.00317293,
        0.01385015],
       [0.        , 1.        , 0.00251314, ..., 0.        , 0.        ,
        0.        ],
       [0.00481673, 0.00251314, 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01596706, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00451436, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [11]:
def content_generate_top_N_recommendations(movie_title, N=10):
    # Convert the string book title to a numeric index for our
    # similarity matrix
    b_idx = indices[movie_title]
    # Extract all similarity values computed with the reference book title
    sim_scores = list(enumerate(cosine_sim_movietags[b_idx]))
    # Sort the values, keeping a copy of the original index of each value
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Select the top-N values for recommendation
    sim_scores = sim_scores[1:N]
    # Collect indexes
    book_indices = [i[0] for i in sim_scores]
    # Convert the indexes back into titles
    return titles.iloc[book_indices]

In [12]:
def content_generate_rating_estimate(movie_title, user, rating_data, k=20, threshold=0.0):
    # Convert the book title to a numeric index for our
    # similarity matrix
    b_idx = indices[movie_title]
    neighbors = [] # <-- Stores our collection of similarity values

    # Gather the similarity ratings between each book the user has rated
    # and the reference book
    for index, row in rating_data[rating_data['userId']==user].iterrows():
        sim = cosine_sim_movietags[b_idx-1, indices[row['title']]-1]
        neighbors.append((sim, row['rating']))
    # Select the top-N values from our collection
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # Compute the weighted average using similarity scores and
    # user item ratings.
    simTotal, weightedSum = 0, 0
    for (simScore, rating) in k_neighbors:
        # Ensure that similarity ratings are above a given threshold
        if (simScore > threshold):
            simTotal += simScore
            weightedSum += simScore * rating
    try:
        predictedRating = weightedSum / simTotal
    except ZeroDivisionError:
        # Cold-start problem - No ratings given by user.
        # We use the average rating for the reference item as a proxy in this case
        predictedRating = np.mean(rating_data[rating_data['title']==movie_title]['rating'])
    return predictedRating

In [13]:
df_trainv2[df_trainv2['userId'] == 106362][3:10]

Unnamed: 0,userId,movieId,title,rating,timestamp
21731,106362,1997,"Exorcist, The (1973)",4.0,1367023000.0
172788,106362,2012,Back to the Future Part III (1990),4.0,1367022000.0
200760,106362,103042,Man of Steel (2013),3.5,1397833000.0
234385,106362,4069,"Wedding Planner, The (2001)",3.0,1397841000.0
267291,106362,3471,Close Encounters of the Third Kind (1977),3.5,1304098000.0
269725,106362,783,"Hunchback of Notre Dame, The (1996)",3.5,1367965000.0
282597,106362,40815,Harry Potter and the Goblet of Fire (2005),3.5,1367177000.0


In [14]:
title = "Exorcist, The (1973)"
actual_rating = df_trainv2[(df_trainv2['userId'] == 106362) & (df_trainv2['title'] == title)]['rating'].values[0]
pred_rating = content_generate_rating_estimate(movie_title=title, user=106362, rating_data=df_trainv2)
print (f"Title - {title}")
print ("---")
print (f"Actual rating: \t\t {actual_rating}")
print (f"Predicted rating: \t {pred_rating}")

Title - Exorcist, The (1973)
---
Actual rating: 		 4.0
Predicted rating: 	 3.728864432216108


In [22]:
from surprise import Dataset, Reader

from surprise.model_selection import train_test_split

from surprise import SVD

from surprise import accuracy

df_sample = df_trainv2.sample(frac=0.5)

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_sample[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)

model = SVD()
model.fit(trainset)

predictions = model.test(testset)
print(accuracy.rmse(predictions))

RMSE: 1.6644
1.664431759123409


In [23]:
unique_userid = df_test['userId'].unique()
unique_movieId = df_test['movieId'].unique()

test_pred = [model.predict(user_id, movie_id).est
             for user_id in unique_userid for movie_id in unique_movieId]

sub_df = pd.DataFrame({'Id':[f"{user_id}_{movie_Id}"
                      for user_id in unique_userid for movie_Id in unique_movieId],'rating':test_pred})

sub_df.to_csv('sub1.csv', index=False)

KeyboardInterrupt: 