In [14]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import mongodb as md
from flask_jsonpify import jsonpify

from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

def GetData(StreamType,hosting):
    if (hosting == "online"):
        return md.read_mongo("finalyearproject",StreamType,True)
    else:
        return md.read_mongo("finalyearproject",StreamType,False)



import warnings; warnings.simplefilter('ignore')
def getCorrelation(StreamType,stream,limit,db):
    movies = ""
    if (db == "online"):
         movies = md.read_mongo("finalyearproject",StreamType,True)
    else:
         movies = md.read_mongo("finalyearproject",StreamType,False)
    movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
    ratings = md.read_mongo("finalyearproject","reviews",True)
    users = md.read_mongo("finalyearproject","users",True)
    movies = movies.rename(columns={"_id":"on"})
    movie_ratings = pd.merge(movies,ratings,on="on")
    movies_ratings = movie_ratings.rename(columns={"title_x":"movieTitle","title_y":"rateTitle"})
    user_movie_rating = movies_ratings.pivot_table(index='userId', columns='tmdb', values='rate')
    ratings_mean_count = pd.DataFrame(movies_ratings.groupby('tmdb')['rate'].mean())
    ratings_mean_count['rating_counts'] = pd.DataFrame(movies_ratings.groupby('tmdb')['rate'].count())
    movieSelected = user_movie_rating[stream]
    movieCorrelation = user_movie_rating.corrwith(movieSelected,method="pearson")
    df_movieCorrelation = pd.DataFrame(movieCorrelation, columns=['Correlation'])
    df_movieCorrelation.dropna(inplace=True)
    df_movieCorrelation = df_movieCorrelation.join(ratings_mean_count['rating_counts'])
    df_movieCorrelation = df_movieCorrelation[df_movieCorrelation ['rating_counts']>10].sort_values('Correlation', ascending=False)
    df_movieCorrelation = df_movieCorrelation.reset_index()
    df_movieCorrelation = df_movieCorrelation[df_movieCorrelation["Correlation"] > 0]
    return list(df_movieCorrelation["tmdb"])


def getNN(StreamType,stream,limit,db):
    movies = GetData(StreamType,db)
    movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
    ratings = GetData("reviews",db)
    users = GetData("users",db)
    movies = movies.rename(columns={"_id":"on"})
    movie_ratings = pd.merge(movies,ratings,on="on")
    
    knn_ratings = ratings[["userId","on","rate"]]
    knn_movies = movies[["on","tmdb"]]
    knn_movie_ratings = pd.merge(knn_movies,knn_ratings,on="on")
    movie_rating_count = (knn_movie_ratings.
                      groupby(by=["tmdb"])["rate"].
                      count().reset_index().
                      rename(columns={'rate':'totalRating'})
                      [["tmdb","totalRating"]]
                     )
    rating_with_totalRatingCount=knn_movie_ratings.merge(movie_rating_count,left_on='tmdb',right_on='tmdb',how="inner")
    popularity_threshold=3
    #rating_popular_book=rating_with_totalRatingCount.query('totalRatingCount>=@popularity_threshold')
    rating_popular_movie=rating_with_totalRatingCount[rating_with_totalRatingCount['totalRating']>popularity_threshold]
    combined = rating_popular_movie.merge(users,left_on='userId',right_on='_id',how="inner")
    combined = combined.drop_duplicates(['userId','tmdb'])
    knn_users = combined.pivot(index="tmdb",columns="_id",values="rate").fillna(0)
    knn_rating_user_csr = csr_matrix(knn_users.values)
    model_knn=NearestNeighbors(metric="cosine",algorithm="brute")
    model_knn.fit(knn_rating_user_csr)
    new_shape = knn_users.reset_index()

    recommendation = []
    streams = stream.split(",")
    for stream_id in streams:
        recommendation += list(getSingleKNN(new_shape,int(stream_id),knn_users,model_knn,limit))

    return recommendation


def getSingleKNN(new_shape,stream_id,knn_users,model_knn,limit):
    values = new_shape[new_shape["tmdb"] == stream_id].drop(columns=['tmdb']).values.reshape(1,-1)
    try:
        distances,indices=model_knn.kneighbors(values,n_neighbors=int(limit + 1))
        recommendation = []
        for i in range(0,len(distances.flatten())):
            if i!=0:
                recommendation.append(int(knn_users.index[indices.flatten()[i]]))
        return recommendation
    except:
        return ""
    



In [None]:
getCorrelation("movies",719088,5,"online")

In [17]:
list(getNN("movies","927855,818647,579792,662237",2,"online"))

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.

In [20]:
list(getNN("movies","662237",5,"online"))

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.

In [8]:
import mongodb as md
movies = md.read_mongo("finalyearproject","movies",True)


In [9]:
movies

Unnamed: 0,_id,tmdb,title,overview,genres,vote_count,vote_average,popularity,release_date
0,624d882287b246f81c48c8b1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[Crime, Drama, Crime, Drama]",0,0.000000,78.147,1994-09-23
1,624d882287b246f81c48c8b3,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","[Comedy, Drama, Romance, Drama, Comedy]",0,0.000000,29.896,1995-10-20
2,624d882287b246f81c48c8b5,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Crime, Drama, Crime, Drama]",0,0.000000,91.215,1972-03-14
3,624d882287b246f81c48c8b7,652837,"Josee, the Tiger and the Fish","With dreams of diving abroad, Tsuneo gets a jo...","[Animation, Drama, Romance, Animation, Drama]",6,2.500000,30.509,2020-12-25
4,624d882287b246f81c48c8cf,533514,Violet Evergarden: The Movie,As the world moves on from the war and technol...,"[Animation, Fantasy, Drama, Romance, Animation...",9,1.555556,42.120,2020-09-18
...,...,...,...,...,...,...,...,...,...
1464,624d88d361dd768818c97646,16996,17 Again,"On the brink of a midlife crisis, 30-something...","[Comedy, Comedy]",0,0.000000,91.448,2009-03-11
1465,624d88d361dd768818c9763a,141052,Justice League,Fuelled by his restored faith in humanity and ...,"[Action, Adventure, Fantasy, Science Fiction]",0,0.000000,80.543,2017-11-15
1466,624d88d361dd768818c9763e,102899,Ant-Man,Armed with the astonishing ability to shrink i...,"[Action, Adventure, Science Fiction]",0,0.000000,86.827,2015-07-14
1467,624d88d361dd768818c97644,44912,Green Lantern,"For centuries, a small but powerful force of w...","[Action, Adventure, Thriller, Science Fiction]",0,0.000000,86.645,2011-06-16
