In [None]:
import pandas as pd 

# Load the four CSV inputs; the paths are relative to the current working directory.
# NOTE(review): "Tags.csv" is capitalised unlike the other three file names, so on a
# case-sensitive filesystem it must match the file on disk exactly — confirm.
tags = pd.read_csv("Tags.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
links = pd.read_csv("links.csv")

In [None]:
ratings

In [None]:
movies

In [None]:
# Keep
# Replace the literal "|" separators in the genres column with spaces
# (regex=False, so "|" is not treated as a regex alternation).
movies["genres"] = movies["genres"].str.replace("|", " ", regex=False)

In [None]:
# Keep
# Keep only rows whose tag is a string, then build one space-separated string of
# the distinct tags per movieId. set() removes duplicates, so the order of the
# tags inside the joined string is not guaranteed.
tags = tags[tags['tag'].apply(lambda x: isinstance(x, str))]
merged_tags = tags.groupby("movieId")["tag"].apply(lambda x: " ".join(set(x))).reset_index()

In [None]:
# Keep

# Inner-join movies with the merged tags and then with links on movieId, so only
# movies present in all three frames remain.
filtered_movies = pd.merge(movies, merged_tags, on="movieId", how="inner")
filtered_movies = pd.merge(filtered_movies, links, on="movieId", how="inner")
filtered_movies

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd 
import numpy as np
import re
import unicodedata
from scipy.sparse import hstack, csr_matrix


def load_data(data_dir="."):
    """Read the four CSV inputs and return them as DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, default "."
        Directory that contains ``Tags.csv``, ``movies.csv``, ``ratings.csv``
        and ``links.csv``. The default reads from the current working
        directory, which is what the parameterless version did.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(tags, movies, ratings, links)``, in that order.
    """
    # Function-scope import so no new file-level import is required.
    from pathlib import Path

    base = Path(data_dir)
    tags = pd.read_csv(base / "Tags.csv")
    movies = pd.read_csv(base / "movies.csv")
    ratings = pd.read_csv(base / "ratings.csv")
    links = pd.read_csv(base / "links.csv")

    return tags, movies, ratings, links


In [None]:
# Keep

def clean_text(text):
    """Normalise a string to lowercase ASCII letters, digits and whitespace.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` (for example NaN or a
        number) is returned unchanged.

    Returns
    -------
    object
        For strings: the text with accents stripped, lowercased, and every
        character other than letters, digits and whitespace removed.
        For non-strings: the input itself.
    """
    # isinstance also accepts str subclasses, which `type(text) == str` skipped.
    if not isinstance(text, str):
        return text

    # NFKD splits accented characters into base letter + combining mark; the
    # ASCII round-trip then drops whatever cannot be represented in ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode()
    text = text.lower()

    # Remove punctuation and other special characters. Digits are kept, so
    # release years in titles survive.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

In [3]:
def data_prep(tags, movies, links):
    """Build the cleaned movie table used for the text-based model.

    Parameters
    ----------
    tags : pandas.DataFrame
        Must contain ``movieId`` and ``tag`` columns.
    movies : pandas.DataFrame
        Must contain ``movieId`` and ``genres`` columns. Not modified.
    links : pandas.DataFrame
        Must contain a ``movieId`` column.

    Returns
    -------
    pandas.DataFrame
        Inner join of movies, per-movie merged tags and links on ``movieId``,
        with every object-dtype column passed through ``clean_text``.
    """
    # .assign returns a new frame, so the caller's `movies` keeps its original
    # "|"-separated genres instead of being rewritten as a side effect.
    movies = movies.assign(genres=movies["genres"].str.replace("|", " ", regex=False))

    # Non-string tags (e.g. NaN) cannot be joined, so drop them first.
    tags = tags[tags["tag"].apply(lambda x: isinstance(x, str))]

    # One space-separated string of distinct tags per movie; sorting makes the
    # string identical from run to run (plain set order is arbitrary).
    merged_tags = (
        tags.groupby("movieId")["tag"]
        .apply(lambda x: " ".join(sorted(set(x))))
        .reset_index()
    )

    filtered_movies = pd.merge(movies, merged_tags, on="movieId", how="inner")
    filtered_movies = pd.merge(filtered_movies, links, on="movieId", how="inner")

    # Clean every text column; clean_text is defined elsewhere in this notebook.
    text_columns = filtered_movies.columns[filtered_movies.dtypes == 'object']
    for col in text_columns:
        filtered_movies[col] = filtered_movies[col].apply(clean_text)

    return filtered_movies

In [None]:
def tfidf(filtered_movies):
    """Fit a cosine nearest-neighbour model on genre and tag TF-IDF features.

    Parameters
    ----------
    filtered_movies : pandas.DataFrame
        Must contain string columns ``genres`` and ``tag``.

    Returns
    -------
    tuple
        ``(model_knn, combined_matrix)``: the fitted ``NearestNeighbors``
        estimator and the sparse matrix it was fitted on (genre columns
        followed by tag columns, one row per movie).
    """
    # Genres and tags get separate vectorizers so each keeps its own vocabulary.
    genre_matrix = TfidfVectorizer().fit_transform(filtered_movies["genres"].tolist())
    tag_matrix = TfidfVectorizer().fit_transform(filtered_movies["tag"].tolist())

    # Ask for CSR explicitly: callers index single rows of this matrix, and the
    # format hstack picks by default is not guaranteed to support that.
    combined_matrix = hstack((genre_matrix, tag_matrix), format="csr")

    model_knn = NearestNeighbors(metric="cosine", algorithm="auto")
    model_knn.fit(combined_matrix)

    return model_knn, combined_matrix

In [5]:
def rating_features(ratings, movies, min_user_ratings=200):
    """Keep ratings from active users and attach a per-title rating count.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain ``userId``, ``movieId`` and ``rating`` columns.
    movies : pandas.DataFrame
        Must contain ``movieId``, ``title`` and ``genres`` columns.
    min_user_ratings : int, default 200
        A user is kept only if they have strictly more than this many ratings.

    Returns
    -------
    pandas.DataFrame
        The kept rating rows with an added ``num_of_rating`` column (number of
        kept ratings sharing the row's title); ``title`` and ``genres`` are
        dropped again before returning.
    """
    ratings_per_user = ratings["userId"].value_counts()
    active_users = ratings_per_user[ratings_per_user > min_user_ratings].index
    active_ratings = ratings[ratings["userId"].isin(active_users)]
    ratings_with_movies = active_ratings.merge(movies, on="movieId")

    num_rating = (
        ratings_with_movies.groupby("title")["rating"]
        .count()
        .reset_index()
        .rename(columns={"rating": "num_of_rating"})
    )

    # Both frames carry a title column, so join the counts back on title.
    final_rating = ratings_with_movies.merge(num_rating, on="title")

    return final_rating.drop(columns=["title", "genres"])

In [17]:
def get_recommendations(movie_name, combined_matrix, model_knn, filtered_movies, top_n = 5):
    """Return the movies closest to ``movie_name`` in the TF-IDF feature space.

    Parameters
    ----------
    movie_name : str
        Title to look up in ``filtered_movies["title"]`` (exact match).
    combined_matrix : sparse matrix or array
        Feature matrix with one row per row of ``filtered_movies``, in the
        same order.
    model_knn : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)``.
    filtered_movies : pandas.DataFrame
        Must contain ``movieId``, ``title``, ``genres`` and ``tmdbId``.
    top_n : int, default 5
        Number of neighbours wanted in addition to the query movie itself.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n + 1`` rows of ``movieId``, ``title``, ``genres`` and
        ``tmdbId`` in the order returned by ``kneighbors``.

    Raises
    ------
    IndexError
        If no row of ``filtered_movies`` has the given title.
    """
    # Row *positions* of matching titles. Index labels are deliberately not
    # used: they only coincide with positions for a default RangeIndex.
    matches = (filtered_movies["title"] == movie_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie title {movie_name!r} not found in filtered_movies")
    row = matches[0]

    # Get the row corresponding to the movie of interest
    movie = combined_matrix[row, :]

    # Never request more neighbours than there are rows to choose from.
    n_neighbors = min(top_n + 1, combined_matrix.shape[0])
    distances, indices = model_knn.kneighbors(movie.reshape(1, -1), n_neighbors=n_neighbors)

    # Return the movie rows corresponding to the nearest neighbors
    return filtered_movies[["movieId", "title", "genres", "tmdbId"]].iloc[indices[0]]

In [16]:
def make_sparse(list, final_rating):
    """Build a title-by-user rating matrix for the candidate movies.

    Parameters
    ----------
    list : pandas.DataFrame
        Candidate movies; must contain ``movieId`` and ``title``. (The
        parameter name shadows the built-in; it is kept for compatibility
        with existing callers.)
    final_rating : pandas.DataFrame
        Rating rows; must contain ``movieId``, ``userId`` and ``rating``.

    Returns
    -------
    tuple
        ``(movie_pivot, movie_sparse, top50_n_ratings)``: the dense pivot
        (titles as rows, users as columns, missing ratings as 0), the same
        data as a CSR matrix, and the merged candidate/rating frame.
    """
    candidate_ratings = list.merge(final_rating, on="movieId", how="inner")

    # Titles become rows and users become columns; a missing rating is stored as 0.
    movie_pivot = candidate_ratings.pivot_table(
        index="title", columns="userId", values="rating"
    ).fillna(0)

    return movie_pivot, csr_matrix(movie_pivot), candidate_ratings

In [18]:
def make_model(movie_sparse, movie_pivot):
    """Fit a brute-force cosine nearest-neighbour model on the rating matrix.

    Parameters
    ----------
    movie_sparse : sparse matrix
        Matrix to fit the estimator on.
    movie_pivot : pandas.DataFrame
        Pivot whose index supplies the movie names.

    Returns
    -------
    tuple
        ``(movie_names, model)``: the pivot's index as a plain list and the
        fitted ``NearestNeighbors`` estimator.
    """
    # Converted to a plain list; the original note says this was done for Streamlit.
    movie_names = movie_pivot.index.tolist()

    model = NearestNeighbors(algorithm="brute", metric="cosine")
    model.fit(movie_sparse)

    return movie_names, model

# movie_names, model = make_model(movie_sparse, movie_pivot)

In [19]:
def recommend(movie_name, movie_pivot, model, top50_n_ratings, top_n=5):
    """Return the titles whose rating rows are closest to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Title to look up in ``movie_pivot.index`` (exact match).
    movie_pivot : pandas.DataFrame
        Pivot with titles as index; its rows are the query vectors.
    model : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)``.
    top50_n_ratings : pandas.DataFrame
        Unused; kept so existing call sites keep working.
    top_n : int, default 5
        Number of neighbours wanted in addition to the query title itself.

    Returns
    -------
    list
        Up to ``top_n + 1`` titles in the order returned by ``kneighbors``.

    Raises
    ------
    IndexError
        If ``movie_name`` is not in ``movie_pivot.index``.
    """
    positions = np.where(movie_pivot.index == movie_name)[0]
    if len(positions) == 0:
        raise IndexError(f"movie title {movie_name!r} not found in movie_pivot")

    # Never request more neighbours than the pivot has rows.
    n_neighbors = min(top_n + 1, len(movie_pivot))
    query = movie_pivot.iloc[positions[0], :].values.reshape(1, -1)
    distance, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # `suggestion` has one row of neighbour positions per query row; flatten
    # it into a single list of titles.
    return [title for row in suggestion for title in movie_pivot.index[row]]

In [None]:
movie_list[1:]

In [35]:
def main():
    """Run the preparation pipeline and return everything the later cells need.

    Returns
    -------
    tuple
        ``(combined_matrix, model_knn, final_rating, filtered_movies)``.
    """
    tags, movies, ratings, links = load_data()

    # Text side: cleaned movie table, then the TF-IDF matrix and its KNN model.
    filtered_movies = data_prep(tags, movies, links)
    knn_and_matrix = tfidf(filtered_movies)
    model_knn = knn_and_matrix[0]
    combined_matrix = knn_and_matrix[1]

    # Rating side: ratings restricted to active users.
    final_rating = rating_features(ratings, movies)

    return combined_matrix, model_knn, final_rating, filtered_movies

In [36]:
if __name__ == "__main__":
    # Run the pipeline once and keep its results as notebook-level variables.
    combined_matrix, model_knn, final_rating, filtered_movies = main()

In [None]:
def choose_title(combined_matrix, model_knn, final_rating, filtered_movies, movie_name="goldeneye 1995"):
    """Recommend movies for one title using TF-IDF candidates re-ranked by ratings.

    Parameters
    ----------
    combined_matrix, model_knn, filtered_movies
        Passed through to ``get_recommendations``.
    final_rating : pandas.DataFrame
        Passed through to ``make_sparse``.
    movie_name : str, default "goldeneye 1995"
        The title the user picked; it is looked up by the helper functions.

    Returns
    -------
    list
        The titles returned by ``recommend``. (The previous version returned
        the result of ``print``, i.e. ``None``.)
    """
    # Step 1: the 50 nearest movies by genre/tag text features.
    candidates = get_recommendations(movie_name, combined_matrix, model_knn, filtered_movies, top_n=50)

    # Step 2: among those candidates, find the nearest neighbours by user ratings.
    movie_pivot, movie_sparse, top50_n_ratings = make_sparse(candidates, final_rating)
    movie_names, model = make_model(movie_sparse, movie_pivot)

    return recommend(movie_name, movie_pivot, model, top50_n_ratings)

x = choose_title(combined_matrix, model_knn, final_rating, filtered_movies)
x


['goldeneye 1995',
 'tomorrow never dies 1997',
 'rock the 1996',
 'world is not enough the 1999',
 'die hard 2 1990',
 'hunt for red october the 1990']

In [None]:
# Keep

# text_columns = filtered_movies.columns[filtered_movies.dtypes == 'object']
# for col in text_columns:
#     filtered_movies[col] = filtered_movies[col].apply(clean_text)

In [None]:
def recommend(book_names, book_pivot, model, top50_n_ratings):
    """Return the index labels of the six rows nearest to ``book_names``.

    Parameters
    ----------
    book_names : str
        Label to look up in ``book_pivot.index``.
    book_pivot : pandas.DataFrame
        Pivot whose rows are the query vectors.
    model : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)``.
    top50_n_ratings : pandas.DataFrame
        Not used by this function.

    Returns
    -------
    list
        Index labels of the neighbours, in the order returned by ``kneighbors``.
    """
    row_number = np.where(book_pivot.index == book_names)[0][0]
    query_row = book_pivot.iloc[row_number, :].values.reshape(1, -1)
    distance, suggestion = model.kneighbors(query_row, n_neighbors=6)

    # poster_url = get_poster(suggestion, book_pivot, final_rating)

    # Flatten the per-query rows of neighbour positions into one list of labels.
    return [label for neighbour_rows in suggestion for label in book_pivot.index[neighbour_rows]]
#, poster_url
# NOTE(review): movie_pivot, model and top50_n_ratings are not assigned at notebook
# level by any cell above this one, so on a fresh top-to-bottom run this call
# presumably fails — confirm.
movie_name = "heat 1995"
movie_list = recommend(movie_name, movie_pivot, model, top50_n_ratings)

In [24]:
filtered_movies

NameError: name 'filtered_movies' is not defined

In [None]:
# Not used

# tfidf_movies = filtered_movies.copy()
# tfidf_movies["tfidf"] = filtered_movies["genres"] + " " + filtered_movies["tag"]
# tfidf_movies

In [None]:
# Keep

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the genre strings.
text_data1 = filtered_movies["genres"].tolist()
vectorizer1 = TfidfVectorizer()
tfidf_matrix1 = vectorizer1.fit_transform(text_data1)

# TF-IDF features for the tag strings, fitted with their own vectorizer so that
# vectorizer1 keeps the genre vocabulary instead of being refitted on tags.
text_data2 = filtered_movies["tag"].tolist()
vectorizer2 = TfidfVectorizer()
tfidf_matrix2 = vectorizer2.fit_transform(text_data2)

In [None]:
# Keep

from scipy.sparse import hstack
# Ask for CSR explicitly: a later cell indexes single rows of this matrix
# (combined_matrix[movie_id, :]), and the format hstack picks by default is
# not guaranteed to support that.
combined_matrix = hstack((tfidf_matrix1, tfidf_matrix2), format="csr")

In [None]:
# Keep

from sklearn.neighbors import NearestNeighbors

# Fit a nearest-neighbour model with cosine distance on the combined
# genre + tag TF-IDF matrix.
model_knn = NearestNeighbors(metric = "cosine", algorithm = "auto")
model_knn.fit(combined_matrix)

In [None]:
#================================================================================================
# test

In [None]:
from scipy.sparse import csr_matrix

# Boolean Series indexed by userId: True for users with more than 200 ratings.
x = ratings["userId"].value_counts() > 200
# The userIds for which that flag is True.
y = x[x].index
# NOTE: rebinds `ratings` to the filtered frame, so the unfiltered ratings are
# no longer available under this name after the cell has run.
ratings = ratings[ratings["userId"].isin(y)]
ratings_with_movies = ratings.merge(movies, on= "movieId")

# Number of (kept) ratings per title.
num_rating = ratings_with_movies.groupby("title")["rating"].count().reset_index()

num_rating.rename(columns={"rating": "num_of_rating"}, inplace=True)
final_rating = ratings_with_movies.merge(num_rating, on="title") # joins the two frames, which both have a title column, on title
final_rating

# final_rating = final_rating[final_rating["num_of_rating"]>50] # tar bort alla böcker med mindre än 50 ratings
# final_rating.drop_duplicates(["userId", "title"], inplace=True) # tar bort alla dubletter
"""
Of the 50 that TF-IDF returns, one could perhaps combine the titles with final_rating, then build a pivot table and
then run another nearest-neighbour search on those titles.
"""
# movie_pivot = final_rating.pivot_table(columns="userId", index="title", values="rating")
# movie_pivot.fillna(0, inplace=True)

# movie_sparse = csr_matrix(movie_pivot)

In [None]:
#=================================================================================================

In [None]:
""" Funkar bäst hittils -= TF-IDF =- """
# Keep

def get_recommendations(movie_id, combined_matrix, model_knn, top_n = 5):
    """Return the rows of the notebook-level ``filtered_movies`` nearest to one movie.

    Parameters
    ----------
    movie_id : int
        Row number of the query movie in ``combined_matrix``.
    combined_matrix : sparse matrix or array
        Feature matrix that is indexed by ``movie_id``.
    model_knn : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)``.
    top_n : int, default 5
        ``top_n + 1`` neighbours are requested from the model.

    Returns
    -------
    pandas.DataFrame
        ``movieId``, ``title``, ``genres`` and ``tmdbId`` of the neighbours.
        Reads the global ``filtered_movies``; it is not passed as an argument.
    """
    neighbour_count = top_n + 1

    # Feature row of the query movie, shaped as a single-sample batch.
    query_row = combined_matrix[movie_id, :].reshape(1, -1)

    distances, indices = model_knn.kneighbors(query_row, n_neighbors=neighbour_count)

    nearest = indices[0][0:neighbour_count]
    return filtered_movies[["movieId", "title", "genres", "tmdbId"]].iloc[nearest]

movie_name = "heat 1995"
# Index label of the first row with this title; below it is used as a row
# position, which assumes filtered_movies has a default 0..n-1 index — TODO confirm.
movie_id = filtered_movies[filtered_movies["title"] == movie_name].index[0]


print("Recommendations for movie:", filtered_movies["title"].iloc[movie_id])
# NOTE(review): `list` shadows the built-in of the same name; the name is kept
# because a later cell reads this variable.
list = get_recommendations(movie_id, combined_matrix, model_knn, top_n=50)
list

In [None]:
# list
# top50_n_ratings = pd.merge(list, ratings, on="movieId", how="inner")
# top50_n_ratings

In [None]:
""" jag skulle kunna nöja mig med ett resultat från de 50 som har flest ratings och med högst medelvärde. 
Men jag forskar vidare om det blir ett bättre resultat med någon model med ratings. 
"""
# top_mean = top50_n_ratings.groupby(["movieId","title", "tmdbId"])["rating"].agg(["count", "mean"]).reset_index()
# sorted_mean = top_mean.nlargest(5, "mean")
# sorted_mean

In [None]:
#===============================================================================================

In [None]:
# errors="ignore" keeps this cell re-runnable: without it a second run raises
# KeyError because the columns were already dropped the first time.
final_rating = final_rating.drop(columns=["title", "genres"], errors="ignore")

In [None]:
"""Jag forskar på om man kan med tf idf få fram de 50 mest relavanta filmerna och sen med rating få fram 
de 5 med mest relavant rating. """

# Attach the rating rows to the candidate movies held in `list`.
top50_n_ratings = pd.merge(list, final_rating, on="movieId", how="inner")

# Titles as rows, users as columns; a missing rating is stored as 0.
movie_pivot = top50_n_ratings.pivot_table(columns="userId", index="title", values="rating")
movie_pivot.fillna(0, inplace=True)

# Same data as a CSR sparse matrix.
movie_sparse = csr_matrix(movie_pivot)

In [None]:
top50_n_ratings


In [None]:
def make_model(movie_sparse, movie_pivot):
    """Fit a brute-force cosine nearest-neighbour model on the rating matrix.

    Returns ``(movie_names, model)``: the pivot's index as a plain list and
    the fitted ``NearestNeighbors`` estimator.
    """
    # Converted to a plain list; the original note says this was done for Streamlit.
    movie_names = movie_pivot.index.tolist()

    model = NearestNeighbors(algorithm="brute", metric="cosine")
    model.fit(movie_sparse)

    return movie_names, model

movie_names, model = make_model(movie_sparse, movie_pivot)

In [None]:
import numpy as np

def recommend(movie_name, movie_pivot, model, top50_n_ratings, top_n=5):
    """Return the titles whose rating rows are closest to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Title to look up in ``movie_pivot.index`` (exact match).
    movie_pivot : pandas.DataFrame
        Pivot with titles as index; its rows are the query vectors.
    model : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)``.
    top50_n_ratings : pandas.DataFrame
        Unused; kept so existing call sites keep working.
    top_n : int, default 5
        Number of neighbours wanted in addition to the query title itself.

    Returns
    -------
    list
        Up to ``top_n + 1`` titles in the order returned by ``kneighbors``.

    Raises
    ------
    IndexError
        If ``movie_name`` is not in ``movie_pivot.index``.
    """
    # Compare against the `movie_name` argument. The previous code compared
    # against the notebook-level list `movie_names`, so the requested title
    # was never actually looked up.
    positions = np.where(movie_pivot.index == movie_name)[0]
    if len(positions) == 0:
        raise IndexError(f"movie title {movie_name!r} not found in movie_pivot")

    # Never request more neighbours than the pivot has rows.
    n_neighbors = min(top_n + 1, len(movie_pivot))
    query = movie_pivot.iloc[positions[0], :].values.reshape(1, -1)
    distance, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # poster_url = get_poster(suggestion, book_pivot, final_rating)

    # Flatten the per-query rows of neighbour positions into one list of titles.
    return [title for row in suggestion for title in movie_pivot.index[row]]
#, poster_url
movie_name = "heat 1995"
movie_list = recommend(movie_name, movie_pivot, model, top50_n_ratings)

In [None]:
""" slutlig lista på de fem efter TF IDF och kneighbors, oklart hur lång tid uträkningen tar... 
kanske kneighbors inte är den mest optimala här.. """

movie_list[1:]

In [None]:
""" ============================================================================================="""

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Build the combined "genres + tags" text here. The only cell that creates
# tfidf_movies["tfidf"] is commented out, so that name does not exist on a
# fresh run; this expression is the same one that cell used.
tfidf_text = filtered_movies["genres"] + " " + filtered_movies["tag"]

vectorizer = TfidfVectorizer()
# NOTE: this rebinds `model` (used by earlier cells for the NearestNeighbors
# estimator) to the TF-IDF document matrix that the search code reads.
model = vectorizer.fit_transform(tfidf_text)

In [None]:
import numpy as np

def search(title, top_n=50):
    """Return (title, genres) pairs of the movies most similar to a text query.

    Parameters
    ----------
    title : str
        Free-text query; it is transformed with the notebook-level ``vectorizer``.
    top_n : int, default 50
        Maximum number of rows to return.

    Returns
    -------
    numpy.ndarray
        Rows of ``[title, genres]`` taken from the notebook-level
        ``filtered_movies``. ``argpartition`` selects the top rows without
        sorting them, so the rows are not ordered by similarity.
    """
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, model).flatten()

    # argpartition needs its k to be within bounds, so cap it at the row count.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return filtered_movies.iloc[0:0][["title", "genres"]].values

    indices = np.argpartition(similarity, -top_n)[-top_n:]
    return filtered_movies.iloc[indices][["title", "genres"]].values


title = "toy story 1995"
result = search(title)

result

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box in which the user types a movie title.
movie_input = widgets.Text(
    value= "toy story 1995",
    description = "Movie:",
    disabled = False
)
# Output area for the search results.
# NOTE(review): this rebinds `movie_list`, which earlier cells use for a list
# of recommended titles.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the output area whenever the text box value changes.

    `data` is the change dictionary passed by `observe`; data["new"] is read
    as the new text. Queries of three characters or fewer are ignored.
    Calls `search(title)`, which must be defined before the widget fires.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 3:
            display(search(title))

# Call on_type on every change of the widget's "value" trait.
movie_input.observe(on_type, names="value")

display(movie_input, movie_list)