In [1]:
import pandas as pd
from scipy.sparse import csr_matrix


tags = pd.read_csv("tags.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [2]:
movies["genres"] = movies["genres"].str.replace("|", " ", regex=False)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action Drama
86533,288971,Ouija Japan (2021),Action Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime Thriller


In [3]:
# filtererad_data = tags[tags["tag"].str.contains("^[a-zA-Z]+$", na=False)]
tags = tags[tags['tag'].apply(lambda x: isinstance(x, str))]
# tags

In [4]:
merged_tags = tags.groupby("movieId")["tag"].apply(lambda x: " ".join(set(x))).reset_index()
merged_tags = merged_tags[merged_tags['tag'].apply(lambda x: isinstance(x, str))]

merged_tags

Unnamed: 0,movieId,tag
0,1,emotional Os dois viram loyal friend remote co...
1,2,pigtails plant discount store namesake dybbuk ...
2,3,Walter Matthau Minnesota old people that is ac...
3,4,girl movie slurs revenge based on novel or boo...
4,5,steve martin worst movies ever humorous weddin...
...,...,...
53447,288765,tw suicide apocalypse bad science survival plo...
53448,288779,Don Camillo Series
53449,288849,short film animation addiction
53450,288937,anime


In [5]:
filtered_movies = pd.merge(movies, merged_tags, on="movieId", how="inner")
filtered_movies


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,emotional Os dois viram loyal friend remote co...
1,2,Jumanji (1995),Adventure Children Fantasy,pigtails plant discount store namesake dybbuk ...
2,3,Grumpier Old Men (1995),Comedy Romance,Walter Matthau Minnesota old people that is ac...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,girl movie slurs revenge based on novel or boo...
4,5,Father of the Bride Part II (1995),Comedy,steve martin worst movies ever humorous weddin...
...,...,...,...,...
53447,288765,Bird Box Barcelona (2023),Horror Thriller,tw suicide apocalypse bad science survival plo...
53448,288779,Don Camillo: Monsignor (1961),Comedy,Don Camillo Series
53449,288849,Colaholic (2018),(no genres listed),short film animation addiction
53450,288937,Blue Thermal (2022),Animation Comedy Drama Romance,anime


In [6]:
filtered_movies["tfidf"] = filtered_movies["genres"] + " " + filtered_movies["tag"]
filtered_movies["tfidf"] = filtered_movies["tfidf"].apply(lambda x: x.lower())
filtered_movies
# lägg till tmdb url här


Unnamed: 0,movieId,title,genres,tag,tfidf
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,emotional Os dois viram loyal friend remote co...,adventure animation children comedy fantasy em...
1,2,Jumanji (1995),Adventure Children Fantasy,pigtails plant discount store namesake dybbuk ...,adventure children fantasy pigtails plant disc...
2,3,Grumpier Old Men (1995),Comedy Romance,Walter Matthau Minnesota old people that is ac...,comedy romance walter matthau minnesota old pe...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,girl movie slurs revenge based on novel or boo...,comedy drama romance girl movie slurs revenge ...
4,5,Father of the Bride Part II (1995),Comedy,steve martin worst movies ever humorous weddin...,comedy steve martin worst movies ever humorous...
...,...,...,...,...,...
53447,288765,Bird Box Barcelona (2023),Horror Thriller,tw suicide apocalypse bad science survival plo...,horror thriller tw suicide apocalypse bad scie...
53448,288779,Don Camillo: Monsignor (1961),Comedy,Don Camillo Series,comedy don camillo series
53449,288849,Colaholic (2018),(no genres listed),short film animation addiction,(no genres listed) short film animation addiction
53450,288937,Blue Thermal (2022),Animation Comedy Drama Romance,anime,animation comedy drama romance anime


In [7]:
import unicodedata
import re

def clean_text(text):
    # Normalisera till NFKD-form och filtrera bort icke-ASCII-tecken
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode()
    
    # Ta bort icke-bokstäver, siffror och mellanslag
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Ta bort extra mellanslag
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [8]:
filtered_movies["tfidf"] = filtered_movies["tfidf"].apply(clean_text)
filtered_movies = filtered_movies[["movieId", "title", "tfidf"]]

In [9]:
filtered_movies

Unnamed: 0,movieId,title,tfidf
0,1,Toy Story (1995),adventure animation children comedy fantasy em...
1,2,Jumanji (1995),adventure children fantasy pigtails plant disc...
2,3,Grumpier Old Men (1995),comedy romance walter matthau minnesota old pe...
3,4,Waiting to Exhale (1995),comedy drama romance girl movie slurs revenge ...
4,5,Father of the Bride Part II (1995),comedy steve martin worst movies ever humorous...
...,...,...,...
53447,288765,Bird Box Barcelona (2023),horror thriller tw suicide apocalypse bad scie...
53448,288779,Don Camillo: Monsignor (1961),comedy don camillo series
53449,288849,Colaholic (2018),no genres listed short film animation addiction
53450,288937,Blue Thermal (2022),animation comedy drama romance anime


In [10]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086


In [22]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy


In [11]:
""" Funktionen tar väldigt lång tid"""

def extract_features(movies, ratings):
    x = ratings["userId"].value_counts() > 200
    y = x[x].index
    ratings = ratings[ratings["userId"].isin(y)]
    ratings_with_movies = ratings.merge(movies, on="movieId")

    num_rating = ratings_with_movies.groupby("title")["rating"].count().reset_index()

    num_rating.rename(columns={"rating": "num_of_rating"}, inplace=True)
    final_rating = ratings_with_movies.merge(num_rating, on="title")
    final_rating = final_rating[final_rating["num_of_rating"] > 50]
    final_rating.drop_duplicates(["userId", "title"], inplace=True)

    movie_pivot = final_rating.pivot_table(columns="userId", index="title", values="rating")
    movie_pivot.fillna(0, inplace=True)

    movie_sparse = csr_matrix(movie_pivot)

    return movie_sparse, movie_pivot, final_rating

movie_sparse, movie_pivot, final_rating = extract_features(movies, ratings)

In [12]:
final_rating.shape, movie_pivot.shape

((20738515, 7), (15226, 42608))

In [13]:
# movie_pivot.head(1) # filmen i en kolumn och en användare per kolumn med rating per film.
# movie_sparse # <15226x42608 sparse matrix of type '<class 'numpy.float64'>' with 20738515 stored elements in Compressed Sparse Row format>
final_rating = final_rating[["userId", "movieId", "rating", "title"]]
final_rating.head(20)

Unnamed: 0,userId,movieId,rating,title
0,21,1,3.0,Toy Story (1995)
1,21,2,3.0,Jumanji (1995)
2,21,10,3.5,GoldenEye (1995)
3,21,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
4,21,39,2.5,Clueless (1995)
5,21,48,3.0,Pocahontas (1995)
6,21,62,4.0,Mr. Holland's Opus (1995)
7,21,104,4.0,Happy Gilmore (1996)
8,21,141,4.0,"Birdcage, The (1996)"
9,21,150,4.0,Apollo 13 (1995)


In [14]:
from sklearn.neighbors import NearestNeighbors

In [15]:
def make_model(movie_sparse, movie_pivot):
    model = NearestNeighbors(metric="cosine", algorithm="auto")
    model.fit(movie_sparse)
    movie_names = movie_pivot.index.tolist()

    return movie_names, model

movie_names, model = make_model(movie_sparse, movie_pivot)

In [16]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
# from surprise import SVD, Dataset, Reader
# from surprise.model_selection import train_test_split

In [17]:
# def recommend(movie_names, book_pivot, model, final_rating): med posters?

def recommend(selected_movies, movie_pivot, model, n_neighbors=6):
    movie_list = []

    movie_id = np.where(movie_pivot.index == selected_movies)[0][0]
    distance, suggestion = model.kneighbors(movie_pivot.iloc[movie_id,:].values.reshape(1,-1), n_neighbors = n_neighbors)
    
    for i in range(len(suggestion)):
        movies = movie_pivot.index[suggestion[i]]
        for j in movies:
            movie_list.append(j)

    return movie_list

""" ==================================================================================================="""
""" TEST """

# reader = Reader(rating_scale=(0.5, 5.0))
# data = Dataset.load_from_df(final_rating[['userId', 'title', 'rating']], reader)
# trainset, testset = train_test_split(data, test_size=0.2)

# svd = SVD()
# svd.fit(trainset)

# def predict_rating(movie_list, svd_model, user_id=1, top_n=5):
#     predicted_ratings = {}

#     for movie in movie_list:
#         movie_id = movie_to_id[movie]  # Omvandla titel till ID
#         pred = svd_model.predict(user_id, movie_id).est  # Förutsäg betyg
#         predicted_ratings[movie] = pred

#     # Sortera filmer efter högst betyg och returnera de 5 bästa
#     sorted_movies = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)[:top_n]
    
#     return sorted_movies  # Returnerar de 5 mest relevanta filmerna


# def hybrid_recommend(selected_movie, movie_pivot, model, svd_model):
#     # 1. Hämta liknande filmer via KNN
#     knn_recommendations = recommend(selected_movie, movie_pivot, model)
    
#     top_5_movies = predict_rating(recommendations, svd, user_id=1, top_n=5)
#     print(top_5_movies)  # Skriver ut de 5 bästa filmerna
    
#     return final_recommendations

# selected_movie = "Batman Forever (1995)"
# recommendations = hybrid_recommend(selected_movie, movie_pivot, model, svd)

def recommend2(selected_movies, movie_pivot, model, n_neighbors = 200):

    movie_id = np.where(movie_pivot.index == selected_movies)[0][0]
    distance, suggestion = model.kneighbors(movie_pivot.iloc[movie_id,:].values.reshape(1,-1), n_neighbors = n_neighbors+1)
    
    rec_movies = pd.DataFrame({"title": [movie_pivot.index[i] for i in suggestion[0] if i != movie_id], "cosine_simularity": distance[0][1:]})

    ratings_summary = final_rating.groupby("title")["rating"].agg(["mean", "count"]).reset_index()
    rec_movies = rec_movies.merge(ratings_summary, on="title", how="left")

    x = rec_movies[["cosine_simularity", "mean", "count"]].fillna(0)
    y = rec_movies["mean"]

    model_lr = LinearRegression()
    model_lr.fit(x, y)

    rec_movies["score"] = model_lr.predict(x)
    top_5 = rec_movies.sort_values("score", ascending=False).head(5)

    return top_5[["title", "score"]]


movie_names = "Jumanji (1995)"
print(recommend(movie_names, movie_pivot, model, n_neighbors = 6))

['Jumanji (1995)', 'Jurassic Park (1993)', 'Lion King, The (1994)', 'Mask, The (1994)', 'Men in Black (a.k.a. MIB) (1997)', 'Home Alone (1990)']


In [18]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

def recommend_with_regression(selected_movie, movie_pivot, model, final_rating, n_neighbors=200):
    # Hitta liknande filmer med kNN
    if selected_movie not in movie_pivot.index:
        return f"Filmen '{selected_movie}' hittades inte i datasetet."
    
    movie_id = np.where(movie_pivot.index == selected_movie)[0][0]
    distance, suggestion = model.kneighbors(movie_pivot.iloc[movie_id, :].values.reshape(1, -1), n_neighbors=n_neighbors+1)
    
    # Skapa DataFrame med rekommenderade filmer
    recommended_movies = pd.DataFrame({
        "title": [movie_pivot.index[i] for i in suggestion[0] if i != movie_id],
        "cosine_similarity": distance[0][1:]
    })
    
    # Lägg till medelvärde av betyg och antal betyg
    ratings_summary = final_rating.groupby("title")["rating"].agg(["mean", "count"]).reset_index()
    recommended_movies = recommended_movies.merge(ratings_summary, on="title", how="left")
    
    scaler = MinMaxScaler()
    recommended_movies[["mean", "count", "cosine_similarity"]] = scaler.fit_transform(recommended_movies[["mean", "count", "cosine_similarity"]].fillna(0))

    # Träna linjär regression
    X = recommended_movies[["cosine_similarity", "mean", "count"]]
    y = recommended_movies["mean"]  # Använder medelbetyget som målvariabel
    
    model_rf = RandomForestRegressor()
    model_rf.fit(X, y)
    recommended_movies["score"] = model_rf.predict(X)
    top_5_movies = recommended_movies.sort_values("score", ascending=False).head(5)
    
    return top_5_movies[["title", "score"]]

movie_names = "Godfather, The (1972)"
print(recommend_with_regression(movie_names, movie_pivot, model, final_rating, n_neighbors = 200))


                                           title     score
4               Shawshank Redemption, The (1994)  0.970161
122                          12 Angry Men (1957)  0.914997
0                 Godfather: Part II, The (1974)  0.910418
177  Seven Samurai (Shichinin no samurai) (1954)  0.909428
1                            Pulp Fiction (1994)  0.906393


In [19]:
import streamlit as st

def streamlit(movie_names, movie_pivot, model):
    st.title("Movie Recommender by LM")

    selected_movies = st.selectbox("Type or select a movie", movie_names)

    if st.button("Show Recommendeation"):
        movie_list = recommend(selected_movies, movie_pivot, model)
        col1, col2, col3, col4, col5, = st.columns(5)
        
        with col1:
            st.text(movie_list[1])
        with col2:
            st.text(movie_list[2])
        with col3:
            st.text(movie_list[3])
        with col4:
            st.text(movie_list[4])
        with col5:
            st.text(movie_list[5])

In [20]:
def main():
    movie_sparse, movie_pivot, final_rating = extract_features(movies, ratings)
    movie_names, model = make_model(movie_sparse, movie_pivot)
    movie_names = "GoldenEye (1995)"
    print(recommend(movie_names, movie_pivot, model))

    # streamlit(movie_names, movie_pivot, model)

In [21]:
if __name__ == "__main__":
    main()

['GoldenEye (1995)', 'Tomorrow Never Dies (1997)', 'Mission: Impossible (1996)', 'Rock, The (1996)', 'Die Hard: With a Vengeance (1995)', 'True Lies (1994)']
