In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")
tags = pd.read_csv("./data/tags.csv")


# Data info

movies.info()
ratings.info()
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328315 entries, 0 to 2328314
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   userId     int64 
 1   movieId    int64 
 2   tag        object
 3   timestamp  int64 
dtypes: int64(3), object(1)
memory usage: 71.1+ MB


In [13]:
# Vi börjar med att ta bort dubletter på filmer, eftersom det kan finnas samma film i både tex DVD eller VHS och online osv.
unique_movies = movies.drop_duplicates('title') 

In [14]:
# sätter ihop ratings och unique movies
ratings_with_unique_names = ratings.merge(unique_movies, on = 'movieId')

#ta bort timestamp och movieId då den inte behövs samt
ratings_with_unique_names.drop(['timestamp', 'movieId'], axis = 1, inplace = True)

# dvs filmer utan dubletter tillsammans med betyg
ratings_with_unique_names



Unnamed: 0,userId,rating,title,genres
0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,4.0,Braveheart (1995),Action|Drama|War
2,1,4.0,Casper (1995),Adventure|Children
3,1,4.5,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
4,1,5.0,Forrest Gump (1994),Comedy|Drama|Romance|War
...,...,...,...,...
33828462,330975,2.0,Escape from Alcatraz (1979),Drama|Thriller
33828463,330975,2.5,Memphis Belle (1990),Action|Drama|War
33828464,330975,4.0,Fahrenheit 9/11 (2004),Documentary
33828465,330975,3.0,"Bourne Supremacy, The (2004)",Action|Crime|Thriller


In [15]:

x = ratings_with_unique_names.groupby('userId').count()['rating'] > 100 #tar ort alla som inte betygsatt fler än 100 böcker, dvs tar fram bara stora användare som betygsatt fler böcker än 100
knowledgeable_users = x[x].index
# tar fram vilka filmer varje user har betygsatt
filtered_user_ratings = ratings_with_unique_names[ratings_with_unique_names['userId'].isin(knowledgeable_users)]


# filtrerar ut filmer som har betygsatt av fler än 50 personer, dvs tar bort filmer som ingen sett eller betygsatt
y = ratings_with_unique_names.groupby('title').count()['rating'] >= 50
famous_movies = y[y].index

# tar fram filmer som har betygsatt av fler än 50 personer och som är betygsatta av de stora användarna
final_user_ratings = ratings_with_unique_names[ratings_with_unique_names['title'].isin(famous_movies)]



In [16]:

# dra ner på filen till 0.001 av datan för att få en mer hanterbar fil
final_user_ratings = final_user_ratings.sample(frac=0.001, random_state=42)

# skapa en pivot table
pivot_table = final_user_ratings.pivot_table(index='title', columns='userId', values='rating')

# Gör alla NaN till 0
pivot_table.fillna(0, inplace= True)
pivot_table



userId,22,35,37,41,53,69,90,95,123,142,...,330855,330863,330884,330911,330912,330914,330929,330961,330967,330970
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC]² (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iBoy (2017),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# StandardScaler, används för att normalisera alla features (tex. längd, ålder) för tex ålder kan bara vara upp till 100 medan längd är mellan 0-200 då tror datorn att längd är viktigare, 
# StandardScaler/ Normalisering gör att alla features utgår från medelvärde på sin feature så att datorn får mer rättvis värde

scaler = StandardScaler(with_mean= True, with_std= True)
# fit skrive ut medelvärdet och standard avvikelsen i vajre kolumn, transform ger den ett värde mellan 0 och 1
pivot_table_normalized = scaler.fit_transform(pivot_table)

    

In [18]:
# similarity matrix for all  movies
similarity_score = cosine_similarity(pivot_table_normalized)

In [19]:
def recommend (movie_name, similarity_score, pivot_table, n_recommendations=5):
    # hämta index för filmen som matchar titeln
    movie_index = pivot_table.index.get_loc(movie_name)

    # hämta alla filmer som liknar den filmen 
    similarity_score_movie = list(enumerate(similarity_score[movie_index]))

    # sortra filmerna baserat på likhetspoäng
    sorted_similar_movies = sorted(similarity_score_movie, key=lambda x: x[1], reverse=True)

    # hämta de mest liknande filmerna
    sorted_similar_movies = sorted_similar_movies[1:n_recommendations+1]

    # hämta index för filmerna
    movie_indices = [i[0] for i in sorted_similar_movies]

    # returnera de mest liknande filmerna
    return pivot_table.index[movie_indices]

In [20]:
recommend('Godfather, The (1972)', similarity_score, pivot_table, n_recommendations=10) 

Index(['Before Midnight (2013)', 'Dial M for Murder (1954)',
       'Shanghai Knights (2003)', 'Misery (1990)', 'Sneakers (1992)',
       'Snatch (2000)', 'V for Vendetta (2006)', 'Shakespeare in Love (1998)',
       'Spider-Man (2002)', 'Blade Runner (1982)'],
      dtype='object', name='title')