## Skapa gles matris

In [2]:
import pandas as pd

with open ('./ml-25m/ratings.csv') as ratings, open('./ml-25m/movies.csv') as movies:
    df_ratings = pd.read_csv(ratings)
    df_movies = pd.read_csv(movies)

df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [3]:
df_ratings.drop('timestamp', axis = 1, inplace = True)
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


The main filtering on Dataframe is the removal of users with above 1000 ratings. There were extreme rating count values for some users (30000+) which i feared could be bot action. An average american see an average of 5000 movies during it's lifetime, rating more than a fifth of those under the lifespan of a rating system is in my opinion extreme.

In [4]:
ratings_count = df_ratings.userId.value_counts()
counts_df = ratings_count.rename_axis('userId').reset_index(name='counts') # Get right columns userId and count
counts_df_too_many = counts_df[counts_df['counts'] > 1000] # filter df where count > 1000


print(sum(counts_df_too_many['counts'])) #Removed ratings

counts_df.drop(counts_df[counts_df['userId'].isin(counts_df_too_many['userId'])].index, axis = 0, inplace= True) # Get user ids, Drop users on main dataframe

counts_df


4187357


Unnamed: 0,userId,counts
2670,140790,1000
2671,37097,1000
2672,860,1000
2673,9742,1000
2674,158351,1000
...,...,...
162536,12094,20
162537,119539,20
162538,156759,20
162539,12084,20


Remove users that may be robots or too far away from mean ? 

In [5]:
df_ratings.drop(df_ratings[~df_ratings['userId'].isin(counts_df['userId'])].index, axis = 0, inplace = True) #Drops users with 1000 + reviews (not in counts_df)
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [6]:
#movies_users = df_ratings.pivot(index ="movieId", columns = "userId", values ="rating").fillna(0)
print(df_ratings.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20812738 entries, 0 to 25000094
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 635.2 MB
None


Switching 64 bits encoded values to 32 bits in orders to allow performing a pivot on a local machine without bursting ressources

In [7]:
df_ratings['userId'] = df_ratings['userId'].to_numpy('int32')
df_ratings['movieId'] = df_ratings['movieId'].to_numpy('int32')
df_ratings['rating'] = df_ratings['movieId'].to_numpy('float32')
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20812738 entries, 0 to 25000094
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 397.0 MB


In [8]:
movies_users = df_ratings.pivot(index ="movieId", columns = "userId", values ="rating").fillna(0)




KeyboardInterrupt: 

In [None]:
movies_users

userId,1,2,3,4,5,6,7,8,9,10,...,162532,162533,162534,162535,162536,162537,162538,162539,162540,162541
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
movies_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42685 entries, 1 to 209171
Columns: 159871 entries, 1 to 162541
dtypes: float32(159871)
memory usage: 25.4 GB


In [None]:
from scipy.sparse import csr_matrix

mat_movies_users = csr_matrix(movies_users.values)

In [None]:
BYTES_TO_MB_DIV = 0.000001
def print_memory_usage_of_csr(csr): # Function found on https://datascience.stackexchange.com/questions/80398/setting-sparse-true-in-scikit-learn-onehotencoder-does-not-reduce-memory-usage
    mem = round(csr.data.nbytes * BYTES_TO_MB_DIV, 3) 
    print("Memory usage is " + str(mem) + " MB")
    
print_memory_usage_of_csr(mat_movies_users)

Memory usage is 83.251 MB


In [None]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric ='cosine', algorithm='brute', n_neighbors=20)
model_knn.fit(mat_movies_users)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)

In [None]:
from fuzzywuzzy import process

def recommender(movie_name, data, model, n_recommendations):
    model.fit(data)
    idx = process.extractOne(movie_name, df_movies['title'])[2]
    print(f"Movie Selected : {df_movies['title'][idx]} Index {idx}")
    print('Searching for recommendations......')
    distances, indices= model.kneighbors(data[idx], n_neighbors = n_recommendations)
    for i in indices:
        print(df_movies['title'][i].where(i!=idx))
    print(distances, indices)

recommender('toy story', mat_movies_users, model_knn, 10)

Movie Selected : Toy Story (1995) Index 0
Searching for recommendations......
0                                                     NaN
257             Star Wars: Episode IV - A New Hope (1977)
764                  Independence Day (a.k.a. ID4) (1996)
351                                   Forrest Gump (1994)
1237                            Back to the Future (1985)
475                                  Jurassic Park (1993)
1179    Star Wars: Episode VI - Return of the Jedi (1983)
3020                                   End of Days (1999)
359                                 Lion King, The (1994)
580                                        Aladdin (1992)
Name: title, dtype: object
[[0.         0.42855638 0.43705177 0.4393037  0.45127165 0.45182884
  0.45683002 0.4713617  0.47674632 0.4846387 ]] [[   0  257  764  351 1237  475 1179 3020  359  580]]


Intersting results as the two other movies from the star wars original trilogy are recommended when entering the first.
ABove all I'd say that movies recommended are in nature movies that had been rated many times. I suspect similar vectors to be due to the redundancy of movies seen by many people, and that don't have many "0" or less "0" values in dimesnions.

In [None]:
recommender('Star Wars: Episode IV', mat_movies_users, model_knn, 10)

Movie Selected : Star Wars: Episode IV - A New Hope (1977) Index 257
Searching for recommendations......
257                                                   NaN
1166    Star Wars: Episode V - The Empire Strikes Back...
1179    Star Wars: Episode VI - Return of the Jedi (1983)
1168    Raiders of the Lost Ark (Indiana Jones and the...
2479                           Walk on the Moon, A (1999)
1237                            Back to the Future (1985)
1258            Indiana Jones and the Last Crusade (1989)
4884                     Jimmy Neutron: Boy Genius (2001)
1207                               Terminator, The (1984)
0                                        Toy Story (1995)
Name: title, dtype: object
[[0.0007084  0.204597   0.23366964 0.3169508  0.3539902  0.40254706
  0.40340585 0.42485374 0.42570794 0.42855638]] [[ 257 1166 1179 1168 2479 1237 1258 4884 1207    0]]
