In [None]:
# ----------- 1. Prepare Problem ---------------
''' the MovieLens dataset for this purpose. It has been collected by the GroupLens Research Project at the University of Minnesota. MovieLens 100K dataset

These files contain 1,000,209 anonymous ratings of approximately 
3,900 movies made by 
6,040 MovieLens users who joined MovieLens in 2000

RATINGS FILE DESCRIPTION  - UserID::MovieID::Rating::Timestamp
USERS FILE DESCRIPTION    - UserID::Gender::Age::Occupation::Zip-code
MOVIES FILE DESCRIPTION   - MovieID::Title::Genres

- Some MovieIDs do not correspond to a movie due to accidental duplicate entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist

The goal of this project is to predict the rating given a user and a movie
'''

In [None]:
# Collaborative filtering systems use the actions of users to recommend other movies.

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:

# b) Load dataset
#Reading Ratings files:
rnames = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_table('ratings.dat', sep = '::', header = None, names = rnames, engine='python')

#Reading users files:
unames = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
users = pd.read_table('users.dat', sep = '::', header = None, names = unames, engine='python')

#Reading movies files:
mnames = ['MovieID', 'Title', 'Genres']
movies = pd.read_table('movies.dat', sep = '::', header = None, names = mnames, engine='python')

In [18]:
#Dimensions of Dataset
# shape
print("ratings :")
print(ratings.shape)
print("users :")
print(users.shape)
print("movies :")
print(movies.shape)

ratings :
(1000209, 4)
users :
(6040, 5)
movies :
(3883, 3)


In [19]:
# 2. Summarize Data
ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [20]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [22]:
ratings['Rating'].value_counts()

4    348971
3    261197
5    226310
2    107557
1     56174
Name: Rating, dtype: int64

In [23]:
users['Age'].value_counts()

25    2096
35    1193
18    1103
45     550
50     496
56     380
1      222
Name: Age, dtype: int64

In [32]:
dataset = pd.merge(pd.merge(users, ratings), movies)

In [33]:
dataset.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [26]:
dataset.tail(20)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genres
1000189,5532,M,25,17,27408,404,5,959619841,Brother Minister: The Assassination of Malcolm...,Documentary
1000190,5543,M,25,17,97401,404,3,960127592,Brother Minister: The Assassination of Malcolm...,Documentary
1000191,5220,M,25,7,91436,2543,3,961546137,Six Ways to Sunday (1997),Comedy
1000192,5754,F,18,1,60640,2543,4,958272316,Six Ways to Sunday (1997),Comedy
1000193,5227,M,18,10,64050,591,3,961475931,Tough and Deadly (1995),Action|Drama|Thriller
1000194,5795,M,25,1,92688,591,1,958145253,Tough and Deadly (1995),Action|Drama|Thriller
1000195,5313,M,56,0,55406,3656,5,960920392,Lured (1947),Crime
1000196,5328,F,25,4,91740,2438,4,960838075,Outside Ozona (1998),Drama|Thriller
1000197,5334,F,56,13,46140,3323,3,960796159,Chain of Fools (2000),Comedy|Crime
1000198,5334,F,56,13,46140,127,1,960795494,"Silence of the Palace, The (Saimt el Qusur) (1...",Drama


In [27]:
dataset.shape

(1000209, 10)

In [28]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
UserID        1000209 non-null int64
Gender        1000209 non-null object
Age           1000209 non-null int64
Occupation    1000209 non-null int64
Zip-code      1000209 non-null object
MovieID       1000209 non-null int64
Rating        1000209 non-null int64
Timestamp     1000209 non-null int64
Title         1000209 non-null object
Genres        1000209 non-null object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [34]:
dataset[['Age', 'Rating']].groupby(['Age'], as_index=False).mean().sort_values(by='Age', ascending=True)
#(dataset.groupby(['Age', 'Rating'], as_index=False).mean().groupby('Age')['Rating'].mean())

Unnamed: 0,Age,Rating
0,1,3.54952
1,18,3.507573
2,25,3.545235
3,35,3.618162
4,45,3.638062
5,50,3.714512
6,56,3.766632


In [30]:
dataset['Age'] <= 18
# =['1', dataset['Age'] < 18]
#dataset['Age'] < 18

0           True
1          False
2          False
3          False
4          False
           ...  
1000204     True
1000205    False
1000206     True
1000207     True
1000208    False
Name: Age, Length: 1000209, dtype: bool

In [37]:
from scipy.sparse import csr_matrix

# pivot ratings into movie features
movies_features = ratings.pivot(
    index='MovieID',
    columns='UserID',
    values='Rating'
).fillna(0)

# Convert dataframe of movie features to scipy sparse matrix
mat_movie_features = csr_matrix(movies_features.values)

In [38]:
movies_features.head()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [43]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute', n_neighbors=20, n_jobs=-1))

In [45]:
def make_recommendations(self, fav_movie, n_recommendations):
    '''
    make top n movie recommendations perameters
    '''
    # get data
    movie_user_mat_sparse, hashmap = self._prep_data()
    # get recommendations
    raw_recommends = self._interface(
        self.model, movie_user_mat_sparse, hashmap,
        fav_movie, n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance ')
    )

IndentationError: unexpected indent (<ipython-input-45-8d136c4aa442>, line 12)