# Movielens
Analyzing movies 

### Load data

In [1]:
#http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
#http://www.grouplens.org/system/files/ml-1m.zip
import pandas as pd
from io import StringIO 
movies = pd.read_csv('data/movies.dat', delimiter='::', header=False)
ratings = pd.read_csv('data/ratings.dat', delimiter='::', header=False)



In [2]:
movies.columns = ['id', 'name', 'genre']
movies.head()

Unnamed: 0,id,name,genre
0,2,Jumanji (1995),Adventure|Children's|Fantasy
1,3,Grumpier Old Men (1995),Comedy|Romance
2,4,Waiting to Exhale (1995),Comedy|Drama
3,5,Father of the Bride Part II (1995),Comedy
4,6,Heat (1995),Action|Crime|Thriller


In [3]:
ratings.columns = ['user', 'item', 'rating', 'timestamp']

In [4]:
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268


In [5]:
len(ratings)

1000208

In [6]:
import requests
import pandas as pd
from io import StringIO  
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt) # treat a string like a file
movies = pd.read_csv(movie_file, delimiter='\t')

#print the first row




In [8]:
movies.head()

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


## user - item matrix

In [9]:
ratings[['user', 'item', 'rating']].head()

Unnamed: 0,user,item,rating
0,1,661,3
1,1,914,3
2,1,3408,4
3,1,2355,5
4,1,1197,3


In [10]:
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix

L = len(ratings)
rows  = ratings['user'][0:L]
cols  = ratings['item'][0:L]
data = ratings['rating'][0:L]

In [11]:
R=coo_matrix((data,(rows,cols)),shape=(max(rows)+1,max(cols)+1))
X=R.tocsr()

In [12]:
# reduce dimensionality
#X = tfidf_matrix
from sklearn.decomposition import TruncatedSVD
X = TruncatedSVD(n_components=100).fit_transform(X)
#X = tfidf_matrix.todense().tolist()

In [13]:
# nearest neighbors
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import *
N = 10
knn = NearestNeighbors(N, algorithm='brute', metric='euclidean', leaf_size=1000).fit(X)
distances, indices = knn.kneighbors(X,n_neighbors=N) 

dim = indices.shape
myidx = pd.Series(range(dim[0]*dim[1]))//dim[1]
df_dist = pd.DataFrame(dict(i1=myidx, i2=indices.flatten(), distance=distances.flatten()))
df_dist.head(10)

Unnamed: 0,distance,i1,i2
0,0.0,0,0
1,1.752658,0,4486
2,2.789018,0,3598
3,3.793403,0,4349
4,4.218396,0,4636
5,4.773106,0,1102
6,5.228383,0,4211
7,5.491486,0,4365
8,5.64529,0,4192
9,5.717658,0,6012


In [14]:
df_merge = pd.DataFrame.merge(df_dist, movies, left_on='i1', right_on='id')
df_merge = pd.DataFrame.merge(df_merge, movies, left_on='i2', right_on='id')
df_merge.head(10)
#df_merge = df_merge.query("i1 != i2")
df = df_merge[['id_x', 'title_x', 'id_y', 'title_y', 'distance']]
df.head(10)


Unnamed: 0,id_x,title_x,id_y,title_y,distance
0,1,Toy story,1,Toy story,0.0
1,340,War,1,Toy story,21.104665
2,413,Airheads,1,Toy story,24.162289
3,2766,The Adventures of Sebastian Cole,1,Toy story,23.770819
4,5343,The Temp,1,Toy story,15.722863
5,6006,Just Married,1,Toy story,19.941382
6,1,Toy story,5190,Inside Moves,14.64557
7,119,Steal Big Steal Little,5190,Inside Moves,17.518717
8,184,Nadja,5190,Inside Moves,16.440114
9,490,Malice,5190,Inside Moves,17.594331


## Results

In [15]:
df[df["title_x"] == "Star Wars"]

Unnamed: 0,id_x,title_x,id_y,title_y,distance
7202,260,Star Wars,3968,Bedazzled,21.386946
9479,260,Star Wars,260,Star Wars,0.0
9517,260,Star Wars,5318,Joshua,18.351911
11355,260,Star Wars,93,Vampire in Brooklyn,19.590723
11410,260,Star Wars,1030,Pete's Dragon,21.033806
12020,260,Star Wars,2894,Romance,19.579619
12086,260,Star Wars,5332,The Aviator,19.38068
12096,260,Star Wars,1468,Booty Call,18.505022
22773,260,Star Wars,5227,Barabba,19.965073
22782,260,Star Wars,2983,The Ipcress File,21.101555


In [16]:
df[df["title_x"] == "Star Wars"].sort("distance")

Unnamed: 0,id_x,title_x,id_y,title_y,distance
9479,260,Star Wars,260,Star Wars,0.0
9517,260,Star Wars,5318,Joshua,18.351911
12096,260,Star Wars,1468,Booty Call,18.505022
12086,260,Star Wars,5332,The Aviator,19.38068
12020,260,Star Wars,2894,Romance,19.579619
11355,260,Star Wars,93,Vampire in Brooklyn,19.590723
22773,260,Star Wars,5227,Barabba,19.965073
11410,260,Star Wars,1030,Pete's Dragon,21.033806
22782,260,Star Wars,2983,The Ipcress File,21.101555
7202,260,Star Wars,3968,Bedazzled,21.386946
