# MovieLens
Finding similar movies from MovieLens user ratings.

### Load data

In [1]:
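# Smaller alternative dataset (CSV files; apparently not the one loaded below):
#http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# MovieLens 1M -- the '::'-delimited movies.dat / ratings.dat read below:
#http://www.grouplens.org/system/files/ml-1m.zip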
import pandas as pd
from io import StringIO 
# Both files are header-less and use the two-character separator '::'.
# A multi-character separator is only supported by the python parser engine, so
# request it explicitly instead of relying on pandas' warning-and-fallback.
# movies.dat contains accented titles that are not valid UTF-8; latin-1 decodes them.
movies = pd.read_csv('data/movies.dat', sep='::', engine='python',
                     encoding='latin-1', names=['id', 'title', 'genre'])
ratings = pd.read_csv('data/ratings.dat', sep='::', engine='python',
                      names=['user', 'item', 'rating', 'timestamp'])



In [2]:
# Preview the first rows of the movies table (id, title, genre).
movies.head()

Unnamed: 0,id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Preview the first rows of the ratings table (user, item, rating, timestamp).
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Total number of rating rows loaded.
len(ratings)

1000209

## User-item matrix

Build a sparse matrix with one row per user id and one column per item id, holding the ratings.

In [5]:
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix

# Pull out the three columns that define the sparse matrix:
# row index (user id), column index (item id) and the stored value (rating).
rows, cols, data = (ratings[name] for name in ('user', 'item', 'rating'))

In [6]:
# Sparse user x item rating matrix: entry (u, i) holds the rating user u gave item i.
# Ids are used directly as indices, hence the "+ 1" on each dimension; ids that
# never occur simply remain all-zero rows/columns.
# Series.max() is vectorised, whereas the builtin max() walks the Series element
# by element in Python.
R = coo_matrix((data, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
# CSR copy of the same matrix.
X = R.tocsr()

In [7]:
# reduce dimensionality
from sklearn.decomposition import TruncatedSVD
# Factorise the item x user matrix (the transpose of R) so that every row of X
# becomes a 100-dimensional embedding of one item id.
# * Reads from R rather than from X, so re-running this cell always starts from
#   the rating matrix instead of from this cell's own previous output.
# * random_state is fixed because TruncatedSVD's default solver is randomised;
#   without it the embeddings (and the distances below) change on every run.
X = TruncatedSVD(n_components=100, random_state=0).fit_transform(R.T.tocsr())

In [8]:
# nearest neighbors
from sklearn.neighbors import NearestNeighbors
# NOTE(review): wildcard import; nothing in this cell uses a name from it --
# candidate for removal once it is confirmed that no other cell relies on it.
from sklearn.metrics.pairwise import *
N = 10
# n_neighbors is passed by keyword: current scikit-learn releases make the
# constructor arguments keyword-only, so the positional form raises TypeError.
# leaf_size is omitted because it only applies to the tree-based algorithms,
# not to algorithm='brute'.
knn = NearestNeighbors(n_neighbors=N, algorithm='brute', metric='euclidean').fit(X)
# For every row of X: the N closest rows (the row itself included) and their distances.
distances, indices = knn.kneighbors(X, n_neighbors=N)

# Flatten the (n_rows, N) result into a long table with one row per (i1, i2) pair.
dim = indices.shape
# Integer division repeats each source row index N times: 0,0,...,0,1,1,...
myidx = pd.Series(range(dim[0]*dim[1]))//dim[1]
df_dist = pd.DataFrame(dict(i1=myidx, i2=indices.flatten(), distance=distances.flatten()))
df_dist.head(10)

Unnamed: 0,distance,i1,i2
0,0.0,0,0
1,2.632721e-13,0,1808
2,2.632721e-13,0,1314
3,2.632721e-13,0,1309
4,2.632721e-13,0,1308
5,2.632721e-13,0,1800
6,2.632721e-13,0,2980
7,2.632721e-13,0,1802
8,2.632721e-13,0,1789
9,2.632721e-13,0,1803


In [9]:
# Attach movie metadata to both ends of every (i1, i2) neighbour pair.
# merge() defaults to an inner join, so index values with no matching movie id
# are dropped. Columns coming from the first merge get the suffix _x, those
# from the second merge the suffix _y.
df_merge = df_dist.merge(movies, left_on='i1', right_on='id')
df_merge = df_merge.merge(movies, left_on='i2', right_on='id')
# Self-pairs (i1 == i2, distance 0) are kept on purpose; to drop them use
# df_merge.query("i1 != i2") before selecting the columns.
df = df_merge[['id_x', 'title_x', 'id_y', 'title_y', 'distance']]
df.head(10)


Unnamed: 0,id_x,title_x,id_y,title_y,distance
0,1,Toy Story (1995),1,Toy Story (1995),0.0
1,3114,Toy Story 2 (1999),1,Toy Story (1995),96.223973
2,1,Toy Story (1995),3114,Toy Story 2 (1999),96.223973
3,2355,"Bug's Life, A (1998)",3114,Toy Story 2 (1999),98.199961
4,3114,Toy Story 2 (1999),3114,Toy Story 2 (1999),0.0
5,1,Toy Story (1995),588,Aladdin (1992),107.203677
6,34,Babe (1995),588,Aladdin (1992),122.689106
7,364,"Lion King, The (1994)",588,Aladdin (1992),46.539846
8,588,Aladdin (1992),588,Aladdin (1992),0.0
9,595,Beauty and the Beast (1991),588,Aladdin (1992),45.020271


## Results

In [10]:
# Look up the ids of all movies whose title contains "Star Wars".
movies.loc[movies['title'].str.contains("Star Wars")]

Unnamed: 0,id,title,genre
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
2559,2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi


In [11]:
# Pairs whose id_x is movie 260 (Star Wars: Episode IV), closest first.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df[df.id_x == 260].sort_values("distance")

Unnamed: 0,id_x,title_x,id_y,title_y,distance
18011,260,Star Wars: Episode IV - A New Hope (1977),260,Star Wars: Episode IV - A New Hope (1977),0.0
18015,260,Star Wars: Episode IV - A New Hope (1977),1196,Star Wars: Episode V - The Empire Strikes Back...,88.397677
18019,260,Star Wars: Episode IV - A New Hope (1977),1210,Star Wars: Episode VI - Return of the Jedi (1983),123.811743
18023,260,Star Wars: Episode IV - A New Hope (1977),1198,Raiders of the Lost Ark (1981),135.430589
18027,260,Star Wars: Episode IV - A New Hope (1977),1214,Alien (1979),142.879717
18033,260,Star Wars: Episode IV - A New Hope (1977),1240,"Terminator, The (1984)",144.48262
18044,260,Star Wars: Episode IV - A New Hope (1977),2628,Star Wars: Episode I - The Phantom Menace (1999),147.738184
18050,260,Star Wars: Episode IV - A New Hope (1977),2571,"Matrix, The (1999)",152.015048
18055,260,Star Wars: Episode IV - A New Hope (1977),1270,Back to the Future (1985),155.930576
18060,260,Star Wars: Episode IV - A New Hope (1977),1200,Aliens (1986),156.563917
