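# MovieLens
Finding similar movies from MovieLens ratings: build a user-item rating matrix, reduce it with truncated SVD, and look up each movie's nearest neighbors.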

### Load data

In [None]:
# Data source: MovieLens 1M, http://www.grouplens.org/system/files/ml-1m.zip (movies.dat and ratings.dat are read from data/)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas / scikit-learn; consider narrowing it.
warnings.filterwarnings('ignore')
import pandas as pd
from io import StringIO

# Both files are '::'-separated and have no header row, so column names are
# supplied here. A multi-character separator is only supported by the python
# parsing engine; request it explicitly instead of relying on the silent
# fallback from the default C engine.
movies = pd.read_csv(
    'data/movies.dat',
    sep='::',
    engine='python',
    # presumably ISO-8859-1 (the ml-1m titles contain accented characters);
    # the default UTF-8 decoding would fail on such bytes -- confirm against the file.
    encoding='latin-1',
    names=['id', 'title', 'genre'],
)
ratings = pd.read_csv(
    'data/ratings.dat',
    sep='::',
    engine='python',
    names=['user', 'item', 'rating', 'timestamp'],
)

In [None]:
# Preview the first rows of the movies table (columns: id, title, genre).
movies.head()

In [None]:
# Preview the first rows of the ratings table (columns: user, item, rating, timestamp).
ratings.head()

In [None]:
# Number of rating rows that were loaded.
len(ratings)

In [None]:
# Summary statistics (pandas describe) for the ratings table.
ratings.describe()

In [None]:
%matplotlib inline
# Histogram of the rating column using 10 bins; the trailing ';' keeps the
# notebook from echoing the returned plot object.
ratings.rating.hist(bins=10);

## User-item matrix

In [None]:
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix

# Coordinate triplets for the sparse matrix:
# user id -> row index, item id -> column index, rating -> stored value.
rows, cols, data = ratings['user'], ratings['item'], ratings['rating']

In [None]:
# Sparse ratings matrix with one row per user id and one column per item id.
# The raw ids are used directly as indices, so each axis needs (largest id + 1)
# entries. Series.max() is vectorised, whereas the builtin max() would iterate
# over the Series element by element in Python.
R = coo_matrix((data, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
# CSR form of the same matrix for the downstream transpose / decomposition.
X = R.tocsr()

In [None]:
# Dimensionality reduction: transpose so that items become the rows, then
# project every item onto 100 SVD components.
# NOTE: X is rebound here (user-item matrix -> item embedding), so re-running
# this cell on its own feeds the already-reduced matrix back in.
from sklearn.decomposition import TruncatedSVD
X = TruncatedSVD(n_components=100).fit_transform(X.transpose())

## Nearest neighbors

In [None]:
# Nearest neighbors: for every row of X find its N closest rows (brute-force
# search with Euclidean distance).
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import *
N = 10
# n_neighbors is passed by keyword: scikit-learn releases that make the
# constructor parameters keyword-only reject the positional form, while the
# keyword form works on old and new releases alike.
# NOTE(review): leaf_size is documented as a BallTree/KDTree setting, so it
# presumably has no effect with algorithm='brute' -- confirm before removing.
knn = NearestNeighbors(n_neighbors=N, algorithm='brute', metric='euclidean', leaf_size=1000).fit(X)
# X is queried against the index built from X itself, so a row can show up
# as its own neighbor.
distances, indices = knn.kneighbors(X, n_neighbors=N)

# Reshape the two (n_rows, N) result arrays into long form: one record per
# (query row, neighbor row) pair.
dim = indices.shape
# Integer-dividing a running counter by N repeats each query row index N
# times, which lines up with the row-major order produced by flatten().
myidx = pd.Series(range(dim[0]*dim[1]))//dim[1]
df_dist = pd.DataFrame(dict(i1=myidx, i2=indices.flatten(), distance=distances.flatten()))
df_dist.head(10)

In [None]:
# Attach movie metadata to both ends of every (i1, i2) pair. Item ids were
# used as column indices of the ratings matrix, which was transposed before
# the SVD, so row index i of X corresponds to movies.id == i.
df_merge = df_dist.merge(movies, left_on='i1', right_on='id')
# The second merge collides on id/title/genre, so pandas applies its default
# suffixes: *_x describes the query movie (i1), *_y the neighbor (i2).
df_merge = df_merge.merge(movies, left_on='i2', right_on='id')
# Self-matches (i1 == i2) are kept on purpose here; filter them with
# df_merge.query("i1 != i2") if a movie should not be listed as its own neighbor.
df = df_merge[['id_x', 'title_x', 'id_y', 'title_y', 'distance']]

## Results

In [None]:
# List every movie whose title contains "Star Wars" to find its id.
movies.loc[movies['title'].str.contains("Star Wars")]

In [None]:
# Neighbors of movie id 260, closest first (260 is presumably one of the
# "Star Wars" ids returned by the title search -- confirm).
# DataFrame.sort() no longer exists in pandas; sort_values() is the supported
# replacement and orders ascending by default, as sort() did.
df[df.id_x == 260].sort_values("distance")