In [84]:
import os
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [85]:
os.chdir(path="/Users/velo1/SynologyDrive/GIT_syno/data/MovieLens _ml-latest-small")
os.getcwd()
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

In [86]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [87]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [88]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [89]:
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [90]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [91]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [92]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

In [93]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [94]:
res

(array([[0.42697878, 0.53455477, 0.53455477, 0.53455477, 0.53455477,
         0.53455477, 0.53455477]]),
 array([[6982, 3524, 2453, 3611,  912, 2784, 3819]]))

In [95]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6982,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
3524,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy
2453,3052,Dogma (1999),Adventure|Comedy|Fantasy
3611,4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
912,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
2784,3489,Hook (1991),Adventure|Comedy|Fantasy
3819,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy


In [96]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [97]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [98]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [99]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,501.0,Pixar,1292956000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,,,
4,5,Father of the Bride Part II (1995),Comedy,431.0,steve martin,1140455000.0


In [100]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,501.0,Pixar,1292956000.0


In [101]:
movies_with_tags.tag.unique()

array(['Pixar', nan, 'steve martin', 'biblical', 'crime', 'dark',
       'disturbing', 'greed', 'horror', 'serial killer', 'violent',
       'psychology', 'twist ending', 'thriller', 'adam sandler',
       'cult film', 'Martin Scorsese', 'social commentary', 'Ron Howard',
       'tom hanks', 'Ei muista', 'hugh grant', 'ethan hawke', 'holes90s',
       'aging', 'Ang Lee', 'cooking', 'food', 'relationships', 'meg ryan',
       'school', 'stylized', 'surreal', 'surrealism', 'visceral',
       'cult classic', 'Science Fiction', 'nerdy', 'critically acclaimed',
       'awesome', 'awesome soundtrack', 'jedi', 'space adventure',
       'coming of age', "space epic, science fiction, hero's journey",
       'classic', 'sci-fi', 'supernatural powers', 'George Lucas',
       'starwars', 'space', 'classic sci-fi', 'series',
       'imaginary world, characters, story, philosophical', 'script',
       'action', 'Syfy', 'assassin', 'hit men', 'intense', 'r:violence',
       'tarantino', 'dark comedy'

In [102]:
movies_with_tags.dropna(inplace=True)

In [103]:
movies_with_tags.title.unique().shape

(689,)

In [104]:
tag_strings = []
movies = []

for movie, group in (movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

In [105]:
tag_strings[:5]

['cute', 'toplist10', 'getdvd', 'getdvd', 'toplist13']

In [106]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [107]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [108]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

In [109]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

366


In [110]:
tag_strings[822]

IndexError: list index out of range

In [None]:
test = change_string('pixar | pixar | fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
res

(array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[4178, 4175, 4200, 4195, 4189, 4188, 4187, 4203, 4163, 4162]]))

In [83]:
for i in res[1][0]:
    print(movies[i])

IndexError: list index out of range