In [1]:
import pandas as pd
import numpy as np
from py2neo import Graph
import os

import datetime
import json
from io import BytesIO, StringIO
from csv import writer 
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [36]:
graph = Graph("bolt://host.docker.internal:7687", auth=("neo4j", "admin"))

In [3]:
movies_url="http://167.71.3.40/movies_metadata.csv"
md =pd.read_csv(movies_url)

md = md.loc[md['id'].str.isalnum()]
md['id']=md['id'].astype('int')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
link_small_url="http://167.71.3.40/links_small.csv"
link_small= pd.read_csv(link_small_url)
links_small = link_small[link_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [5]:
ratings_url="http://167.71.3.40/ratings.csv"
ratings= pd.read_csv(ratings_url)

In [6]:
# talan torolni kell ezt!!!
md = md[md['id'].isin(links_small)]

In [7]:
users_df = pd.DataFrame(ratings['userId'].unique(), columns=['userId'])

In [8]:
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"]
genres_df = pd.DataFrame(genres, columns=['genres'])

In [9]:
users_movies_df = ratings.drop('timestamp', axis = 1)
users_movies_df['rating']=users_movies_df['rating']*2

In [10]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
md = md.drop(['adult','belongs_to_collection','budget','homepage','original_language','original_title','revenue','runtime','spoken_languages','poster_path','production_companies','release_date','production_countries','video','overview','tagline','popularity'], axis = 1)

In [11]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.6)

md['vote_count'] = md['vote_count'].astype('int')
md['vote_average'] = md['vote_average'].astype('int')

md['wr'] = md.apply(weighted_rating, axis=1)

In [12]:
def get_movie_genres(movieId):
    movie = md[md['id']==movieId]
    tempgenres = [','.join(map(str, l)) for l in movie['genres']]
    df = pd.DataFrame([b for a in [i.split(',') for i in tempgenres] for b in a], columns=['genres'])
    df.insert(loc=0, column='movieId', value=movieId)
    return df


In [13]:
# output = StringIO()
# csv_writer = writer(output)
# csv_writer.writerow(['movieId','genres'])
# 
# for x in md['id'].tolist():
#     for row in get_movie_genres(x).iterrows():
#         csv_writer.writerow(row[1])
# 
# output.seek(0) # we need to get back to the start of the BytesIO
# movies_genres = pd.read_csv(output)
# output.flush()
# output.close()

In [14]:
################################   Keywords, Credit ###########################################

keywords_url="http://167.71.3.40/keywords.csv"
keywords= pd.read_csv(keywords_url)


credits_url="http://167.71.3.40/credits.csv"
credits= pd.read_csv(credits_url)

In [15]:

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [16]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [17]:
md['cast'] = md['cast'].apply(literal_eval)
md['crew'] = md['crew'].apply(literal_eval)
md['keywords'] = md['keywords'].apply(literal_eval)
md['cast_size'] = md['cast'].apply(lambda x: len(x))
md['crew_size'] = md['crew'].apply(lambda x: len(x))

In [18]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [19]:
md['director'] = md['crew'].apply(get_director)


In [20]:
md['cast'] = md['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['cast'] = md['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
md['keywords'] = md['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [21]:
md['cast'] = md['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
md['director'] = md['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
md['director'] = md['director'].apply(lambda x: [x,x, x])

In [22]:
s = md.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

s = s.value_counts()
s = s[s > 1]



  """Entry point for launching an IPython kernel.


In [23]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [24]:
md['keywords'] = md['keywords'].apply(filter_keywords)
md['keywords'] = md['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [25]:
md['soup'] = md ['keywords']+md['cast'] + md['director'] + md['genres']
md['soup'] = md['soup'].apply(lambda x: ' '.join(x))
#md['soup']= md['soup'].str.decode('iso-8859-1').str.encode('utf8')

In [26]:
tf = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = tf.fit_transform(md['soup'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [27]:
md.set_index(md['id'],inplace=True)
cols = md.index.values
inx = md.index
movies_sim = pd.DataFrame(cosine_sim, columns=cols, index=inx)
movies_sim.head()

Unnamed: 0_level_0,862,8844,15602,31357,11862,949,11860,45325,9091,710,...,373348,338766,390734,314420,390989,159550,392572,402672,315011,391698
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,1.0,0.024419,0.02739,0.025777,0.024419,0.0,0.026547,0.029348,0.0,0.0,...,0.0,0.0,0.023256,0.0,0.020563,0.0305,0.0,0.0,0.0,0.0
8844,0.024419,1.0,0.0,0.0,0.0,0.0,0.0,0.061633,0.027067,0.021592,...,0.0,0.0,0.048839,0.0,0.0,0.0,0.0,0.029735,0.025008,0.0
15602,0.02739,0.0,1.0,0.060718,0.02876,0.0,0.062531,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.024218,0.0,0.034565,0.033352,0.0,0.0
31357,0.025777,0.0,0.060718,1.0,0.027067,0.019263,0.058849,0.03253,0.0,0.0,...,0.0,0.023669,0.025777,0.033806,0.022792,0.033806,0.03253,0.062776,0.026398,0.0
11862,0.024419,0.0,0.02876,0.027067,1.0,0.0,0.027875,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.021592,0.0,0.0,0.0,0.0,0.0


In [28]:
def get_similar(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='id', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    return df

In [29]:
# output = StringIO()
# csv_writer = writer(output)
# 
# csv_writer.writerow(['id','sim_movieId','relevance'])
# for x in movies_sim.index.tolist():
#     for row in get_similar(x).iterrows():
#         csv_writer.writerow(row[1])
# 
# output.seek(0) # we need to get back to the start of the BytesIO
# movies_similarity = pd.read_csv(output)
# output.flush()
# output.close()

In [30]:
md = md.drop(['genres','vote_average','vote_count','cast','crew','keywords','cast_size','crew_size','director'], axis = 1)

md.head()

Unnamed: 0_level_0,id,imdb_id,status,title,year,wr,soup
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
862,862,tt0114709,Released,Toy Story,1995,6.969857,jealousy toy boy friendship friends rivalry bo...
8844,8844,tt0113497,Released,Jumanji,1995,5.994978,boardgame disappearance basedonchildren'sbook ...
15602,15602,tt0113228,Released,Grumpier Old Men,1995,5.947792,fishing bestfriend duringcreditsstinger walter...
31357,31357,tt0114885,Released,Waiting to Exhale,1995,5.931771,basedonnovel interracialrelationship singlemot...
11862,11862,tt0113041,Released,Father of the Bride Part II,1995,5.433246,baby midlifecrisis confidence aging daughter m...


In [31]:
def save_to_neo4j(statement,create_param_dict,df):
    tx = graph.begin(autocommit=True)
    params = []
    # dataframe is indexed with numerical indexes
    for index, row in df.iterrows():
        params_dict = create_param_dict(row)
        params.append(params_dict)
        if index % 20000 == 0 and index > 0:
            tx.evaluate(statement, parameters = {"parameters" : params})
            tx = graph.begin(autocommit=True)
            params = []
    print(params)
    tx.evaluate(statement, parameters = {"parameters" : params})
    
    

In [32]:
# statement = """
# UNWIND $parameters as row
# with row where row.id is not null
# MERGE (:Movies {id: row.id, imdb_id : row.imdb_id, status : row.status ,title: row.title});
# """


# def create_movies_param_dict_by_data_type(current_row):
#       return  {
#             'id': current_row['id'],
#             'imdb_id':current_row['imdb_id'],
#             'status':current_row['status'],
#             'title':current_row['title'],
#             'year':current_row['year'],
#             'wr':current_row['wr'],
#             'soup':current_row['soup']
#         }

# save_to_neo4j(statement,create_movies_param_dict_by_data_type,md)

In [37]:
statement = """
UNWIND $parameters as row
MERGE (:Genres {genres: row.genres});
"""


def create_movies_param_dict_by_data_type(current_row):
      return  {
            'genres': current_row['genres']
        }

save_to_neo4j(statement,create_movies_param_dict_by_data_type,genres_df)

[{'genres': 'Action'}, {'genres': 'Adventure'}, {'genres': 'Animation'}, {'genres': 'Children'}, {'genres': 'Comedy'}, {'genres': 'Crime'}, {'genres': 'Documentary'}, {'genres': 'Drama'}, {'genres': 'Fantasy'}, {'genres': 'Film-Noir'}, {'genres': 'Horror'}, {'genres': 'Musical'}, {'genres': 'Mystery'}, {'genres': 'Romance'}, {'genres': 'Sci-Fi'}, {'genres': 'Thriller'}, {'genres': 'War'}, {'genres': 'Western'}, {'genres': '(no genres listed)'}]


AddressError: Cannot resolve address ('neo_db', 7687)

In [34]:
# for index, row in md.iterrows():
#     article = graph.merge_one("Movies", "id", row['id'])
#     article.properties["title"] = row['title']
#     article.push()

In [35]:
#TODO save to DB!
#done genres_df.head()
users_df.head()
users_movies_df.head()
md.head()
movies_genres.head()
movies_similarity.head()





NameError: name 'movies_genres' is not defined