In [25]:
from functions import connections, initialize

In [26]:
NEO4J_DRIVER = connections.NEO4J_DRIVER
# verify connection
if NEO4J_DRIVER is None:
    print("Error connecting to Neo4j")

In [27]:
initialize.start()

2025-01-23 12:30:55,265 - functions.initialize - INFO - Initializing the application
2025-01-23 12:30:55,287 - functions.initialize - INFO - Loading pre-created embeddings
2025-01-23 12:30:55,654 - functions.data_preprocess - INFO - Embeddings already loaded and indexes created.
2025-01-23 12:30:55,658 - functions.initialize - INFO - Dropping missing data
2025-01-23 12:30:55,659 - functions.data_preprocess - DEBUG - Running query: plot_drop
2025-01-23 12:30:56,416 - functions.data_preprocess - DEBUG - Running query: bio_drop
2025-01-23 12:30:57,198 - functions.data_preprocess - DEBUG - Running query: poster_drop
2025-01-23 12:30:57,389 - functions.data_preprocess - INFO - Nodes with missing or incorrect embeddings dropped successfully.


Schema of the Database:

![schema](imgs/graph.png)

In [28]:
def run_cypher(query, parameters):
    with NEO4J_DRIVER.session() as session:
        result = session.run(query, parameters)
        data = result.data()
        return data

## Getting all Movies

In [29]:
def get_all_nodes_id():
    query = "MATCH (mov:Movie) RETURN elementid(mov) as id"
    result = run_cypher(query, {})
    return [record["id"] for record in result]

movie_ids = get_all_nodes_id()
print(movie_ids[-1])

4:31cc548a-e875-40d2-9ae3-7884192f4683:9144


## Neo4j Movie Recommendations

### Naive Movie Similarity (Basic Graph Traversal)

In [30]:
def similar_movies_genre(id):
    '''
    Finding Movies in the same genre as the given movie, no ranking methods used
    '''
    query = """
    MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)<-[:IN_GENRE]-(rec:Movie)
    WHERE elementId(m) = $id
    WITH rec
    RETURN rec.title AS recommendation
    LIMIT 10
    """
    result = run_cypher(query, {"id": id})
    for record in result:
        print(record["recommendation"])

similar_movies_genre(movie_ids[-1])

Look of Silence, The
Dog, The
1971
Nixon by Nixon: In His Own Words
Life Itself
Finding Vivian Maier
Internet's Own Boy: The Story of Aaron Swartz, The
Battered Bastards of Baseball, The
Jodorowsky's Dune
Mistaken for Strangers


### Using Plot Embeddings (OpenAI text-embedding-ada-002)

In [31]:
def drop_no_plots():
    query = """
    MATCH (m:Movie)
    WHERE m.plot IS NULL
    DETACH DELETE m
    """
    run_cypher(query, {})

In [32]:
def movie_node_proj_graph():
    '''
    Drops existing projection graph if exists, and creates a new graph with Movie nodes and plotEmbedding property
    '''
    drop_query = """
    CALL gds.graph.drop('movieGenreGraph', false) YIELD graphName;
    """
    run_cypher(drop_query, {})
    # query = """
    # CALL gds.graph.project(
    # 'movieGenreGraph',              
    # {
    #   Movie: {
    #     properties: ['plotEmbedding']
    #   },
    #   Genre: {}
    #   },               
    #   {
    #   IN_GENRE: {
    #     type: 'IN_GENRE',
    #     orientation: 'NATURAL'
    #   }
    # }
    # )
    # """
    # result = run_cypher(query, {})
    # return result

In [33]:
movie_node_projection = movie_node_proj_graph()
print(movie_node_projection)

None


# Content Filtering Methods

In [38]:
def plot_embedding_similarity_genre(id):
    '''
    Leveraging plot embeddings to find similar movies in the same genre, ranking using cosine similarity score
    Uses the moviePlots index to find similar movies
    '''
    query = """
    MATCH (source:Movie)
    WHERE elementId(source) = $id
    WITH source, source.plotEmbedding AS sourceVec
    MATCH (target:Movie)-[:IN_GENRE]->(g:Genre)<-[:IN_GENRE]-(source)
    WHERE target.plotEmbedding IS NOT NULL
    WITH source, target, g, gds.similarity.cosine(sourceVec, target.plotEmbedding) AS similarity
    RETURN DISTINCT elementId(source) as source_id, source, elementId(target) as target_id, target, similarity
    ORDER BY similarity DESC
    LIMIT 20
    """
    result = run_cypher(query, {'id': id})
    return result

similar_movies = plot_embedding_similarity_genre('4:31cc548a-e875-40d2-9ae3-7884192f4683:12')
for record in similar_movies:
    # print(record["target_id"], record["target"]["title"], record["similarity"])
    print(record)

{'source_id': '4:31cc548a-e875-40d2-9ae3-7884192f4683:12', 'source': {'languages': ['English', ' Spanish'], 'plotEmbedding': [-0.009269533678889275, -0.015333653427660465, 0.004654391668736935, -0.05583961680531502, -0.02812914177775383, 0.019742732867598534, 0.01308332197368145, -0.012808572500944138, -0.010191908106207848, -0.022045398131012917, 0.015569153241813183, -0.0024351333267986774, -0.006986493710428476, 0.002827632939442992, -0.0048113917000591755, -0.012893613427877426, 0.027605809271335602, -0.002410602057352662, 0.02935897372663021, -0.01721765100955963, -0.025106895714998245, 0.009563907980918884, -0.013802904635667801, 0.01453557051718235, 0.00501745380461216, -0.01692981831729412, 0.007424785289913416, -0.021208064630627632, 0.011945072561502457, -0.003404934424906969, 0.014627153985202312, 0.009544283151626587, -0.009419991634786129, -0.016380319371819496, -0.029908474534749985, -0.00032810517586767673, -0.010878781788051128, -0.008713492192327976, 0.0197950657457113

# Collaborative Filtering Methods

In [35]:
def movie_user_recommendations_singular(id, rating):
    '''
    id: movie node id
    Idea is, if a user has rated a movie highly (5.0), then find similar users who have rated the same movie highly, and recommend movies that they have rated highly
    If a user has rated a movie poorly (0.5), then find similar users who have rated the same movie poorly, and recommend movies that they have rated highly
    '''
    # create a graph projection of Movie and User nodes, and Rated relationships
    # For the rated relationships, we will use the rating as the weight
    # The rating can be accessed by the rating property of the Rated relationship
    # drop_query = """
    # CALL gds.graph.drop('movieUserGraph', false) YIELD graphName;
    # """
    # run_cypher(drop_query, {})
    # query = """
    # CALL gds.graph.project(
    # 'movieUserGraph',              
    # {
    #   Movie: {
    #     properties: ['title']
    #   },
    #   User: {
    #     properties: ['userId']
    #   }
    #   },               
    #   {
    #   RATED: {
    #     type: 'RATED',
    #     orientation: 'NATURAL',
    #     properties: 'rating'
    #   }
    # }
    # )
    # """
    # result = run_cypher(query, {})
    # Find all the users who have rated the movie, and the rating they gave
    # Out of the users who have rated the movie, find the users who have rated the movie similarly to the given rating
    # Find the movies that these users have rated highly in the same genre
    query = """
    MATCH (m:Movie)
    WHERE elementId(m) = $id
    WITH m
    MATCH (m)<-[r:RATED]-(u:User)
    WHERE r.rating = $rating
    WITH u
    MATCH (u)-[r:RATED]->(rec:Movie)
    WHERE r.rating = 5.0
    WITH rec, COLLECT(elementId(u)) AS users
    RETURN DISTINCT elementId(rec) AS rec_id, rec AS recommendation, users
    ORDER BY rec.imdbVotes DESC
    LIMIT 20
    """
    result = run_cypher(query, {'id': id, 'rating': rating})
    return result

movie_user_recommendations = movie_user_recommendations_singular('4:31cc548a-e875-40d2-9ae3-7884192f4683:12', 5.0)
for record in movie_user_recommendations:
    print(record["recommendation"]["title"])

Shawshank Redemption, The
Fight Club
Pulp Fiction
Lord of the Rings: The Fellowship of the Ring, The
Forrest Gump
Matrix, The
Lord of the Rings: The Return of the King, The
Godfather, The
Lord of the Rings: The Two Towers, The
Seven (a.k.a. Se7en)
Gladiator
Star Wars: Episode IV - A New Hope
Silence of the Lambs, The
Saving Private Ryan
Schindler's List
Memento
Star Wars: Episode V - The Empire Strikes Back
American Beauty
Titanic
Godfather: Part II, The


In [36]:
def movie_user_recommendations_count_singular(id, rating)
    query = """
    MATCH (m:Movie)
    WHERE elementId(m) = $id
    WITH m
    MATCH (m)<-[r:RATED]-(u:User)  
    WITH u, m
    MATCH (u)-[r:RATED]->(rec:Movie)  
    WHERE elementID(m) <> elementID(rec)
    WITH rec, COUNT(u) AS user_count
    RETURN elementId(rec) AS rec_id, rec AS recommendation, user_count
    ORDER BY user_count DESC
    LIMIT 20
    """
    result = run_cypher(query, {'id': id, 'rating': rating})
    return result

movie_user_recommendations = movie_user_recommendations_count_singular('4:31cc548a-e875-40d2-9ae3-7884192f4683:12', 5.0)
for record in movie_user_recommendations:
    print(record["recommendation"]["title"])

SyntaxError: expected ':' (1235545919.py, line 1)