In [1]:
from neo4j import GraphDatabase
import pandas as pd

In [2]:
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "neo4j://localhost"
AUTH = ("neo4j", "12345678")

In [3]:
dataset = pd.read_csv('data/imdb_top_5000_tv_shows.csv')
dataset.head()

Unnamed: 0,tconst,primaryTitle,startYear,endYear,rank,averageRating,numVotes,directors,writers,genres,IMDbLink,Title_IMDb_Link
0,tt0903747,Breaking Bad,2008,2013.0,1,9.5,2312557,"Michelle MacLaren, Adam Bernstein, Vince Gilli...","Vince Gilligan, Peter Gould, George Mastras, S...","Crime, Drama, Thriller","<a href=""https://www.imdb.com/title/tt0903747""...","<a href=""https://www.imdb.com/title/tt0903747""..."
1,tt0185906,Band of Brothers,2001,2001.0,2,9.4,559148,"David Frankel, Mikael Salomon, Tom Hanks, Davi...","Stephen Ambrose, Erik Bork, E. Max Frye, Tom H...","Action, Drama, History","<a href=""https://www.imdb.com/title/tt0185906""...","<a href=""https://www.imdb.com/title/tt0185906""..."
2,tt7366338,Chernobyl,2019,2019.0,3,9.3,942269,Johan Renck,Craig Mazin,"Drama, History, Thriller","<a href=""https://www.imdb.com/title/tt7366338""...","<a href=""https://www.imdb.com/title/tt7366338""..."
3,tt0795176,Planet Earth,2006,2006.0,4,9.4,226894,"Alastair Fothergill, Mark Linfield","David Attenborough, Vanessa Berlowitz, Alastai...","Documentary, Family","<a href=""https://www.imdb.com/title/tt0795176""...","<a href=""https://www.imdb.com/title/tt0795176""..."
4,tt5491994,Planet Earth II,2016,2016.0,5,9.4,165965,"Justin Anderson, Ed Charles, Fredi Devas, Chad...",Elizabeth White,Documentary,"<a href=""https://www.imdb.com/title/tt5491994""...","<a href=""https://www.imdb.com/title/tt5491994""..."


Looking at the data we can see 4 different **nodes**: 
- Serie
- Person
- Genre  
  
And also we can see different **reletionships**:
  
- Person -[directed]-> Serie
- Person -[wrote]-> Serie
- serie -[has_genre]-> Genre


In [5]:
template_Person = """CREATE (:Person {name:"%s"});\n"""
template_Serie = """CREATE (:Serie {name:"%s",startYear:"%s",endYear:"%s",avgRating:"%s"});\n"""
template_Genre = """CREATE (:Genre {name:"%s"});\n"""

template_relationship_director = """MATCH (s:Serie),(p:Person) where s.name="%s" and s.startYear="%s" and s.endYear="%s" and p.name="%s" CREATE (p)-[:Directed]->(s);\n"""
template_relationship_writer = """MATCH (s:Serie),(p:Person) where s.name="%s" and s.startYear="%s" and s.endYear="%s" and p.name="%s" CREATE (p)-[:Writer]->(s);\n"""
template_relationship_genre = """MATCH (s:Serie),(g:Genre) where s.name="%s" and s.startYear="%s" and s.endYear="%s" and g.name="%s" CREATE (s)-[:HasGenre]->(g);\n"""

setPeople:set = set()
setGenres:set = set()
setSerie:set = set()

In [6]:
with open('movies.cql','w', encoding='utf-8') as file:
    for i,row in dataset.iterrows():
        serie = row['primaryTitle'].replace('\"','')
        startYear = row['startYear']
        endYear = row['endYear']
        avgRating = row['averageRating']
        directors = row['directors'].split(', ')
        writers = row['writers'].split(', ')
        genres = row['genres'].split(', ')
        
        # Create series node
        if serie not in setSerie:
            createSeries = template_Serie % (serie,startYear,endYear,avgRating)
            setSerie.add(serie)
            file.write(createSeries)

        for director in directors:
            # Create node if it doesn't exist
            if director not in setPeople:
                createPerson = template_Person % director
                setPeople.add(director)
                file.write(createPerson)
            # Create relationship
            createDirectorRelationship = template_relationship_director % (serie,startYear,endYear,director)
            file.write(createDirectorRelationship)


        for writer in writers:
            # Create node if doesn't exist
            if writer not in setPeople:
                createPerson = template_Person % writer
                setPeople.add(writer)
                file.write(createPerson)
            # Create relationship
            createWriterRelationship = template_relationship_writer % (serie,startYear,endYear,writer)
            file.write(createWriterRelationship)
        
        for genre in genres:
            # Create node if it doesn't exist
            if genre not in setPeople:
                createGenre = template_Genre% genre
                setPeople.add(genre)
                file.write(createGenre)
            # Create relationship
            createGenreRelationship = template_relationship_genre % (serie,startYear,endYear,genre)
            file.write(createGenreRelationship)

In [4]:
with open('movies.cql','r',encoding='utf-8') as file:
    text = file.read()
text = text.split('\n')

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    session = driver.session() 
    i = 0
    while i < len(text):
        section = text[i:]
        print(i)

        with session.begin_transaction() as tx:
            for statement in section:
                if statement != '':
                    tx.run(statement)
                i += 1
                if i % 5000 == 0 or i == len(text)-1:
                    tx.commit()
                    break


0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
151975
