# Creating Neo4j Import CSV Files from Data

In [20]:
pip install neo4j

Note: you may need to restart the kernel to use updated packages.


In [21]:
import csv
import os
import random

import pandas as pd

## Queries

In [22]:
from neo4j import GraphDatabase
class Neo4jConnection:
    """Small wrapper around a Neo4j driver for running ad-hoc Cypher queries.

    Failures while creating the driver or while running a query are printed
    instead of raised, so callers must be prepared for ``query`` to return
    ``None``.
    """

    def __init__(self, uri, user, pwd):
        """Create a driver for ``uri`` authenticating with ``user``/``pwd``.

        If the driver cannot be created the error is printed and the
        instance is left without a driver; a later ``query`` call then fails
        its "Driver not initialized!" assertion.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """Run a Cypher query and return all of its records as a list.

        Args:
            query: Cypher text to execute.
            db: Optional database name; when ``None`` the driver's default
                session is used.
            parameters: Optional dict of Cypher parameters (referenced in
                the query as ``$name``). Prefer this over formatting values
                into the query string.

        Returns:
            A list with the records produced by the query, or ``None`` when
            the query failed (the error is printed, not raised).
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Materialise the result before the session is closed.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [23]:
# Connection settings come from the environment so real credentials never
# have to be committed with the notebook. The fallbacks are the previous
# hard-coded development values.
conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://graph_db:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD", "password"),
)

In [24]:
# Read the edge list. Each non-blank line holds two whitespace-separated
# user ids; the pair is kept in file order in `follows` and both ids are
# collected into `users`.
with open('twitter_combined.txt', 'r') as follows_file:
    lines = follows_file.readlines()

follows = []
users = set()
for line in lines:
    # split() with no argument also drops the trailing "\n" (and "\r" for
    # CRLF files), so no separate newline stripping is needed.
    fields = line.split()
    if not fields:
        # A blank line (e.g. at the end of the file) carries no edge.
        continue
    id1, id2 = fields
    users.add(id1)
    users.add(id2)
    follows.append([id1, id2])

In [25]:
# Build import/tweetsDB.csv: one row per tweet with a sequential id
# starting at 1 and a constant TWEET label.
tweets = pd.read_csv("tweets.csv")
# newline="" is what the csv module requires for files handed to
# csv.writer, so embedded newlines in tweet text are quoted correctly and
# no extra blank rows appear on Windows. UTF-8 matches the default that
# pd.read_csv uses when this file is read back later in the notebook.
with open("import/tweetsDB.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["tweetId:ID","content","date_time","language",":Label"])
    tweet_fields = zip(tweets["content"], tweets["date_time"], tweets["language"])
    for count, (content, date_time, language) in enumerate(tweet_fields, start=1):
        writer.writerow([count, content, date_time, language, "TWEET"])

In [26]:
# Build import/users.csv: one row per user id with a generated "user<N>"
# name and a constant USER label.
with open("import/users.csv", "w") as f:
    f.write("userId:ID,name,:Label\n")
    # `users` is a set of strings, whose iteration order changes between
    # interpreter runs. Sorting makes the id -> "user<N>" assignment
    # reproducible across kernel restarts.
    for count, user in enumerate(sorted(users)):
        f.write(user + ",user" + str(count) + ",USER\n")

In [27]:
# Build import/follows.csv: one FOLLOWS relationship row per parsed pair,
# first id as :START_ID and second id as :END_ID.
# The `with` block closes the file even if a write fails part-way.
with open("import/follows.csv", "w") as f:
    f.write(":START_ID,:END_ID,:TYPE\n")
    f.writelines(f"{start_id},{end_id},FOLLOWS\n" for start_id, end_id in follows)

In [28]:
# Fetch the 100 users with the most incoming FOLLOWS relationships, then
# attribute every tweet to one of them at random and write the pairs to
# posted.csv.
top100 = conn.query("MATCH (a)-[:FOLLOWS]->(b) RETURN b, COUNT(a) as followers ORDER BY followers DESC LIMIT 100")
if not top100:
    # conn.query returns None when the query failed and [] when nothing
    # matched; either way there is nobody to attribute tweets to.
    raise RuntimeError("No followed users returned; is the FOLLOWS graph loaded and the database reachable?")
user_ids = [record.get("b").get("userId") for record in top100]

tweets = pd.read_csv("import/tweetsDB.csv")
# NOTE(review): unlike the other generated files this one is written outside
# import/ and its header is not :START_ID/:END_ID - confirm that is intended.
with open("posted.csv", "w") as f:
    f.write(":UserID,:TweetID,:TYPE\n")
    # Iterate the id column directly: iterrows() builds a Series per row and
    # may upcast the integer id to match the other columns.
    for tweet_id in tweets["tweetId:ID"]:
        f.write(f'{random.choice(user_ids)},{tweet_id},POSTED\n')

In [61]:
# Remove all USER nodes (and their relationships) in small batches.
# Each conn.query call runs in its own transaction, so memory use is bounded
# by the batch size; a single 50000-node DETACH DELETE exceeded the
# server's transaction memory limit. Lower the batch size if a batch still
# runs out of memory.
DELETE_BATCH_SIZE = 10000
while True:
    deleted = conn.query(
        f"MATCH (n:USER) WITH n LIMIT {DELETE_BATCH_SIZE} "
        "DETACH DELETE n RETURN count(*) AS deleted"
    )
    # Stop when the query failed (None) or the last batch deleted nothing.
    if not deleted or deleted[0]["deleted"] == 0:
        break

Query failed: {code: Neo.TransientError.General.MemoryPoolOutOfMemoryError} {message: The allocation of an extra 2.0 MiB would use more than the limit 1.4 GiB. Currently using 1.4 GiB. dbms.memory.transaction.total.max threshold reached}


In [57]:
# Sanity check: fetch at most one USER node to see whether any exist.
sample_user_query = "MATCH (n:USER) RETURN n LIMIT 1"
response = conn.query(sample_user_query)
print(response)

[]


In [None]:
# Load the generated CSV files as nodes.
# The id columns are named "userId:ID" / "tweetId:ID" in the CSV headers, so
# they must be read with backtick-quoted keys; row.userId / row.tweetId do
# not exist and would store null ids. Labels are USER / TWEET to match the
# label column in the files and the other queries in this notebook.
conn.query("LOAD CSV WITH HEADERS FROM 'file:///users.csv' AS row CREATE (:USER {userId: row.`userId:ID`, name: row.name})")
conn.query("LOAD CSV WITH HEADERS FROM 'file:///tweetsDB.csv' AS row CREATE (:TWEET {tweetId: row.`tweetId:ID`, content: row.content, date_time: row.date_time, language: row.language})")

In [58]:
# List every distinct label combination carried by nodes in the database.
distinct_labels_query = "MATCH (n) RETURN DISTINCT labels(n) as labels"
response = conn.query(distinct_labels_query)
print(response)

[<Record labels=['USER']>]


# 100 Users with most followers

In [14]:
# Show each top user's generated name next to their follower count.
for entry in top100:
    name, followers = entry['b']['name'], entry['followers']
    print(f"{name} :  {followers}")

user35027 :  8660
user54813 :  7700
user2152 :  7623
user34088 :  7558
user72268 :  4798
user8424 :  4337
user73919 :  3986
user28899 :  3850
user50223 :  3712
user31553 :  3655
user30954 :  3623
user47453 :  3255
user47156 :  3197
user62060 :  3172
user3125 :  2974
user59409 :  2904
user13601 :  2874
user49733 :  2858
user61763 :  2725
user58494 :  2693
user40363 :  2680
user48585 :  2678
user69861 :  2634
user74702 :  2593
user60959 :  2560
user69512 :  2539
user22616 :  2425
user32642 :  2399
user51781 :  2356
user75560 :  2346
user42050 :  2342
user79112 :  2317
user42152 :  2291
user14006 :  2241
user4659 :  2238
user44411 :  2209
user62704 :  2208
user26782 :  2122
user68089 :  2095
user43796 :  2083
user37826 :  2073
user6074 :  2069
user4368 :  2031
user78988 :  2014
user13875 :  1981
user26124 :  1927
user44994 :  1897
user1138 :  1862
user27660 :  1821
user11241 :  1799
user42159 :  1775
user54601 :  1774
user63984 :  1763
user41718 :  1752
user58597 :  1742
user77403 :  1725

In [None]:
# Preview the tweets frame. `tweets_df` is never defined in this notebook;
# the frame is called `tweets`. head() avoids dumping the whole table.
tweets.head()

# 100 Users who follow the most users

In [None]:
# Rank users by how many accounts they follow and keep the top 100.
most_following_query = (
    "MATCH (a)-[:FOLLOWS]->(b) "
    "RETURN a, COUNT(b) as following "
    "ORDER BY following DESC LIMIT 100"
)
results = conn.query(most_following_query)

In [None]:
# Print only the outgoing-follow count for each returned user.
for row in results:
    following_count = row['following']
    print(following_count)