## Import needed tools

In [1]:
import bson
from neo4j import GraphDatabase
from datetime import datetime, timedelta
from pprint import pprint
from pymongo import MongoClient

## Define constants

In [2]:
#Define labels for nodes:
l_user = "user"
l_tweet = "tweet"
l_hashtag = "hashtag"
l_url = "url"
#Define relationships:
r_tweeted = "tweeted"
r_retweeted = "retweeted"
r_has_url = "has_url"
r_has_hashtag = "has_hashtag"
r_used_url = "used_url"
r_used_hashtag = "used_hashtag"
r_mentioned = "mentioned"
#Define list of properties for nodes to gather from data to be read:
p_user = ["screen_name","followers_count"]
p_user_key = "screen_name"
p_tweet = ["id","text","favorite_count","retweet_count","reply_count"]
p_tweet_key = "id"

## Some custom tools used in the code

In [3]:

class Node:
    '''
    This class is used to group node data together and define a node object
    '''
    def __init__(self,label,properties,propertiesKey):
        self.label = label
        self.properties = properties
        self.propertiesKey = propertiesKey

class Graph:
    '''
    This class is used to start a connection with neo4j and create nodes and relationships
    '''
    def __init__(self,uri,username,password):
        self.graph = GraphDatabase.driver(uri, auth=(username, password))
        self.session = self.graph.session()
    def execute(self,cypher):
        '''
        Execute cypher script
        '''
        return self.session.run(cypher)
    def find (self,label,propertiesKey, propertiesKeyValue) -> Node:
        '''
        Finds an existing node using the node key (we assume single property keys and expect at most one result)
        '''
        c_node = 'n' #Node variable used in queries
        match = self.session.run(f"match ({c_node}:{label}) where {c_node}.{propertiesKey} = '{propertiesKeyValue}' return {c_node}").data()
        if len(match) > 0:
            return Node(label,match[0][c_node],propertiesKey)
        else:
            return None
    def upsertNode(self, node:Node, allowUpdate = True):
        '''
        Update or insert a node. If not already existing it inserts a new node else it updates the existing one.
        We assume that we has a single property as a key (a key is a way to uniquely identify the node).
        The value of the properties key is always stored as string
        '''
        c_node = 'n' #Node variable used in queries
        #Check for existing node :
        match = self.session.run(f"match ({c_node}:{node.label}) where {c_node}.{node.propertiesKey} = '{node.properties[node.propertiesKey]}' return {c_node}").data()
        updateNode = False #<= This flag is set to true only when there is an existing node with different data that those given
        #Convert given properties to a string to be used along with the query :
        propertiesString = ""
        for key in node.properties:
            value = node.properties[key]
            if type(value) == str: value = value.replace("\\","\\\\").replace("'","\\'")
            if len(match) > 0:
                if key not in match[0][c_node]: updateNode = True
                elif value != match[0][c_node][key]: updateNode = True
                propertiesString += " " + c_node +"." + key + "=" + ((str(value))if type(value) != str and key != node.propertiesKey else "'" + str(value) + "'" ) + ","
            else:
                propertiesString += key + ":" + ((str(value))if type(value) != str and key != node.propertiesKey else "'" + str(value) + "'" ) + ","
        propertiesString = propertiesString[:-1]
        #Execute query :
        if len(match) <= 0:         #Create new node
            propertiesString = "{" + propertiesString + "}"
            self.session.run(f"create ({c_node}:{node.label} {propertiesString})")
        elif updateNode == True and allowUpdate == True:    #Update existing node if there is any changed value
            self.session.run(f"match ({c_node}:{node.label}) where {c_node}.{node.propertiesKey} = '{node.properties[node.propertiesKey]}' SET {propertiesString}");

    def upsertRelationship(self,node1:Node,relationship:str,node2:Node,properties, allowUpdate = True):
        '''
        Update or insert a relationship. If not already existing it creates a new relationship else it updates the existing one
        We assume that we has a single property as a key (a key is a way to uniquely identify the node).
        The value of the properties key is always stored as string
        '''
        c_relationship = 'r'
        matchQuery = f"MATCH (a:{node1.label})-[{c_relationship}:{relationship}]->(b:{node2.label}) WHERE a.{node1.propertiesKey} = '{node1.properties[node1.propertiesKey]}' AND b.{node2.propertiesKey} = '{node2.properties[node2.propertiesKey]}'"
        #Check for existing relationship :
        match = self.session.run(f"{matchQuery} return {c_relationship}").value()
        updateRelationship = False #<= This flag is set to true only when there is an existing node with different data that those given
        #Convert given properties to a string to be used along with the query :
        propertiesString = ""
        for key in properties:
            value = properties[key]
            if type(value) == str: value = value.replace("\\","\\\\").replace("'","\\'")
            if len(match) > 0:
                if key not in match[0]._properties: updateRelationship = True
                elif value != match[0]._properties[key]: updateRelationship = True
                propertiesString += " " + c_relationship +"." + key + "=" + ((str(value))if type(value) != str else "'" + value + "'" ) + ","
            else:
                propertiesString += key + ":" + ((str(value))if type(value) != str else "'" + value + "'" ) + ","
        propertiesString = propertiesString[:-1]
        if len(match) <= 0:         #Create new node
            propertiesString = "{" + propertiesString + "}"
            matchQuery = f"MATCH (a:{node1.label}), (b:{node2.label}) WHERE a.{node1.propertiesKey} = '{node1.properties[node1.propertiesKey]}' AND b.{node2.propertiesKey} = '{node2.properties[node2.propertiesKey]}'"
            query = f"{matchQuery} CREATE (a)-[{c_relationship}:{relationship} {propertiesString}]->(b)"
            self.session.run(query)
        elif updateRelationship == True and allowUpdate == True:    #Update existing node if there is any changed value
            matchQuery = f"MATCH (a:{node1.label}), (b:{node2.label}) WHERE a.{node1.propertiesKey} = '{node1.properties[node1.propertiesKey]}' AND b.{node2.propertiesKey} = '{node2.properties[node2.propertiesKey]}'"
            query = f"MATCH (a:{node1.label})-[{c_relationship}:{relationship}]->(b:{node2.label}) WHERE a.{node1.propertiesKey} = '{node1.properties[node1.propertiesKey]}' AND b.{node2.propertiesKey} = '{node2.properties[node2.propertiesKey]}'"
            self.session.run(f"{query} SET {propertiesString}")

## Open connection with Neo4J DBMS

In [4]:
graph = Graph("bolt://localhost:7687","neo4j","1234")

## Create Graph

#### Read Data From BSON File

In [5]:
bson_file = open('data\citizenScience.bson', 'rb')
data = bson.decode_all(bson_file.read())
bson_file.close()

#### Read data From MongoDB Database - Alternative

In [44]:
client = MongoClient(port=27017)
data=[item for item in client.data.citizenScience.find()]
client.close()
#This can be improved by quering from MongoDB only the needed data but further time would be required to learn the quering language of MongoDB
#The implemented solution is working independently from the data gathering method and we consider it to be sufficient for the amount of data we need to handle

#### Create Graph from loaded data

In [30]:
#A function to gather the device from the data:
def getDevice(tweetItem):
    if "source" in tweetItem:
        if ">" in tweetItem["source"] and "<" in tweetItem["source"]:
            tweetDevice = (tweetItem["source"].split(">")[1]).split("<")[0]
        else:
            tweetDevice = tweetItem["source"]
    return tweetDevice

#Sort data from oldest to latest:
data = sorted(data,key=lambda x: datetime.strptime(x["created_at"],"%a %b %d %H:%M:%S %z %Y"), reverse=False)

for item in data:
    # Create User node:
    user = Node(l_user,{p : (item["user"][p] if p in item["user"] else None) for p in p_user},p_user_key)
    graph.upsertNode(user)
    retweet = False
    originalTweetUser = None #<= In case of retweet, who created the tweet originally
    if "retweeted_status" in item:
        #This is a retweet
        tweetItem = item["retweeted_status"]
        originalTweetUser = Node(l_user,{p : (tweetItem["user"][p] if p in tweetItem["user"] else None) for p in p_user}, p_user_key)
        graph.upsertNode(originalTweetUser)
        retweet = True
        retweetDevice = getDevice(item)
        #Get retweet timestamp
        if "created_at" in item:
            date = datetime.strptime(item["created_at"],"%a %b %d %H:%M:%S %z %Y")
            retweetTimestamp = date.strftime("%Y%m%d%H%M%S")
    else:
        #This is a tweet
        tweetItem = item

    #Get the tweet device :
    tweetDevice = getDevice(tweetItem)
    if "created_at" in tweetItem:
        date = datetime.strptime(tweetItem["created_at"],"%a %b %d %H:%M:%S %z %Y")
        tweetTimestamp = date.strftime("%Y%m%d%H%M%S")

    # Create tweet node:
    tweet = Node(l_tweet,{p: (tweetItem[p] if p in tweetItem else None) for p in p_tweet},p_tweet_key)
    graph.upsertNode(tweet)

    #Create user tweet relationship :
    graph.upsertRelationship(user,r_retweeted if retweet else r_tweeted,tweet,{'timestamp':retweetTimestamp if retweet else tweetTimestamp, 'source': retweetDevice  if retweet else tweetDevice})
    if originalTweetUser != None:
        graph.upsertRelationship(originalTweetUser,r_tweeted,tweet,{'timestamp':tweetTimestamp, 'source':tweetDevice})

    #Get tweet entities:
    tweetEntities = []
    hasExtendedEntity = False #Determines if we do have an extended entity in our tweet
    #Get data from entities on root
    if "entities" in tweetItem:
        tweetEntities.append([tweetItem["entities"],"root"])
    #Get data from entities in extended tweet
    if "extended_tweet" in tweetItem:
        if "entities" in tweetItem["extended_tweet"]:
            tweetEntities.append([tweetItem["extended_tweet"]["entities"],"extended_tweet"])
            hasExtendedEntity = True
    for item in tweetEntities:
        entity = item[0]
        if "hashtags" in entity:
            for hashtag in entity["hashtags"]:
                if not "text" in hashtag: continue
                hashtagNode = Node(l_hashtag,{"text":hashtag['text'].lower()},"text")
                graph.upsertNode(hashtagNode)
                graph.upsertRelationship(user,r_used_hashtag,hashtagNode,{'timestamp':tweetTimestamp})
                graph.upsertRelationship(tweet,r_has_hashtag,hashtagNode,{'timestamp':tweetTimestamp})
        if "urls" in entity:
            for url in entity["urls"]:
                if (hasExtendedEntity) and item[1] == "root": break #In case we have an extended tweet don't get the URLs from root only from extended tweet
                if not "expanded_url" in url: continue
                urlNode = Node(l_url,{"url":url["expanded_url"]},'url')
                graph.upsertNode(urlNode)
                graph.upsertRelationship(user,r_used_url,urlNode,{'timestamp':tweetTimestamp})
                graph.upsertRelationship(tweet,r_has_url,urlNode,{'timestamp':tweetTimestamp})
        if "user_mentions" in entity:
            for usr in entity["user_mentions"]:
                if not p_user_key in usr: continue
                existingUser = graph.find(l_user,p_user_key,usr[p_user_key])
                if existingUser == None:
                    existingUser = Node(l_user,{p : (usr[p] if p in usr else 0) for p in p_user},p_user_key)
                    graph.upsertNode(existingUser,allowUpdate=False)
                graph.upsertRelationship(user,r_mentioned,existingUser,{'timestamp':tweetTimestamp},allowUpdate=False)

## Questions & Answers

#### 1. Get the total number of tweets

In [152]:
print(graph.execute(f"match (tweets:{l_tweet}) return count(tweets) as Number_of_tweets").data())

[{'Number_of_tweets': 9409}]


#### 2. Get the total number of retweets

In [153]:
print(graph.execute(f"match (:{l_user})-[retweets:{r_retweeted}]->(:{l_tweet}) return count(retweets) as Number_of_retweets").data())

[{'Number_of_retweets': 20745}]


#### 3. Get the total number of hashtags (case insensitive)

In [154]:
print(graph.execute(f"match (hashtags:{l_hashtag}) return count(toLower(hashtags.text)) as Number_of_hashtags").data())

[{'Number_of_hashtags': 5324}]


#### 4. Get the 20 most popular hashtags (case insensitive) in descending order

In [155]:
pprint(graph.execute(f"match (hashtags:{l_hashtag}) return distinct toLower(hashtags.text) as hashtag, size((:{l_user})-[:{r_used_hashtag}]->(hashtags)) as indegree order by indegree descending limit 20").data())

[{'hashtag': 'openscience', 'indegree': 4492},
 {'hashtag': 'citizenscience', 'indegree': 2506},
 {'hashtag': 'openaccess', 'indegree': 641},
 {'hashtag': 'scicomm', 'indegree': 562},
 {'hashtag': 'opendata', 'indegree': 433},
 {'hashtag': 'publicengagement', 'indegree': 427},
 {'hashtag': 'crowdsourcing', 'indegree': 385},
 {'hashtag': 'opensource', 'indegree': 369},
 {'hashtag': 'datascience', 'indegree': 299},
 {'hashtag': 'bioinformatics', 'indegree': 267},
 {'hashtag': 'sb19', 'indegree': 257},
 {'hashtag': 'stanworld', 'indegree': 255},
 {'hashtag': 'westanpeace', 'indegree': 255},
 {'hashtag': 'research', 'indegree': 255},
 {'hashtag': 'westanlove', 'indegree': 255},
 {'hashtag': 'citsci', 'indegree': 253},
 {'hashtag': 'ukraine', 'indegree': 245},
 {'hashtag': 'machinelearning', 'indegree': 234},
 {'hashtag': 'covid19', 'indegree': 231},
 {'hashtag': 'biodiversity', 'indegree': 222}]


#### 5. Get the total number of URLs

In [23]:
print(graph.execute(f"match (urls:{l_url}) return count(urls) as Number_Of_URLs").data())

[{'Number_Of_URLs': 3915}]


#### 6. Get the 20 most popular URLs in descending order

In [24]:
pprint(graph.execute(f"match (urls:{l_url}) return urls.url as url, size((:{l_user})-[:{r_used_url}]->(urls)) as indegree order by indegree descending limit 20").data())

[{'indegree': 433, 'url': 'http://apne.ws/CCCiUpB'},
 {'indegree': 212, 'url': 'http://www.nairaworkers.com'},
 {'indegree': 163, 'url': 'https://www.cisa.gov/shields-up'},
 {'indegree': 105, 'url': 'https://osf.io/preprints/metaarxiv/zry2u'},
 {'indegree': 95,
  'url': 'https://plantfunctionaltraitscourses.w.uib.no/pftc6-norway-sign-up-now/'},
 {'indegree': 75, 'url': 'https://www.nature.com/articles/d41586-022-00402-1'},
 {'indegree': 72,
  'url': 'https://ceh-online-surveys.onlinesurveys.ac.uk/pollinator-citizen-science-across-europe'},
 {'indegree': 64,
  'url': 'https://employment.ku.edu/postdoctoral-researcher/21278br'},
 {'indegree': 64,
  'url': 'https://apnews.com/article/f2c4960e48b8022a567780f3602b54e2'},
 {'indegree': 60,
  'url': 'https://www.atlanticcouncil.org/blogs/ukrainealert/new-crowdsourcing-campaign-can-help-save-ukraine/'},
 {'indegree': 58,
  'url': 'https://www.humboldt-foundation.de/fileadmin/Bewerben/Programme/Philipp-Schwartz-Initiative/PSI_Special_provisions

#### 7. Get the followers count of each user

In [158]:
pprint(graph.execute(f"match (users:{l_user}) return sum(users.{p_user[1]}) as Total_followers_count").data())
pprint(graph.execute(f"match (users:{l_user}) return users.{p_user[0]} as User, users.{p_user[1]} as followers").data())

[{'Total_followers_count': 118603522}]
[{'User': 'AndGenomics', 'followers': 2369},
 {'User': '_lewtun', 'followers': 2195},
 {'User': 'Raamana_', 'followers': 2997},
 {'User': 'JayHeltzer', 'followers': 1305},
 {'User': 'Canada_CEBCEM', 'followers': 456},
 {'User': 'OutTeachEd', 'followers': 2222},
 {'User': 'zehavoc', 'followers': 4667},
 {'User': 'EcoPol_Arg', 'followers': 642},
 {'User': 'AvilaLovera', 'followers': 266},
 {'User': 'seed_ball', 'followers': 52582},
 {'User': 'bigmeadowsearch', 'followers': 465},
 {'User': 'LicenceProDist', 'followers': 205},
 {'User': 'TheJulieBenson', 'followers': 18156},
 {'User': 'MyriamHirt', 'followers': 60},
 {'User': 'jenjohnston74', 'followers': 32},
 {'User': 'zaibatsu', 'followers': 0},
 {'User': 'CondeNast', 'followers': 0},
 {'User': 'WIRED', 'followers': 0},
 {'User': 'ArtsJournalNews', 'followers': 0},
 {'User': 'harrisonstephen', 'followers': 0},
 {'User': 'alanmillerNLP', 'followers': 0},
 {'User': 'MattNavarra', 'followers': 114127}

#### 8. Get the 20 users with most followers in descending order

In [29]:
query = f"match (users:{l_user}) return users.{p_user[0]} as user, users.{p_user[1]} as followers order by followers descending limit 20"
pprint(graph.execute(query).data())

[{'followers': 17519323, 'user': 'Forbes'},
 {'followers': 15630637, 'user': 'AP'},
 {'followers': 4897983, 'user': 'coinbase'},
 {'followers': 4286962, 'user': 'marcorubio'},
 {'followers': 3048702, 'user': 'NWS'},
 {'followers': 2940260, 'user': 'verge'},
 {'followers': 1882525, 'user': 'britishlibrary'},
 {'followers': 1751550, 'user': 'BoredElonMusk'},
 {'followers': 1333989, 'user': 'TheRickWilson'},
 {'followers': 1262006, 'user': 'zeerajasthan_'},
 {'followers': 1227496, 'user': 'NSF'},
 {'followers': 1027275, 'user': 'deray'},
 {'followers': 893280, 'user': 'thidakarn'},
 {'followers': 802165, 'user': 'NASAGoddard'},
 {'followers': 784534, 'user': 'campbellclaret'},
 {'followers': 763029, 'user': 'NASAJuno'},
 {'followers': 744498, 'user': 'NASASun'},
 {'followers': 721944, 'user': 'UNESCOarabic'},
 {'followers': 643790, 'user': 'nrc'},
 {'followers': 636812, 'user': 'BMWUSA'}]


#### 9. Get the number of tweets & retweets per hour

In [160]:
#A function used to convert the output duration from the cypher query into hours:
def duration_in_hours(duration):
    # Unit-based form: P[nY][nM][nW][nD][T[nH][nM][nS]]   (P is prefix for duration)
    years=months=weeks=days=hours=minutes=seconds = 0
    duration=str(duration)
    if "T" in duration:
        x,y = duration.split("T",1)
    else:
        x = str(duration)
    _,x = x.split("P",1)
    if 'Y' in x:
        years,x = x.split("Y",1)
    if 'M' in x:
        months,x = x.split("M",1)
    if 'W' in x:
        weeks,x = x.split("W",1)
    if 'D' in x:
        days,x = x.split("D",1)

    if y:
        if 'H' in y:
            hours,y = (y.split("H",1))
        if 'M' in y:
            minutes,y = (y.split("M",1))
        if 'S' in y:
            seconds,y = (y.split("S",1))
    if int(minutes) > 30:
        return (int(years)*8760) + (int(months)*730) + (int(weeks)*168) + (int(days)*24)  + int(hours) + 1
    else:
        return (int(years)*8760) + (int(months)*730) + (int(weeks)*168) + (int(days)*24) + int(hours)

#Calculate tweets per hour :
tweet_result = graph.execute(f"MATCH ()-[r:{r_tweeted}]->() RETURN distinct count(r) as tweets").data()
for row in tweet_result:
    tweets = row["tweets"]
tweet_result = graph.execute(f"MATCH (:{l_user})-[r:{r_tweeted}]->() RETURN min(toInteger(r.timestamp)) as min, max(toInteger(r.timestamp)) as max").data()
for row in tweet_result:
    min_date = datetime.strptime(str(row["min"]),"%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%S.000")
    max_date = datetime.strptime(str(row["max"]),"%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%S.000")
tweet_result = graph.execute(f"UNWIND [ duration.inSeconds(localdatetime('{min_date}'), localdatetime('{max_date}')) ] AS aDuration RETURN aDuration")
for row in tweet_result:
    duration=row["aDuration"]
hours = duration_in_hours(duration)
tweets_per_hour = tweets / hours
print("Tweets per hour: ", tweets_per_hour)

#Calculate retweets per hour :
retweet_result = graph.execute(f"MATCH ()-[r:{r_retweeted}]->() RETURN distinct count(r) as retweets").data()
for row in retweet_result:
    retweets = row["retweets"]
retweet_result = graph.execute(f"MATCH (:{l_user})-[r:{r_retweeted}]->(:{l_tweet}) RETURN min(toInteger(r.timestamp)) as min, max(toInteger(r.timestamp)) as max").data()
for row in retweet_result:
    min_date = datetime.strptime(str(row["min"]),"%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%S.000")
    max_date = datetime.strptime(str(row["max"]),"%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%S.000")
retweet_result = graph.execute(f"UNWIND [ duration.inSeconds(localdatetime('{min_date}'), localdatetime('{max_date}')) ] AS aDuration RETURN aDuration")
for row in retweet_result:
    duration=row["aDuration"]
hours = duration_in_hours(duration)
retweets_per_hour = retweets / hours
print("Retweets per hour: ", retweets_per_hour)

Tweets per hour:  0.23174306051575083
Retweets per hour:  58.436619718309856


#### 10. Get the hour with the most tweets and retweets

In [161]:
min = datetime.strptime("000000","%H%M%S")
max = datetime.strptime("235959","%H%M%S")
results = []
while min <= max:
    prevHour = min.strftime("%H0000")
    min += timedelta(hours=1) #Next hour
    currentHour = min.strftime("%H0000")
    results.append({
        'hour':prevHour[:2] + "-" + currentHour[:2],
        'tweets':graph.execute(f"match (:user)-[relationships]->(:tweet) where substring(relationships.timestamp,8,6) >= '{prevHour}' and substring(relationships.timestamp,8,6) <= '{currentHour}' return count(relationships) as c").data()[0]["c"]
        })
pprint(results)
popularHour = 0
mostTweetsPerHour = 0
for item in results:
    if mostTweetsPerHour < item["tweets"]:
        mostTweetsPerHour = item["tweets"]
        popularHour = item["hour"]
print(f"\nHour with most tweets & retweets : {popularHour}")

[{'hour': '00-01', 'tweets': 765},
 {'hour': '01-02', 'tweets': 749},
 {'hour': '02-03', 'tweets': 807},
 {'hour': '03-04', 'tweets': 809},
 {'hour': '04-05', 'tweets': 1005},
 {'hour': '05-06', 'tweets': 828},
 {'hour': '06-07', 'tweets': 880},
 {'hour': '07-08', 'tweets': 1151},
 {'hour': '08-09', 'tweets': 1441},
 {'hour': '09-10', 'tweets': 1632},
 {'hour': '10-11', 'tweets': 1531},
 {'hour': '11-12', 'tweets': 1455},
 {'hour': '12-13', 'tweets': 1422},
 {'hour': '13-14', 'tweets': 1520},
 {'hour': '14-15', 'tweets': 1710},
 {'hour': '15-16', 'tweets': 1669},
 {'hour': '16-17', 'tweets': 1777},
 {'hour': '17-18', 'tweets': 1755},
 {'hour': '18-19', 'tweets': 1478},
 {'hour': '19-20', 'tweets': 1445},
 {'hour': '20-21', 'tweets': 1318},
 {'hour': '21-22', 'tweets': 1076},
 {'hour': '22-23', 'tweets': 1098},
 {'hour': '23-00', 'tweets': 0}]

Hour with most tweets & retweets : 16-17


#### 11. Get the device that most users are tweeting from (top 5 devices)

In [162]:
query = f"match (:{l_user})-[tweeted]->(:{l_tweet}) return distinct tweeted.source as device, count(tweeted.source) as times_used order by times_used descending limit 5"
pprint(graph.execute(query).data())

[{'device': 'Twitter Web App', 'times_used': 9508},
 {'device': 'Twitter for Android', 'times_used': 5975},
 {'device': 'Twitter for iPhone', 'times_used': 5962},
 {'device': 'OpenSciTalk', 'times_used': 1852},
 {'device': 'TweetDeck', 'times_used': 841}]


#### 12. Get the users, in descending order, that have been mentioned the most

In [7]:
results = graph.execute(f"MATCH ()-[r:{r_mentioned}]->(u:{l_user})  RETURN u.screen_name as user, count(r) as mentions ORDER BY count(r) DESC limit 20").data()
pprint(results)

[{'mentions': 257, 'user': 'SB19Official'},
 {'mentions': 217, 'user': 'TheRickWilson'},
 {'mentions': 116, 'user': 'doctorow'},
 {'mentions': 116, 'user': 'zittrain'},
 {'mentions': 110, 'user': 'CitSciOZ'},
 {'mentions': 96, 'user': 'PFTCourses'},
 {'mentions': 95, 'user': 'raspishake'},
 {'mentions': 79, 'user': 'NIH'},
 {'mentions': 75, 'user': 'FORRTproject'},
 {'mentions': 70, 'user': 'inaturalist'},
 {'mentions': 68, 'user': 'OPERASEU'},
 {'mentions': 68, 'user': 'Flavio_Azevedo_'},
 {'mentions': 67, 'user': 'cOAlitionS_OA'},
 {'mentions': 64, 'user': 'AGUecohydro'},
 {'mentions': 64, 'user': 'LandonMarston'},
 {'mentions': 61, 'user': 'UniLeipzig'},
 {'mentions': 61, 'user': 'SueReviews'},
 {'mentions': 61, 'user': 'AutismINSAR'},
 {'mentions': 59, 'user': 'AvHStiftung'},
 {'mentions': 58, 'user': 'ScienceEurope'}]


#### 13. Get the most active users (users that have posted most tweets)

In [165]:
results = graph.execute(f"MATCH (u:{l_user})-[r:{r_tweeted}]-() RETURN count(r) as tweets , u.screen_name as user order by count(r) DESC limit 20").data()
pprint(results)

[{'tweets': 698, 'user': 'Aalst_Waalre'},
 {'tweets': 101, 'user': 'RobotRrid'},
 {'tweets': 98, 'user': 'OpenSci_News'},
 {'tweets': 90, 'user': 'raspishakEQ'},
 {'tweets': 66, 'user': 'Primary_Immune'},
 {'tweets': 65, 'user': 'DG_Rand'},
 {'tweets': 42, 'user': 'moneynetlink'},
 {'tweets': 38, 'user': 'DocCrenau'},
 {'tweets': 33, 'user': 'egonwillighagen'},
 {'tweets': 32, 'user': 'AlanSheehan18'},
 {'tweets': 32, 'user': 'citizenskies'},
 {'tweets': 30, 'user': 'HeidiProject'},
 {'tweets': 30, 'user': 'MDDelahunty'},
 {'tweets': 28, 'user': 'for_designer'},
 {'tweets': 28, 'user': 'CreativeSage'},
 {'tweets': 22, 'user': 'Treadstone71LLC'},
 {'tweets': 22, 'user': 'pivottwistdev'},
 {'tweets': 22, 'user': 'CitieSHealthEU'},
 {'tweets': 21, 'user': 'BitcoinORama'},
 {'tweets': 21, 'user': 'fdmhildesheim'}]


#### 14. Get the top 20 tweets that has been retweeted the most and the persons that posted them

In [5]:
query = f"match (tweet:{l_tweet})<-[:{r_tweeted}]-(users:{l_user}) return distinct tweet.id as tweet_id, users.{p_user[0]} as user,size((tweet)<-[:{r_retweeted}]-()) as times_retweeted order by times_retweeted descending limit 20"
pprint(graph.execute(query).data())

[{'times_retweeted': 934,
  'tweet_id': '1499064794829131779',
  'user': 'PigsAndPlans'},
 {'times_retweeted': 428, 'tweet_id': '1499959375536001025', 'user': 'AP'},
 {'times_retweeted': 252,
  'tweet_id': '1500684178114760706',
  'user': 'acetwtts'},
 {'times_retweeted': 243,
  'tweet_id': '1499579773382967296',
  'user': 'commissionsbyk'},
 {'times_retweeted': 212,
  'tweet_id': '1307603562206244864',
  'user': 'nairaworkers'},
 {'times_retweeted': 203,
  'tweet_id': '1497580495533711362',
  'user': 'paulUKcoder'},
 {'times_retweeted': 200,
  'tweet_id': '1303357058134278146',
  'user': 'nairaworkers'},
 {'times_retweeted': 191,
  'tweet_id': '1497647768109830146',
  'user': 'JasonPLowery'},
 {'times_retweeted': 162,
  'tweet_id': '1498839559290994695',
  'user': 'marcorubio'},
 {'times_retweeted': 118,
  'tweet_id': '1499421562117709834',
  'user': 'mcclure111'},
 {'times_retweeted': 113,
  'tweet_id': '1497375254313791491',
  'user': 'fredbenenson'},
 {'times_retweeted': 105, 'twee

#### 15.Get the top-20 hashtags that co-occur with the hashtag that has been used the most

In [167]:
most_popular = graph.execute(f"MATCH ()-[r:{r_used_hashtag}]->(h:{l_hashtag}) RETURN h.text AS most_popular, count(r) order by count(r) DESC LIMIT 1").data()[0]['most_popular']
results = graph.execute(f'''MATCH (t:{l_tweet})-[r:{r_has_hashtag}]->(h:{l_hashtag})
                            WHERE EXISTS{{
                                MATCH (t)-[r2:{r_has_hashtag}]->(h2:{l_hashtag})
                                WHERE h2.text = '{most_popular}' }}
                            AND h.text <> '{most_popular}'
                            RETURN h.text as hashtag, count(r) as count
                            order by count desc LIMIT 20''').data()
print(f"Most used hashtag : {most_popular}")
pprint(results)

Most used hashtag : openscience
[{'count': 305, 'hashtag': 'openaccess'},
 {'count': 145, 'hashtag': 'opendata'},
 {'count': 137, 'hashtag': 'scicomm'},
 {'count': 101, 'hashtag': 'opensource'},
 {'count': 96, 'hashtag': 'datascience'},
 {'count': 93, 'hashtag': 'bigdata'},
 {'count': 79, 'hashtag': 'bioinformatics'},
 {'count': 67, 'hashtag': 'python'},
 {'count': 63, 'hashtag': 'genomics'},
 {'count': 60, 'hashtag': '100daysofcode'},
 {'count': 60, 'hashtag': 'research'},
 {'count': 51, 'hashtag': 'rstats'},
 {'count': 49, 'hashtag': 'coding'},
 {'count': 48, 'hashtag': 'machinelearning'},
 {'count': 46, 'hashtag': 'covid19'},
 {'count': 45, 'hashtag': 'immunology'},
 {'count': 45, 'hashtag': 'serverless'},
 {'count': 41, 'hashtag': 'fairdata'},
 {'count': 39, 'hashtag': 'linux'},
 {'count': 39, 'hashtag': 'citizenscience'}]


#### 16. Get the most “important” user in the dataset (use Graph algorithms: Pagerank, Betweenness centrality, etc. ). You will apply these algorithms in the mention network (which includes retweets)

In [21]:
userMentionsGraphExists = graph.execute(f"call gds.graph.exists('userMentionsGraph') yield graphName, exists return exists").data()[0]["exists"]
if userMentionsGraphExists:
    graph.execute(f"call gds.graph.drop('userMentionsGraph') YIELD graphName")
#Give a weight on each user -> user relationship depending on their follower count:
maxFollowersValue = graph.execute(f"match (users:{l_user}) RETURN max(users.{p_user[1]}) as max").data()[0]['max']              #Get the max value to normalize the weight between 0 and 1
graph.execute(f"match (:{l_user})-[r:{r_mentioned}]->(users:{l_user}) SET r.weight = (users.{p_user[1]} * 1.0 + 0.1) / {maxFollowersValue}")  #Set the weight to relationship

#If user userMentionsGraph graph is not stored in the catalog, create it:
graph.execute(f"call gds.graph.create('userMentionsGraph','{l_user}','{r_mentioned}',{{nodeProperties: '{p_user[1]}',relationshipProperties:'weight'}})")
#Get important users with PageRank:
importantUsers = graph.execute(f'''CALL gds.pageRank.stream('userMentionsGraph', {{ relationshipWeightProperty: 'weight' }})
                                    YIELD nodeId, score
                                    RETURN gds.util.asNode(nodeId).{p_user[0]} AS user, score
                                    ORDER BY score DESC, user ASC limit 5''').data()
print("Most important user: " + importantUsers[0]['user'])
print("Top 5 users:")
pprint([usr['user'] for usr in importantUsers])

Most important user: inaturalist
Top 5 users:
['inaturalist', 'doctorow', 'SueReviews', 'WHCEQ', 'PLOSONE']


#### 17. For the 5th most important user, get the list of hashtags and URLs that have been posted

In [169]:
if not 'importantUsers' in vars():
    raise Exception("Execute the answer to question 16 first to get the most important users")
selectedUser = importantUsers[4]["user"] #Get 5th most important user
results = graph.execute(f"match (user:{l_user})-[r]->(nodes) where (nodes:{l_hashtag} or nodes:{l_url}) and user.{p_user[0]} = '{selectedUser}' return  nodes.text as hashtag, nodes.url as url").data()
print(f"5th Most important user:\n{selectedUser}")
print("Hashtags used:")
pprint([r['hashtag'] for r in results if r['hashtag'] != None ])
print("Urls used:")
pprint([r['url'] for r in results if r['url'] != None ])

5th Most important user:
PLOSONE
Hashtags used:
['openscience']
Urls used:
[]


#### 18. Get the users that post tweets with hashtags most similar to those used by the most important user

In [32]:
if not 'importantUsers' in vars():
    raise Exception("Execute the answer to question 16 first to get the most important users")
hashtags = []
user = []
selectedUser = importantUsers[0]['user']
result = graph.execute(f"MATCH (u:{l_user})-[r:{r_used_hashtag}]->(h:{l_hashtag}) where u.{p_user[0]} = '{selectedUser}' return h.text as hashtag")
for row in result:
    hashtags.append(row["hashtag"])
    #get users that used most the most similar hashtags (but not the same) with most important user
    for hashtag in hashtags:
        result = graph.execute(f'''MATCH (u)-[r:{r_used_hashtag}]->(h:{l_hashtag})
                                    with apoc.text.sorensenDiceSimilarity('{hashtag}', h.text) as similarity, u.{p_user[0]} as screen_name, h.text as hashtag
                                    where similarity > 0.8 and similarity <> 1
                                    return screen_name, hashtag, similarity LIMIT 5''')
        for row in result:
            user.append([row["screen_name"],row["hashtag"],hashtag,row["similarity"]])
user.sort(key=lambda x: x[2], reverse=True)
user.insert(0,['User','Hashtag','Hashtag used by important user','Similarity'])
pprint(user)

[['User', 'Hashtag', 'Hashtag used by important user', 'Similarity'],
 ['KeithPiccard', 'scienceed', 'science', 0.8571428571428571],
 ['nettie087', 'sciences', 'science', 0.9230769230769231],
 ['OpenSciTalk', 'sciences', 'science', 0.9230769230769231],
 ['amarois', 'sciences', 'science', 0.9230769230769231],
 ['oxygenases', 'myscience', 'science', 0.8571428571428571],
 ['CitSciOZ', 'citizensciencekits', 'citizenscience', 0.8666666666666667],
 ['CitSciWA', 'citizensciencekits', 'citizenscience', 0.8666666666666667],
 ['SciStarter', 'citizensciencekits', 'citizenscience', 0.8666666666666667],
 ['GrundyLibrary', 'citizensciencekits', 'citizenscience', 0.8666666666666667],
 ['citnatchallenge', 'citizenscienceforall', 'citizenscience', 0.8125],
 ['CitSciOZ', 'citizensciencekits', 'citizenscience', 0.8666666666666667],
 ['CitSciWA', 'citizensciencekits', 'citizenscience', 0.8666666666666667],
 ['SciStarter', 'citizensciencekits', 'citizenscience', 0.8666666666666667],
 ['GrundyLibrary', 'cit

#### 19. Get the user communities that have been created based on the users’ interactions and visualise them (Louvain algorithm)

In [171]:
userMentionsGraphExists = graph.execute(f"call gds.graph.exists('GraphforLouvain') yield graphName, exists return exists").data()[0]["exists"]
if userMentionsGraphExists:
    graph.execute(f"call gds.graph.drop('GraphforLouvain') YIELD graphName")

#If user userMentionsGraph graph is not stored in the catalog, create it:
graph.execute(f"call gds.graph.create('GraphforLouvain','{l_user}',{{{r_mentioned}:{{orientation: 'UNDIRECTED'}}}},{{ nodeProperties: 'followers_count' }})")
graph.execute(f'''call gds.louvain.write('GraphforLouvain', {{writeProperty: 'community'}})''')
communities = graph.execute(f"match (users:{l_user}) return users.community as communityID, users.screen_name as user order by communityID").data()
communitiesCount = graph.execute(f"match (users:{l_user}) return count(distinct users.community) as communityCount").data()[0]['communityCount']
communitiesWithMoreThanOneUser = graph.execute(f"match (users:{l_user})-[:{r_mentioned}]-(:{l_user}) return count(distinct users.community) as communitiesWithMoreThanOneUser").data()[0]['communitiesWithMoreThanOneUser']
#Group results by communityID:
results = {}
for community in communities:
    if community['communityID'] not in results:
        results[community['communityID']] = []
    # Add users to their community id:
    results[community['communityID']].append(community['user'])
print("To visualize the communities graph, execute the query bellow in Neo4J Browser (single user groups are excluded):")
print("\n-----(Start >)-----")
print(f"match (users:{l_user})-[:{r_mentioned}]-(:{l_user}) return users.community as communityID, users as user order by communityID")
print("-----( < End )-----\n")
print(f"Total number of communities: {communitiesCount}")
print(f"Communities with more than one users: {communitiesWithMoreThanOneUser}")
print("\nAll communities:")
print("{communityID:[users in community]}")
pprint(results)

To visualize the communities graph, execute the query bellow in Neo4J Browser (single user groups are excluded):

-----(Start >)-----
match (users:user)-[:mentioned]-(:user) return users.community as communityID, users as user order by communityID
-----( < End )-----

Total number of communities: 10189
Communities with more than one users: 1111

All communities:
{communityID:[users in community]}
{5: ['hebbianloop'],
 9: ['ChronList'],
 12: ['realitycrafter'],
 19: ['mina_radman'],
 20: ['cgonzagaj'],
 22: ['Sunny2daySunny', 'Paull2', 'TheRealKeean'],
 27: ['MarvinSchmittML'],
 30: ['samuelu15158272'],
 31: ['surfkt'],
 33: ['patrickm02L'],
 34: ['aybajye'],
 35: ['mikeleiyanke'],
 38: ['tripl3check', 'fayeflam', 'mosermr'],
 45: ['YCollaud'],
 46: ['Jhanne45'],
 47: ['donatas_s'],
 52: ['EcuaCorrienteSA'],
 53: ['kbguzzo'],
 55: ['jorantes12'],
 57: ['Ecsa_Enfermeras'],
 58: ['QilinWang6'],
 61: ['TangaroaBlue'],
 62: ['garryd50'],
 64: ['AnkeMLeitzgen'],
 65: ['znjrpfelyutkl'],
 66: 

#### 20. Try to visualise the subgraph of users that have used the 5th most common hashtag

In [18]:
# Get the 5th most common hashtag:
hashtag = graph.execute(f"match (hashtags:{l_hashtag}) return distinct toLower(hashtags.text) as hashtag, size(()-[]->(hashtags)) as indegree order by indegree descending limit 5").data()[4]["hashtag"]
print("Execute command below on neo4j Browser to visualize the selected subgraph:")
print("\n-----(Start >)-----")
print(f'''match (users:{l_user})-[r1:{r_used_hashtag}]-(hashtags:{l_hashtag}) where hashtags.text = '{hashtag}'
    CALL apoc.path.subgraphAll(users, {{maxLevel: 1}}) YIELD nodes, relationships
    return nodes,relationships''')
print("-----( < End )-----\n")
print(f"The 5th most common hashtag: '{hashtag}'")
print("\nUsers that used the 5th most common hashtag:")
pprint([res['users'] for res in graph.execute(f"match (users:{l_user})-[r1:{r_used_hashtag}]-(hashtags:{l_hashtag}) where hashtags.text = '{hashtag}' return users").data()])

Execute command below on neo4j Browser to visualize the selected subgraph:

-----(Start >)-----
match (users:user)-[r1:used_hashtag]-(hashtags:hashtag) where hashtags.text = 'airpollution'
    CALL apoc.path.subgraphAll(users, {maxLevel: 1}) YIELD nodes, relationships
    return nodes,relationships
-----( < End )-----

The 5th most common hashtag: 'airpollution'

Users that used the 5th most common hashtag:
[{'community': 20344,
  'followers_count': 13220,
  'screen_name': 'JeanLambertLDN'},
 {'community': 18484,
  'followers_count': 2589,
  'screen_name': 'DanWils42696605'},
 {'community': 2542, 'followers_count': 0, 'screen_name': 'Tunieej'},
 {'community': 10714, 'followers_count': 19832, 'screen_name': 'rstatstweet'},
 {'community': 10714, 'followers_count': 1660, 'screen_name': 'Lala35539674'},
 {'community': 10714, 'followers_count': 3429, 'screen_name': 'ImNickHuber'},
 {'community': 10714, 'followers_count': 483, 'screen_name': 'protontypes'},
 {'community': 20486, 'followers_c