# Assignment 4

Important to remember: 

You are free to explore anything you deem interesting and present your findings in your report. The
main goal is to get familiar with Cypher but also to hone your "storytelling" skills. In that sense, try to
focus on a single or a few hypotheses or findings you explore in full (with nicely formatted visualizations)
and explaining what it says instead of just going for quick filter saying: "here are the three nodes with
the most connections" (boring) or showing a graph hairball.


Pick something you want to explore and try to work this out in full.
You will realize that the data set is likely too large to look at everything at once, so a guided “deep dive”
will work better. Also, obviously most of the messages will be in Dutch or French, but that should not be
too much of a burden to do something interesting.

Hypothesis: 
- Far-left and far-right parties have more interaction and relatively more active engagement (think retweets and quote tweets).
- Centrist parties have very little engagement because the older population is not very present on Twitter.

In [None]:
#Load packages
import pandas as pd
import numpy as np
import networkx as nx
import itertools

In [None]:
# Path (relative to the notebook) of the GraphML file to analyse.
file_location = "memgraph-query-results-export.graphml"
# Parse the GraphML file into a NetworkX graph; all later cells start from `graph`.
graph = nx.read_graphml(file_location)

Add the party of the politician. This can be used in Gephi to identify politicians vs normal twitter users

In [None]:
# Copy each party's label onto the accounts that are members of it, so that
# politicians can be told apart from ordinary accounts (nodes without a
# "Party_name" attribute).
for member, party, attrs in graph.edges(data=True):
    # Only membership edges connect an account to a party node.
    if attrs.get("edge_type") != "MEMBER_OF":
        continue

    # The party is the *target* of a MEMBER_OF edge.
    party_attrs = graph.nodes.get(party)
    #print(party_attrs)
    if not party_attrs:
        continue

    party_name = party_attrs.get("Party_label", None)
    if not party_name:
        continue

    # Tag the member (edge source) with the label of its party.
    graph.nodes[member]["Party_name"] = party_name


As an illustration, this is the type of nodes and edges that we are working with

In [None]:

# Number of nodes and edges to show as an illustration of the data.
# The heading and the actual limit are both derived from this one value so
# they cannot drift apart (the node loop previously stopped only after 1000).
preview_size = 10

# Print the first `preview_size` nodes together with their attributes.
print(f"First {preview_size} nodes:")
for node in itertools.islice(graph.nodes(data=True), preview_size):
    print(node)

# Print the first `preview_size` edges together with their attributes.
print(f"First {preview_size} edges:")
for edge in itertools.islice(graph.edges(data=True), preview_size):
    print(edge)

Here is some code that is used to perform checks with:

In [None]:
# Node whose incident edges we want to inspect.
# (Named `node_id` rather than `id` so the builtin `id()` is not shadowed.)
node_id = "352012"

# Print every edge that starts or ends at that node.
for source, target, edge_data in graph.edges(data=True):
    if node_id in (source, target):
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")

In [None]:
# Inspect the attributes stored on a single node.
# (Named `node_id` rather than `id` so the builtin `id()` is not shadowed.)
node_id = "23466"
node_data = graph.nodes[node_id]
print(f"Node ID: {node_id}, Node Data: {node_data}")

In [None]:
# Define the path where you want to save the file
#file_path = "/Users/sarahguilliams/Desktop/Advanced Analytics in a Big Data World/Assignment 4/modified_graph_party_names.graphml"

# Save the modified graph in .graphml format
#nx.write_graphml(graph, file_path)


Now, try to skip the tweet node and look at the direct interactions between politicians and their twitter account vs the twitter account of someone that interacted with them.

From these checks, it becomes obvious that some of the tweets that the politician posted are replies to their own tweets.

In [None]:
# A tweet posted by a politician, which is a reply to a tweet by a twitter user.
# (Named `tweet_id` rather than `id` so the builtin `id()` is not shadowed.)
tweet_id = "338704"

# Print every edge that starts or ends at that tweet.
for source, target, edge_data in graph.edges(data=True):
    if tweet_id in (source, target):
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")

# Show the attributes of the three nodes involved, each under its own heading.
nodes_to_show = (
    ("\nThe politician that posted the tweet:", "358639"),
    ("\nThe tweet in question:", "338704"),
    ("\n This is a reply to this tweet:", "338674"),
)
for heading, node_id in nodes_to_show:
    print(heading)
    node_data = graph.nodes[node_id]
    print(f"Node ID: {node_id}, Node Data: {node_data}")

# This is still an interaction that should be taken into consideration!!!

New code:

In [None]:

# graph2: same as `graph`, but every reply/quote between a politician's tweet
# and an ordinary user's tweet is collapsed into one direct
# user -> tweet edge of type "INTERACTED_W_TWEET".
graph2 = graph.copy()

# Changes are collected first and applied after the scan, so the graph is
# never modified while it is being traversed.
edges_to_remove = []
edges_to_add = []

edges = list(graph2.edges(data=True))

for source, target, edge_data in edges:
    # Only reply and quote relations between two tweets are of interest.
    if edge_data.get("edge_type") not in ("REPLY_TO", "QUOTE_OF"):
        continue

    reply_tweet_source_id = source  # the replying / quoting tweet
    reply_tweet_target_id = target  # the tweet being replied to / quoted

    print(f"TWEET SOURCE: {reply_tweet_source_id}")
    print(f"TWEET TARGET: {reply_tweet_target_id}")

    # Find who POSTED the replying tweet.
    for poster, posted_tweet, post_attrs in graph2.in_edges(reply_tweet_source_id, data=True):
        print(f"{poster}, {posted_tweet}, {post_attrs}")
        print("Debug1")
        if post_attrs.get("edge_type") != "POSTED":
            continue
        print("Debug2")

        if 'Party_name' not in graph2.nodes[poster]:
            # Case 1: the replying account has no party, i.e. is not a politician.
            print("Debug3")
            print(f"Adding edge from {poster} to {reply_tweet_target_id}")
            # Link the user directly to the tweet that was replied to ...
            edges_to_add.append((poster, reply_tweet_target_id, "INTERACTED_W_TWEET"))
            # ... and schedule the now-redundant two-step path for removal.
            edges_to_remove.append((reply_tweet_source_id, reply_tweet_target_id))
            edges_to_remove.append((poster, reply_tweet_source_id))
            continue

        # Case 2: the politician wrote the reply, so look at who POSTED the
        # tweet that was replied to instead.
        for author, authored_tweet, author_attrs in graph2.in_edges(reply_tweet_target_id, data=True):
            print(f"{author}, {authored_tweet}, {author_attrs}")
            print("Debug4")
            if author_attrs.get("edge_type") != "POSTED":
                continue
            print("Debug5")
            if 'Party_name' in graph2.nodes[author]:
                # Author has a party as well: not a user/politician interaction.
                continue
            print("Debug6")
            print(f"Adding edge from {author} to {reply_tweet_source_id}")
            # Link the user directly to the politician's (replying) tweet.
            edges_to_add.append((author, reply_tweet_source_id, "INTERACTED_W_TWEET"))
            edges_to_remove.append((reply_tweet_source_id, reply_tweet_target_id))
            edges_to_remove.append((author, reply_tweet_target_id))

# Apply the collected additions first, then the removals.
for u, v, edge_type in edges_to_add:
    graph2.add_edge(u, v, edge_type=edge_type)

graph2.remove_edges_from(edges_to_remove)


Some nodes need to be removed if they have no edges anymore

In [None]:
# Collect the Tweet nodes of graph2 that are left without any edge ...
nodes_to_remove = [
    node
    for node, node_data in graph2.nodes(data=True)
    if node_data.get('node_type') == 'Tweet' and graph2.degree(node) == 0
]

# ... and drop them from graph2 in one go.
graph2.remove_nodes_from(nodes_to_remove)

## Checks to see if the code works well
### check 1: politician: should not have the new edge interacted_w_tweet

In [None]:
# Node to inspect in graph2.
# (Named `node_id` rather than `id` so the builtin `id()` is not shadowed.)
node_id = "345384"

# Print every graph2 edge that starts or ends at that node.
for source, target, edge_data in graph2.edges(data=True):
    if node_id in (source, target):
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")

### Check one of the tweets by the politician. 

In [None]:
# Node to inspect in graph2.
# (Named `node_id` rather than `id` so the builtin `id()` is not shadowed.)
node_id = "70155"

# Print every graph2 edge that starts or ends at that node.
for source, target, edge_data in graph2.edges(data=True):
    if node_id in (source, target):
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")

### Check a twitter user that is not a politician

In [None]:
# Node to inspect in graph2.
# (Named `node_id` rather than `id` so the builtin `id()` is not shadowed.)
node_id = "352012"

# Print every graph2 edge that starts or ends at that node.
for source, target, edge_data in graph2.edges(data=True):
    if node_id in (source, target):
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")

In [None]:
# Lazily select the (user, tweet) pairs joined by an INTERACTED_W_TWEET edge.
interaction_pairs = (
    (source, target)
    for source, target, edge_data in graph2.edges(data=True)
    if edge_data.get("edge_type") == "INTERACTED_W_TWEET"
)

# Print at most the first 50 of them; `count` ends up holding how many were printed.
count = 0
for count, (source, target) in enumerate(itertools.islice(interaction_pairs, 50), start=1):
    print(f"Tweet Politician {target} <-- INTERACTED_W_TWEET -- User_account {source}")

Save the intermediate file if you want

In [None]:
# Define the path where you want to save the file
#file_path = "/Users/sarahguilliams/Desktop/Advanced Analytics in a Big Data World/Assignment 4/modified_graph_interaction_with_tweet.graphml"

# Save the modified graph in .graphml format
#nx.write_graphml(graph2, file_path)

# Now look at the direct interaction between the politician and the user

Now create graph 3, which looks at the direct interaction between a politician and a Twitter user. There could be more than one interaction, so the edges need to have weights corresponding to the number of interactions that occurred.

### Adding weights depending on how many times there was an interaction

In [None]:
# graph3: start from a copy of graph2 and replace each user -> tweet
# interaction by a weighted user -> politician edge.
graph3 = graph2.copy()

# Edges that become redundant; removed in one go after the scan.
edges_to_remove = []
edges = list(graph3.edges(data=True))

for source, target, edge_data in edges:
    if edge_data.get("edge_type") != "INTERACTED_W_TWEET":
        continue

    # Source is the interacting user account, target the tweet interacted with.
    user_account_id, tweet_id = source, target

    print(f"USER ACCOUNT: {user_account_id}")
    print(f"TWEET BY POLITICIAN: {tweet_id}")

    # Whoever POSTED the tweet is the account the user interacted with.
    # graph2 is read here; it is not modified by this cell.
    for poster, posted_tweet, post_attrs in graph2.in_edges(tweet_id, data=True):
        if post_attrs.get("edge_type") != "POSTED":
            continue

        politician_account = poster
        print(f"Adding/incrementing edge from {user_account_id} to {politician_account}")

        if not graph3.has_edge(user_account_id, politician_account):
            # First interaction between this pair: create the edge with weight 1.
            graph3.add_edge(user_account_id, politician_account, edge_type="INTERACTED_W_POLITICIAN", weight=1)
        else:
            # Repeated interaction: bump the weight. An existing edge that has
            # no weight yet is treated as one earlier interaction, hence 2.
            link = graph3[user_account_id][politician_account]
            link['weight'] = link.get('weight', 1) + 1

        # The tweet-level edges are no longer needed once the direct edge exists.
        edges_to_remove.append((user_account_id, tweet_id))
        edges_to_remove.append((politician_account, tweet_id))

graph3.remove_edges_from(edges_to_remove)

In [None]:
# Remove the loose nodes from graph3.
# A node is dropped when it is a Tweet node OR when it has no edges at all.
nodes_to_remove = [
    node
    for node, node_data in graph3.nodes(data=True)
    if node_data.get('node_type') == 'Tweet' or graph3.degree(node) == 0
]

# Drop all selected nodes from graph3 in one go.
graph3.remove_nodes_from(nodes_to_remove)

#### Check politician
They should still be a member and then interact with different twitter user accounts

In [None]:
# Node to inspect in graph3.
# (Named `node_id` rather than `id` so the builtin `id()` is not shadowed.)
node_id = "351970"

# Print every graph3 edge that starts or ends at that node.
for source, target, edge_data in graph3.edges(data=True):
    if node_id in (source, target):
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")

# For this interaction, the only ones that remain are tweets that are posted by the politician as a reply to their own tweets:

In [None]:
# Count, per politician, the number of incoming INTERACTED_W_POLITICIAN edges
# in graph3; the number of keys is the number of politicians that received at
# least one interaction.
# Note: this counts edges (one per interacting account), not edge weights.
counting_pol_tweets = {}
for source, target, edge_data in graph3.edges(data=True):
    if edge_data.get("edge_type") == 'INTERACTED_W_POLITICIAN':
        # Start at 0 and add 1 per edge. Previously the first edge was
        # initialised to 1 and then incremented again, so every count was
        # one too high.
        counting_pol_tweets[target] = counting_pol_tweets.get(target, 0) + 1
len(counting_pol_tweets)

In [None]:
# Count, per account, the number of MEMBER_OF edges leaving it in graph3; the
# number of keys is the number of politicians (accounts with a party).
counting_pols = {}
for source, target, edge_data in graph3.edges(data=True):
    if edge_data.get("edge_type") == 'MEMBER_OF':
        # Start at 0 and add 1 per edge. Previously the first edge was
        # initialised to 1 and then incremented again, so every count was
        # one too high.
        counting_pols[source] = counting_pols.get(source, 0) + 1
len(counting_pols)

In [None]:
# Politicians that are party members but never appear as the target of an
# interaction edge.
keys_pols = set(counting_pols)
keys_pol_tweets = set(counting_pol_tweets)

no_tweet_pols = keys_pols.difference(keys_pol_tweets)
no_tweet_pols

In [None]:
# Check graph3 for POSTED edges that touch a politician without interactions.
# Each matching edge is reported (and counted) once.
count = 0
for source, target, edge_data in graph3.edges(data=True):
    if edge_data.get('edge_type') != "POSTED":
        continue
    # Set membership replaces the inner loop over every id in no_tweet_pols.
    if source in no_tweet_pols or target in no_tweet_pols:
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")
        # Outer double quotes with inner single quotes: reusing the same quote
        # type inside an f-string is a SyntaxError before Python 3.12.
        print(f"{source},{graph3.nodes[source]['node_type']}")
        print(f"{target},{graph3.nodes[target]['node_type']}")
        count += 1

print(count)

In [None]:
# Same check against the original graph: POSTED edges that touch a politician
# without interactions. Each matching edge is reported (and counted) once.
count = 0
for source, target, edge_data in graph.edges(data=True):
    if edge_data.get('edge_type') != "POSTED":
        continue
    # Set membership replaces the inner loop over every id in no_tweet_pols.
    if source in no_tweet_pols or target in no_tweet_pols:
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")
        # Outer double quotes with inner single quotes: reusing the same quote
        # type inside an f-string is a SyntaxError before Python 3.12.
        print(f"{source},{graph.nodes[source]['followers_count']}")
        print(f'{target},{graph.nodes[target]}')
        count += 1

print(count)
#the politician had one tweet in the original graph but this one
#has no interactions (replies or quotes) so it is correct to exclude it
#the politician had one tweet in the original graph but this one
#has no interactions (replies or quotes) so it is correct to exclude it

### Check a twitter user that is not a politician

In [None]:
# The user => only has the interacted with tweet edge.
# (Named `node_id` rather than `id` so the builtin `id()` is not shadowed.)
node_id = "354726"

# Print every graph3 edge that starts or ends at that node.
for source, target, edge_data in graph3.edges(data=True):
    if node_id in (source, target):
        print(f"Edge: {source} -> {target}, Edge Data: {edge_data}")

In [None]:
# Saving the new graph
# Define the path where you want to save the file
#file_path = "/Users/sarahguilliams/Desktop/Advanced Analytics in a Big Data World/Assignment 4/0521_graph.graphml"

# Save the modified graph in .graphml format
#nx.write_graphml(graph3, file_path)

## Analyse the graph

In [None]:
# Louvain community detection on graph3, using the interaction counts stored
# in the 'weight' edge attribute. The seed is fixed for reproducible results.
louv_com = nx.community.louvain_communities(
    graph3,
    weight='weight',
    resolution=0.5,
    seed=12,
)

In [None]:
# Number of communities in the Louvain result.
len(louv_com)

In [None]:
# Store each node's community number (1-based, in the order the communities
# appear in louv_com) as the 'community' node attribute of graph3.
for community_nr, community in enumerate(louv_com, start=1):
    print(community_nr)
    for node in community:
        print(node)
        graph3.nodes[node]['community'] = community_nr

In [None]:
def intra_inter(graph):
    """Sum edge weights within and between communities.

    Every node of ``graph`` must carry a ``'community'`` attribute. An edge
    contributes its ``'weight'`` attribute, or 1 when the weight is missing
    or falsy.

    Returns a tuple ``(intra, inter)``:

    * ``intra`` maps a community to the total weight of edges whose two
      endpoints are both in that community;
    * ``inter`` maps a community pair to the total weight of edges between
      the two. Each unordered pair is stored once, under the orientation in
      which it was first encountered.
    """
    intra = {}
    inter = {}

    for u, v, edge_data in graph.edges(data=True):
        first = graph.nodes[u]['community']
        second = graph.nodes[v]['community']
        # A missing (or zero) weight counts as a single interaction.
        contribution = edge_data.get('weight') or 1

        if first == second:
            intra[first] = intra.get(first, 0) + contribution
            continue

        # Reuse the reversed key when the pair was already recorded that way.
        pair = (first, second)
        if pair not in inter and (second, first) in inter:
            pair = (second, first)
        inter[pair] = inter.get(pair, 0) + contribution

    return intra, inter

In [None]:
# Compute the within- and between-community weight totals for graph3 and
# show both dictionaries.
intra, inter = intra_inter(graph3)
for label, totals in (('intra edges: ', intra), ('inter edgdes:', inter)):
    print(f'{label}{totals}')

In [None]:
# For every community that has internal edges, record
# (internal weight, external weight, external / internal ratio).
compare_intra_inter = {}
for community, intra_weight in intra.items():
    # Total weight of all between-community pairs that involve this community.
    count_inter_edges = sum(
        weight for pair, weight in inter.items() if community in pair
    )
    ratio = count_inter_edges / intra_weight
    compare_intra_inter[community] = (intra_weight, count_inter_edges, ratio)

compare_intra_inter

In [None]:
# Modularity of the Louvain partition of graph3.
# NOTE(review): weight/resolution are left at the library defaults here, while
# louv_com was computed with resolution=0.5 - confirm this is intended.
modularity_louv = nx.community.modularity(graph3, louv_com)
modularity_louv

In [None]:
# Greedy modularity maximisation on graph3 as an alternative partition,
# again using the 'weight' edge attribute.
greed_com = nx.community.greedy_modularity_communities(
    graph3,
    weight='weight',
    resolution=0.5,
    cutoff=3,
    best_n=10,
)

In [None]:
# Modularity of the greedy-modularity partition of graph3.
# NOTE(review): weight/resolution are left at the library defaults here, while
# greed_com was computed with resolution=0.5 - confirm this is intended.
modularity_greed = nx.community.modularity(graph3, greed_com)
modularity_greed

In [None]:
# girvan_newman() returns a lazy iterator that yields one partition (a tuple
# of node sets) per level of the hierarchy. Modularity needs a single
# partition, so keep the first split; call next() on the iterator again
# (or use itertools.islice) to inspect finer levels.
girv_com = next(nx.community.girvan_newman(graph3))

In [None]:
# Modularity of the Girvan-Newman result on graph3.
# NOTE(review): modularity() expects girv_com to be one partition (a
# collection of node sets) - confirm it is not the raw iterator of partitions.
modularity_girv = nx.community.modularity(graph3, girv_com)
modularity_girv