Social Media Analytics Project 8 - Community Detection in a Twitter Network
===
Goloviatinski Sergiy, Herbelin Ludovic <br />
MCS 2020

In [93]:
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import networkx as nx
from math import sqrt, log
from tqdm.notebook import trange, tqdm
import random
import os

## Dataset loading

In [2]:
# Edge list file loaded with nx.read_edgelist to build the full graph
DATA_COMBINED_PATH = './data/twitter_combined.txt'
# Directory scanned for the per-ego '<ego_node>.circles' files
DATA_OTHERS = './data/twitter/'

In [70]:
# key: node_id, value: set of social circle ids
social_circles = {}
# running circle id, unique across all the '.circles' files
i = 0
for filename in os.listdir(DATA_OTHERS):
    if filename.split('.')[-1] != 'circles':
        continue

    ego_node = filename.split('.')[0]
    # setdefault (instead of assigning a new set) keeps the memberships already
    # recorded when this ego node appeared as a member in a previously read file
    social_circles.setdefault(ego_node, set())

    with open(os.path.join(DATA_OTHERS, filename)) as file:
        for line in file:
            fields = line.split()
            if not fields:
                # blank line (e.g. trailing newline or empty file): not a circle,
                # so it must not consume a circle id
                continue

            # the first field is skipped, the remaining ones are the member node ids
            for node in fields[1:]:
                social_circles.setdefault(node, set()).add(i)
            # the ego node belongs to every circle listed in its own file
            social_circles[ego_node].add(i)
            i += 1

In [4]:
# Read the combined edge list, then copy its edges into a fresh undirected graph.
edges = nx.read_edgelist(DATA_COMBINED_PATH)
original_G = nx.Graph()
original_G.add_edges_from(edges.edges())

# Keep only the nodes that have an entry in social_circles.
nodes_without_circles = [
    candidate for candidate in list(original_G.nodes)
    if candidate not in social_circles
]
original_G.remove_nodes_from(nodes_without_circles)

print(f"Number of nodes : {original_G.number_of_nodes()}")
print(f"Number of edges : {original_G.number_of_edges()}")

Number of nodes : 23391
Number of edges : 456557


### Reducing the size of the graph

In [5]:
def random_walk(G, n):
    """Sample a subgraph of ``G`` with ``n`` nodes using a random walk.

    The walk starts on a node of ``G`` chosen uniformly at random and keeps
    jumping to a random neighbor of the current node until ``n`` distinct
    nodes have been visited.

    Parameters
    ----------
    G : graph
        Graph to sample from; only ``G.nodes``, ``G.neighbors`` and
        ``G.subgraph`` are used. Assumed undirected, like the ``nx.Graph``
        built in this notebook.
    n : int
        Number of distinct nodes to collect.

    Returns
    -------
    graph
        An independent copy of the subgraph of ``G`` induced by the visited nodes.

    Raises
    ------
    ValueError
        If ``G`` has no node, or if fewer than ``n`` nodes can be reached from
        the start node (the walk could never finish).
    """
    candidates = list(G.nodes)
    if not candidates:
        raise ValueError("cannot run a random walk on an empty graph")

    # Start from a node of G itself: a key of the global social_circles dict is
    # not guaranteed to be a node of the graph that was passed in.
    node = random.choice(candidates)

    # Explore from the start node, stopping as soon as n nodes are known to be
    # reachable; this guarantees the walk below can terminate.
    reachable = {node}
    frontier = [node]
    while frontier and len(reachable) < n:
        for neighbor in G.neighbors(frontier.pop()):
            if neighbor not in reachable:
                reachable.add(neighbor)
                frontier.append(neighbor)
    if len(reachable) < n:
        raise ValueError(
            f"only {len(reachable)} node(s) reachable from the start node, cannot collect {n}"
        )

    visited = {node}
    while len(visited) < n:
        node = random.choice(list(G.neighbors(node)))
        visited.add(node)

    # Copy only the small sampled subgraph (not the whole graph) so the result
    # does not share state with G.
    return G.subgraph(visited).copy()

In [59]:
# Reduce the graph size, with random walk
N_NODES = 20
# Seed the random module so that the sampled subgraph does not change
# between two executions of the notebook
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

G = random_walk(original_G,N_NODES)

print(f"Number of nodes : {len(G.nodes)}")
print(f"Number of edges : {len(G.edges())}")

Number of nodes : 20
Number of edges : 35


## Implementation

### Girvan-Newman

In [20]:
def compute_gn(G, n_iter):
    """Detect communities with the Girvan-Newman algorithm.

    Repeatedly removes the edge with the highest (unnormalized) edge
    betweenness, recomputing the betweenness after each removal, until the
    number of connected components has grown by ``n_iter``.

    Parameters
    ----------
    G : nx.Graph
        Input graph; it is not modified, the removals happen on a copy.
    n_iter : int
        Number of additional communities to split off.

    Returns
    -------
    dict
        Maps a community index (0, 1, ...) to the set of nodes of that
        community. If the graph runs out of edges before the target is
        reached, the components obtained so far are returned.
    """
    G = G.copy()

    # Start from the real number of components (not 0), so that an input that is
    # already disconnected does not lose an edge it should have kept.
    n_communities = nx.number_connected_components(G)
    target = n_communities + n_iter

    # Stop when no edge is left: max() would otherwise fail on an empty dict.
    while n_communities < target and G.number_of_edges() > 0:
        betweennesses = nx.edge_betweenness_centrality(G, normalized=False)

        edge_to_remove = max(betweennesses, key=betweennesses.get)
        G.remove_edge(edge_to_remove[0], edge_to_remove[1])
        n_communities = nx.number_connected_components(G)

    return dict(enumerate(nx.connected_components(G)))

### Cosine similarity

In [8]:
def cosine_sim(vi_neighbors, vj_neighbors):
    """Cosine similarity between two neighbor sets.

    Computed as ``|Ni ∩ Nj| / sqrt(|Ni| * |Nj|)``.

    Parameters
    ----------
    vi_neighbors, vj_neighbors : set
        Neighbors of the two nodes being compared.

    Returns
    -------
    float
        The similarity, or 0.0 when either set is empty (nothing can be
        shared, and the formula would otherwise divide by zero).
    """
    if not vi_neighbors or not vj_neighbors:
        return 0.0

    n_common = len(vi_neighbors.intersection(vj_neighbors))
    return n_common / sqrt(len(vi_neighbors) * len(vj_neighbors))

def compute_cosine_sim(G, selected_nodes):
    """Cosine similarity between each selected node and each of its neighbors.

    Parameters
    ----------
    G : mapping from a node to its neighbors (``G[node]``)
    selected_nodes : iterable of nodes of ``G``

    Returns
    -------
    dict
        ``{(node, neighbor): similarity}`` for every neighbor of every
        selected node.
    """
    # TODO : optimize not to compute multiple times the same product maybe triangular matrix
    # the neighbor set of each selected node is built once and reused for all of its pairs
    sources = ((source, set(G[source])) for source in selected_nodes)
    return {
        (source, target): cosine_sim(source_neighbors, set(G[target]))
        for source, source_neighbors in sources
        for target in source_neighbors
    }

### Adamic-Adar similarity

In [9]:
def adamic_adar_sim(G, vi_neighbors, vj_neighbors):
    """Adamic-Adar similarity between two neighbor sets.

    Sum of ``1 / log(degree)`` over the neighbors common to both sets, where
    the degree of a node is ``len(G[node])``.

    Parameters
    ----------
    G : mapping from a node to its neighbors (``G[node]``)
    vi_neighbors, vj_neighbors : set
        Neighbors of the two nodes being compared.

    Returns
    -------
    The similarity; 0 when no common neighbor contributes.
    """
    common_neighbors = vi_neighbors.intersection(vj_neighbors)
    degrees = (len(G[neighbor]) for neighbor in common_neighbors)

    # log(1) == 0: a common neighbor of degree 1 (e.g. a node whose only edge is
    # a self-loop) would divide by zero, so it is left out of the sum
    return sum(1 / log(degree) for degree in degrees if degree > 1)
        

def compute_adamic_adar_sim(G, selected_nodes):
    """Adamic-Adar similarity between each selected node and each of its neighbors.

    Parameters
    ----------
    G : mapping from a node to its neighbors (``G[node]``)
    selected_nodes : iterable of nodes of ``G``

    Returns
    -------
    dict
        ``{(node, neighbor): similarity}`` for every neighbor of every
        selected node.
    """
    # TODO : optimize not to compute multiple times the same product maybe triangular matrix
    # the neighbor set of each selected node is built once and reused for all of its pairs
    sources = ((source, set(G[source])) for source in selected_nodes)
    return {
        (source, target): adamic_adar_sim(G, source_neighbors, set(G[target]))
        for source, source_neighbors in sources
        for target in source_neighbors
    }

## Analysis

### Compute the clusters and evaluate them at different iteration levels

In [21]:
# Value passed to compute_gn as n_iter (also reused by the networkx comparison cell)
iterations=4
# dict mapping a community index to the set of nodes of that community
communities=compute_gn(G,iterations)
print(communities)

{0: {'250340951', '262764726', '182455769', '17654192', '376946114', '270449528', '195475105', '302847930'}, 1: {'94480069', '351092905', '294198566', '254457417', '152065057', '113298003'}, 2: {'15099384', '19636959', '26202686'}, 3: {'22464533'}, 4: {'19479431', '19637934'}}


In [11]:
# to compare with method from networkx
from networkx.algorithms.community.centrality import girvan_newman

# comp is consumed with next(): each call yields the communities of the next level
comp = girvan_newman(G)

# skip the first iterations-1 results so that the next one matches
# the level computed by compute_gn(G, iterations)
for i in range(iterations-1):
    next(comp)
# print one community per line
for com in next(comp):
    print(com)

{'250340951', '262764726', '182455769', '17654192', '376946114', '270449528', '195475105', '302847930'}
{'94480069', '351092905', '294198566', '254457417', '152065057', '113298003'}
{'15099384', '19636959', '26202686'}
{'22464533'}
{'19479431', '19637934'}


## Find the top K users

In [12]:
k = 10

if k > len(G):
    print(f"Warning : K chosen : {k} is higher than the number of nodes in the graph : {len(G)}\n")

nodes_degrees = dict(G.degree())
# normalize the node degrees using the max node degree
# default=0 covers a graph without nodes; a max degree of 0 (graph without edges)
# gives every node a normalized degree of 0.0 instead of dividing by zero
max_deg = max(nodes_degrees.values(), default=0)
nodes_degrees = {
    node: (deg / float(max_deg) if max_deg else 0.0)
    for node, deg in nodes_degrees.items()
}

# sort the node:degree dictionary by decreasing normalized degree
# (the sort is stable: nodes with equal degree keep their relative order)
nodes_degrees = dict(sorted(nodes_degrees.items(), key=lambda item: item[1], reverse=True))

# the first k keys of the sorted dictionary are the k nodes with the highest degree
top_nodes = list(nodes_degrees)[:k]
top_nodes_dict = {node: nodes_degrees[node] for node in top_nodes}
print(f"{k} nodes with highest degree : {top_nodes_dict}")

10 nodes with highest degree : {'270449528': 1.0, '182455769': 0.8571428571428571, '302847930': 0.8571428571428571, '17654192': 0.7142857142857143, '195475105': 0.7142857142857143, '94480069': 0.5714285714285714, '294198566': 0.5714285714285714, '376946114': 0.5714285714285714, '250340951': 0.5714285714285714, '254457417': 0.5714285714285714}


### Find the most similar nodes

In [13]:
# label -> function taking (G, nodes) and returning {(node, neighbor): similarity}
similarities_tested = dict([
    ('cosine', compute_cosine_sim),
    ('adamic-adar', compute_adamic_adar_sim),
])

for similarity_label in similarities_tested:
    similarity_func = similarities_tested[similarity_label]
    top_nodes_sims = similarity_func(G, top_nodes)
    # pair with the highest similarity (the first one encountered in case of a tie)
    most_similar_pair = max(top_nodes_sims, key=lambda pair: top_nodes_sims[pair])

    print(f"Most similar nodes using function : {similarity_label} are {most_similar_pair} with similarity value : {top_nodes_sims[most_similar_pair]:.3f}")

Most similar nodes using function : cosine are ('270449528', '302847930') with similarity value : 0.772
Most similar nodes using function : adamic-adar are ('270449528', '302847930') with similarity value : 3.243


## Visualization