Social Media Analytics Project 8 - Community Detection in a Twitter Network
===
Goloviatinski Sergiy, Herbelin Ludovic <br />
MCS 2020

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import networkx as nx
from math import sqrt, log
from tqdm.notebook import trange, tqdm

## Dataset loading

In [2]:
# Edge-list file of the Twitter network; parsed with nx.read_edgelist in the loading cell.
DATA_COMBINED_PATH = 'data/twitter_combined.txt'
# Presumably the directory holding the other dataset files -- not referenced in the cells shown here.
DATA_OTHERS = 'data/twitter/'

In [3]:
# Target graph. nx.Graph() is undirected, so each edge added below is stored without direction.
G = nx.Graph()

# Parse the edge-list file into a temporary graph object.
edges = nx.read_edgelist(DATA_COMBINED_PATH)

# Copy every parsed edge into G.
# NOTE(review): `edges` is already a graph, so this copy looks redundant. However, it may
# change the node insertion order that the later `list(G)[:N_NODES]` slice depends on --
# confirm before simplifying to a single read_edgelist call.
G.add_edges_from(edges.edges())

# Report the size of the full graph.
print(f"Number of nodes : {len(G.nodes)}")
print(f"Number of edges : {len(G.edges())}")

Number of nodes : 81306
Number of edges : 1342310


### Reducing the size of the graph

In [4]:
# Reduce the graph size by keeping an arbitrary subset of nodes (temporary solution).
# N_NODES is the number of nodes kept.
N_NODES = 200

# Keep the first N_NODES nodes in G's iteration order.
# NOTE(review): G is rebound here, so the full graph is no longer reachable through this
# name once the cell has run. Per the networkx documentation, subgraph() returns a
# read-only view of the induced subgraph rather than an independent copy -- confirm that
# this is what the later cells expect.
G = G.subgraph(list(G)[:N_NODES])

# Report the size of the reduced graph.
print(f"Number of nodes : {len(G.nodes)}")
print(f"Number of edges : {len(G.edges())}")

Number of nodes : 200
Number of edges : 9291


## Implementation

### Girvan-Newman

In [5]:
from networkx.algorithms.community.centrality import girvan_newman

def compute_gn(G, n_iter):
    """Run up to `n_iter` levels of the Girvan-Newman algorithm on `G`.

    `girvan_newman` returns a lazy iterator; each call to `next` performs one
    more level of edge removal and yields the resulting community partition
    (per the networkx documentation, a tuple of sets of nodes).

    Args:
        G: networkx graph to partition.
        n_iter: maximum number of levels to compute.

    Returns:
        The partition yielded at the last level reached, or None when no level
        was computed (`n_iter` is 0 or the iterator yields nothing).
    """
    comp = girvan_newman(G)

    communities = None
    for _ in trange(n_iter):
        try:
            # Each step is expensive: it advances the algorithm by one level.
            communities = next(comp)
        except StopIteration:
            # The iterator is exhausted: no further split is available.
            break

    return communities

### Cosine similarity

In [6]:
def cosine_sim(vi_neighbors, vj_neighbors):
    """Cosine similarity of two nodes, computed from their neighbor sets.

    The value is |N(vi) & N(vj)| / sqrt(|N(vi)| * |N(vj)|).

    Args:
        vi_neighbors: set of neighbors of the first node.
        vj_neighbors: set of neighbors of the second node.

    Returns:
        The similarity as a float. 0.0 is returned when either set is empty,
        because the two nodes then share no neighbor and the ratio would
        otherwise divide by zero.
    """
    denominator = sqrt(len(vi_neighbors) * len(vj_neighbors))
    if denominator == 0:
        return 0.0

    return len(vi_neighbors.intersection(vj_neighbors)) / denominator

def compute_cosine_sim(G, selected_nodes):
    """Score every (node, neighbor) pair of the selected nodes with cosine_sim.

    Args:
        G: graph supporting `G[node]` lookups that iterate over the neighbors of `node`.
        selected_nodes: iterable of nodes whose incident edges are scored.

    Returns:
        dict mapping `(node, neighbor)` to the cosine similarity of the two
        neighbor sets, for each selected node and each of its neighbors.
    """
    similarity_by_pair = {}

    # TODO: when two selected nodes are adjacent, the pair is scored twice
    # (once as (a, b) and once as (b, a)); compute it only once.
    for source in selected_nodes:
        source_neighbors = set(G[source])
        similarity_by_pair.update(
            ((source, target), cosine_sim(source_neighbors, set(G[target])))
            for target in source_neighbors
        )

    return similarity_by_pair

### Adamic-Adar similarity

In [11]:
def adamic_adar_sim(G, vi_neighbors, vj_neighbors):
    """Adamic-Adar similarity of two nodes, computed from their neighbor sets.

    The value is the sum, over every neighbor common to both nodes, of
    1 / log(number of neighbors of that common neighbor).

    Args:
        G: graph supporting `G[node]` lookups whose length is the number of
            neighbors of `node`.
        vi_neighbors: set of neighbors of the first node.
        vj_neighbors: set of neighbors of the second node.

    Returns:
        The similarity score; 0 when no common neighbor contributes.
    """
    common_neighbors = vi_neighbors.intersection(vj_neighbors)

    degrees = (len(G[neighbor]) for neighbor in common_neighbors)

    # A common neighbor with a single neighbor has log(1) == 0 and would raise
    # ZeroDivisionError. This can happen when self-loops are present, so such
    # neighbors are skipped.
    return sum(1 / log(degree) for degree in degrees if degree > 1)
        

def compute_adamic_adar_sim(G, selected_nodes):
    """Score every (node, neighbor) pair of the selected nodes with adamic_adar_sim.

    Args:
        G: graph supporting `G[node]` lookups that iterate over the neighbors of `node`.
        selected_nodes: iterable of nodes whose incident edges are scored.

    Returns:
        dict mapping `(node, neighbor)` to the Adamic-Adar similarity of the two
        nodes, for each selected node and each of its neighbors.
    """
    scores = {}

    # TODO: when two selected nodes are adjacent, the pair is scored twice
    # (once as (a, b) and once as (b, a)); compute it only once.
    for source in selected_nodes:
        source_neighbors = set(G[source])
        for target in source_neighbors:
            scores[source, target] = adamic_adar_sim(G, source_neighbors, set(G[target]))

    return scores

## Analysis

### Compute the clusters

### Find the top K users

In [12]:
# Number of users to select.
k = 10

# NOTE(review): this takes the first k nodes in G's iteration order. No ranking
# (e.g. by degree) is applied, so the selection is arbitrary rather than a true "top k".
top_nodes = list(G.nodes)[:k]

### Find the most similar nodes

In [13]:
# Similarity measures to compare, keyed by the label used in the printed report.
similarities_tested = {
    'cosine':compute_cosine_sim,
    'adamic-adar':compute_adamic_adar_sim,
}


for similarity_label, similarity_func in similarities_tested.items():
    # Maps (node, neighbor) -> similarity for each selected node and each of its neighbors.
    top_nodes_sims = similarity_func(G, top_nodes)

    # max() raises ValueError on an empty mapping, which happens when none of
    # the selected nodes has a neighbor in G; report that case and move on.
    if not top_nodes_sims:
        print(f"No neighboring pairs found using function : {similarity_label}")
        continue

    # Key of the highest-scoring pair; ties resolve to the first one inserted.
    most_similar_pair = max(top_nodes_sims, key=top_nodes_sims.get)

    print(f"Most similar nodes using function : {similarity_label} are {most_similar_pair} with similarity value : {top_nodes_sims[most_similar_pair]:.3f}")

Most similar nodes using function : cosine are ('259842341', '358775055') with similarity value : 0.877 .
Most similar nodes using function : adamic-adar are ('259842341', '214328887') with similarity value : 32.373 .


## Visualization