# Community graph

## Imports

In [150]:
import os

import pandas as pd
import numpy as np

from tqdm import tqdm

import seaborn
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import networkx as nx
import community.community_louvain as community_louvain

## Reading data

In [151]:
# Load the tweets and retweets tables from the `data` folder.
df_t, df_rtt = (
    pd.read_csv(os.path.join('data', file_name))
    for file_name in ('clean_tweets.csv', 'clean_retweets.csv')
)

In [152]:
# Drop the leftover index columns. `errors='ignore'` keeps the cell idempotent:
# re-running it (or loading a file that lacks one of the columns) no longer
# raises a KeyError the way `del df[...]` does.
df_t = df_t.drop(columns=['Unnamed: 0', 'index'], errors='ignore')
df_rtt = df_rtt.drop(columns=['Unnamed: 0', 'index'], errors='ignore')

In [153]:
# Show the columns of the retweets table.
df_rtt.columns

Index(['tweet_id', 'retweet_author_id', 'retweet_author_screen_name',
       'retweet_date'],
      dtype='object')

In [154]:
# Show the columns of the tweets table.
df_t.columns

Index(['author_id', 'author_screen_name', 'status_id', 'created_at', 'body',
       'lang', 'favorite_count', 'retweet_count'],
      dtype='object')

In [155]:
def remove_not_connected(G):
    """
    Keeps only the largest connected component of the graph and returns a copy of it.

    A short summary (number of components, sizes of the two largest) is printed.
    When several components share the largest size, the first one yielded by
    networkx is kept.

    Parameters
    ----------
        G : networkx.Graph
            the graph

    Returns
    -------
        G : networkx.Graph
            a copy of the subgraph induced by the largest connected component
            (an empty copy of G if G has no nodes)
    """
    # Enumerate the components once; the sort is stable, so among equally
    # large components the first one found stays in front.
    components = sorted(nx.connected_components(G), key = len, reverse = True)

    if not components:
        # An empty graph has no component to report or to extract.
        print("There are 0 connected components.")
        return G.copy()

    sizes = [len(component) for component in components]
    # A connected graph has no second component; report its size as 0
    # instead of failing on a missing list element.
    second_size = sizes[1] if len(sizes) > 1 else 0

    print(r"""There are {} connected components. The largest has {} nodes and the second {}""". format(len(components), sizes[0], second_size))

    return G.subgraph(components[0]).copy()

def generate_subgraphs(G):
    """
    Splits the given graph into its Louvain communities.

    The number of communities found is printed.

    Parameters
    ----------
        G : networkx.Graph
            the graph

    Returns
    -------
        subgraphs : list of networkx.Graph
            one independent copy of the induced subgraph per community,
            positioned in the list at the community id
    """
    node_to_community = community_louvain.best_partition(G)
    community_count = len(set(node_to_community.values()))
    print('Number of partition from Louvain: {}'.format(community_count))

    # One node set per community. The community id is used directly as the
    # list position, which assumes ids are the integers 0..community_count-1.
    members = [set() for _ in range(community_count)]
    for node, community_id in node_to_community.items():
        members[community_id].add(node)

    return [G.subgraph(nodes).copy() for nodes in members]

def save_graphs(graphs, path = 'graphs', threshold = 20):
    """
    Saves the graphs in a folder given by the path. The files are of the form Gx_y.gexf,
    where x is the position of the graph in `graphs` and y its number of nodes.

    warning: every file already present in the folder given by the path argument
    is deleted first (sub-folders are left untouched)

    Parameters
    ----------
        graphs : list of networkx.Graph
            the graphs to save

        path : str
            The folder in which to store the graphs; created if missing

        threshold : int
            only graphs with strictly more than `threshold` nodes are written

    """
    # makedirs also creates missing parent folders and tolerates an existing one.
    os.makedirs(path, exist_ok = True)

    # Clear stale exports. Only files and links are removed, because os.remove
    # raises on directories.
    for f in os.listdir(path):
        stale = os.path.join(path, f)
        if os.path.isfile(stale) or os.path.islink(stale):
            os.remove(stale)

    for (i, subgraph) in enumerate(graphs):
        N = len(subgraph)
        if N > threshold:
            # Write the graph being iterated over, into the requested folder
            # (not a global list, and not a hard-coded 'graphs' directory).
            nx.write_gexf(subgraph, os.path.join(path, 'G{}_{}.gexf'.format(i, N)))

In [156]:
## collect every user id that authored a tweet or a retweet
author_ids = np.unique(df_t['author_id'])
authors_rtt_ids = np.unique(df_rtt['retweet_author_id'])

# union1d returns the sorted, de-duplicated union of both id arrays
authors = np.union1d(author_ids, authors_rtt_ids)
print(len(authors))

41976


In [157]:
## retweet authors with more than 50 retweets keep their screen names
l2 = df_rtt.groupby('retweet_author_id').count()['tweet_id'] > 50
# Take the ids from the mask's own index rather than positionally indexing a
# separately computed id array, so ids and counts can never be misaligned.
filtered_rtt_ids = l2[l2].index.values
print(len(filtered_rtt_ids))

## tweet authors with more than 10 tweets keep their screen names
l1 = df_t.groupby(['author_id']).agg(len)['status_id'] > 10
# The original indexed with `l`, a name that is never defined in this notebook
# and therefore fails on a fresh kernel; the mask for tweets is `l1`.
filtered_t_ids = l1[l1].index.values
print(len(filtered_t_ids))

438
229


In [158]:
# Sorted, de-duplicated union of the active tweet authors and retweet authors.
filtered_ids = np.union1d(filtered_t_ids, filtered_rtt_ids)
print(len(filtered_ids))

602


In [159]:
# Map each filtered user id to a screen name.
# Build one id -> screen name lookup per table up front instead of re-scanning
# both frames for every id. drop_duplicates keeps the first row per id, which
# matches taking the first matching row of the table.
first_tweets = df_t.drop_duplicates('author_id')
tweet_names = dict(zip(first_tweets['author_id'], first_tweets['author_screen_name']))

first_retweets = df_rtt.drop_duplicates('retweet_author_id')
retweet_names = dict(zip(first_retweets['retweet_author_id'],
                         first_retweets['retweet_author_screen_name']))

# The name from the tweets table takes precedence; the retweets table is the fallback.
d = {
    id_: tweet_names[id_] if id_ in tweet_names else retweet_names[id_]
    for id_ in filtered_ids
}

In [160]:
G = nx.Graph()

## one node per author; ids without an entry in d get an empty screen name
for author in authors:
    G.add_node(author, screen_name = d.get(author, ''))
    

In [161]:
# Connect each tweet author to everyone who retweeted that tweet.
# A single merge on the tweet id replaces filtering the whole retweets table
# once per tweet, and no longer relies on df_t having a default 0..n-1 index.
# The inner merge keeps the order of df_t rows, then of their matching retweets.
retweet_edges = df_t[['status_id', 'author_id']].merge(
    df_rtt[['tweet_id', 'retweet_author_id']],
    left_on = 'status_id',
    right_on = 'tweet_id',
)

G.add_edges_from(
    zip(retweet_edges['author_id'], retweet_edges['retweet_author_id'])
)

In [162]:
# Keep only the largest connected component; the helper also prints component statistics.
G = remove_not_connected(G)

There are 1006 connected components. The largest has 40947 nodes and the second 7


In [163]:
# Split the graph into one subgraph per Louvain community.
subgraphs = generate_subgraphs(G)

Number of partition from Louvain: 15


In [165]:
# Export the whole graph alongside its communities.
# Guarded by identity so that re-running this cell does not append G twice.
if not any(g is G for g in subgraphs):
    subgraphs.append(G)

In [166]:
# Write the graphs as GEXF files; note that save_graphs first empties the target folder.
save_graphs(subgraphs)