In [8]:
# Standard library
import itertools
import random
import warnings
from collections import Counter, defaultdict
from operator import itemgetter

# Third-party
import community as cd
import igraph as ig
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import powerlaw
import seaborn as sns

# Plot configuration.
# The style sheet is applied before the explicit overrides: 'ggplot' sets
# font.size itself, so calling it afterwards would discard the value chosen below.
plt.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = (15.0, 5.0)
matplotlib.rcParams.update({'font.size': 18})

# Ignore two specific warning messages from the powerlaw package. 
# Comment these lines to show warnings
warnings.filterwarnings("ignore", message="invalid value encountered in true_divide")
warnings.filterwarnings("ignore", message="divide by zero encountered in true_divide")

In [9]:
# Read the network as a directed graph from an edge list.
DF = pd.read_csv('clean_graph.csv', delimiter=",")

# Remove big subreddits that are in several communities: drop every edge that
# starts or ends at one of them.
hub_subreddits = ['askreddit', 'iama']
DF = DF[~DF.SOURCE_SUBREDDIT.isin(hub_subreddits) & ~DF.TARGET_SUBREDDIT.isin(hub_subreddits)]

# Load into nx
G = nx.from_pandas_edgelist(
    DF,
    source="SOURCE_SUBREDDIT",
    target="TARGET_SUBREDDIT",
    edge_attr="LINK_SENTIMENT",
    create_using=nx.DiGraph,
)

# Print a summary of the network.
# nx.info() was removed in NetworkX 3.0, so the same figures are printed directly.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
# In a directed graph every edge contributes one in-degree and one out-degree,
# so both averages equal edges / nodes. Guard against an empty graph.
avg_degree = n_edges / n_nodes if n_nodes else 0.0
print("Name: %s" % G.name)
print("Type: %s" % type(G).__name__)
print("Number of nodes: %d" % n_nodes)
print("Number of edges: %d" % n_edges)
print("Average in degree: %8.4f" % avg_degree)
print("Average out degree: %8.4f" % avg_degree)

Name: 
Type: DiGraph
Number of nodes: 35215
Number of edges: 131250
Average in degree:   3.7271
Average out degree:   3.7271


In [10]:
# Build a subreddit-name -> integer-id lookup (and its inverse) so the edge
# list can be loaded into igraph with integer vertex names.
nodes = set(DF.SOURCE_SUBREDDIT.unique()) | set(DF.TARGET_SUBREDDIT.unique())
nodes_dict = {subreddit: number for number, subreddit in enumerate(nodes)}
label_dict = {number: subreddit for subreddit, number in nodes_dict.items()}

In [11]:
# First 60000 rows: swap every value that is a key of nodes_dict for its integer id.
DF_NUMS = DF.iloc[:60000].replace(nodes_dict)

In [12]:
# Remaining rows (from position 60000 on): same name -> integer-id replacement.
DF_NUMS2 = DF.iloc[60000:].replace(nodes_dict)

In [13]:
# Stack the two converted chunks back into a single edge table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# row order and index labels are kept as they are.
DF_ALL = pd.concat([DF_NUMS, DF_NUMS2])

In [15]:
# Loading into igraph
# NOTE(review): TupleList reads the first two fields of each row as the edge
# endpoints and stores the third as the "weight" edge attribute — confirm the
# column order of clean_graph.csv matches that.
igall = ig.Graph.TupleList(DF_ALL.itertuples(index=False), directed=True, edge_attrs="weight")
# Getting the largest strongly connected component
bc = igall.components().giant()
# Community detection with Infomap.
# The fast greedy algorithm (which merges individual nodes into communities in
# a way that greedily maximizes the modularity score of the graph) looked like
# a really good fit for this relatively large network, but it has no support
# for directed graphs.
#
# Infomap is stochastic. python-igraph draws its random numbers from Python's
# `random` module unless another generator has been installed, so seeding it
# here keeps the partition reproducible across runs.
random.seed(42)
allvertex = bc.community_infomap(trials=10)

In [16]:
# Helper that draws one graph and writes the picture to a PNG file
def plotcommunity(graph, name):
    """Draw `graph` with a Fruchterman-Reingold layout and save it as `<name>.png`.

    Each vertex is labelled by looking up its "name" attribute in the
    module-level `label_dict`.

    Parameters
    ----------
    graph
        Graph object providing `layout_fruchterman_reingold()` and `vs["name"]`.
    name : str
        Output file path without the ".png" extension.
    """
    style = {
        "layout": graph.layout_fruchterman_reingold(),
        "bbox": (5000, 5000),
        "margin": 100,
        "vertex_label": [label_dict[vertex_name] for vertex_name in graph.vs["name"]],
    }
    ig.plot(graph, **style).save(name + '.png')

In [17]:
# Second, third and fourth largest Infomap communities.
ss = allvertex.sizes()
# Rank the community indices from largest to smallest. Ranking indices (rather
# than looking a size up with ss.index) keeps communities of equal size
# distinct: ss.index always returns the first community having that size.
size_order = sorted(range(len(ss)), key=lambda idx: ss[idx], reverse=True)
second = allvertex.subgraph(size_order[1])
third = allvertex.subgraph(size_order[2])
fourth = allvertex.subgraph(size_order[3])

In [19]:
# Fifth and sixth largest Infomap communities.
# Rank the community indices from largest to smallest. Ranking indices (rather
# than looking a size up with ss.index) keeps communities of equal size
# distinct: ss.index always returns the first community having that size.
size_order = sorted(range(len(ss)), key=lambda idx: ss[idx], reverse=True)
fifth = allvertex.subgraph(size_order[4])
sixth = allvertex.subgraph(size_order[5])
plotcommunity(fifth, "A5th")
plotcommunity(sixth, "A6th")

In [18]:
# Plots: one PNG per community graph, named by rank
for _graph, _stem in (
    (allvertex.giant(), "A1st"),
    (second, "A2nd"),
    (third, "A3rd"),
    (fourth, "A4th"),
):
    plotcommunity(_graph, _stem)