In [1]:
from modules.dataset.entities import Entities
from modules.dataset.tweets import Tweets
from modules.network import HashNet

import networkx as nx
import numpy as np
import community

## Create network

In [2]:
# Input datasets (JSON files, given as relative paths)
TWEETS_DB_PATH = "data/db/tweets.json"      # tweets dataset
HASHTAGS_DB_PATH = "data/db/hashtags.json"  # hashtags dataset

In [3]:
# Networks container: one slot per yearly period, empty (None) until filled
networks = dict.fromkeys((2017, 2018, 2019))

In [4]:
# Import tweets dataset: create an empty Tweets container and fill it from the
# JSON file at TWEETS_DB_PATH
tweets = Tweets()
tweets.from_json(TWEETS_DB_PATH)
# Last expression of the cell: preview of the loaded table held in tweets.df
tweets.df.head()

Unnamed: 0,tweet_id,tweet_date,tweet_text
0,836950901495631872,2017-03-01 14:46:59,TT SINGAPORE 22:46\n1.Hong Kong\n2.#JointAddre...
1,836950882528989184,2017-03-01 14:46:54,Letting #snapchat prepare me for the day's uns...
2,836950869639835649,2017-03-01 14:46:51,"""The bill would require the state to get all o..."
3,836950847380668416,2017-03-01 14:46:46,Style-Lead don't Follow #recycledfashion https...
4,836950839101116421,2017-03-01 14:46:44,‘Shell knew’: oil giant's 1991 film warned of ...


In [5]:
# Import hashtags dataset: create an empty Entities container and fill it from
# the JSON file at HASHTAGS_DB_PATH
hashtags = Entities()
hashtags.from_json(HASHTAGS_DB_PATH)
# Last expression of the cell: preview of the loaded table held in hashtags.df
hashtags.df.head()

Unnamed: 0,tweet_id,entity_index,entity_text,entity_tag,entity_conf
0,1101574442575167489,48,#ClimateChange,#,0.9615
1,1101574442575167489,49,#Science,#,0.9843
2,1101574446341607424,10,#climatechange,^,0.4659
3,1101574446341607424,26,#climateemergency,#,0.2876
4,1101574471247380480,16,#climatechange,N,0.5529


In [6]:
# Hashtags used as search seeds; they are removed from every network below
seed_list = ["#climatechange", "#climate", "#sdgs", "#sustainability", "#environment", "#globalwarming"]

for period in networks:
    # Tweets whose date falls in the current year
    curr_tweets = Tweets()
    in_period = tweets.df.tweet_date.dt.year == period
    curr_tweets.df = tweets.df.loc[in_period].copy()
    curr_tweets_id = curr_tweets.df.tweet_id.unique()

    # Hashtags attached to those tweets
    curr_hashtags = Entities()
    in_curr_tweets = hashtags.df.tweet_id.isin(curr_tweets_id)
    curr_hashtags.df = hashtags.df.loc[in_curr_tweets].copy()

    # Lowercase first, so the comparison with the (lowercase) seeds ignores case
    curr_hashtags.df["entity_text"] = curr_hashtags.df["entity_text"].str.lower()
    is_seed = curr_hashtags.df.entity_text.isin(seed_list)
    curr_hashtags.df = curr_hashtags.df.loc[~is_seed]

    # Store the hashtags network built for this period
    networks[period] = HashNet.from_entities(curr_hashtags)

    # Drop the per-period temporaries
    del curr_hashtags, curr_tweets, in_period, in_curr_tweets, is_seed

In [7]:
# For every period, report whether the network is connected and, if not,
# how much of it lies in the first (largest) connected component
for period in networks:
    net_type = "hashtags"
    # Connected components; the comment in the original cell states they come sorted by size
    cc = networks[period].get_connected_components()

    if len(cc) == 1:
        print("{} {} network is connected".format(period, net_type))
        continue

    # Size of the first component over the summed size of all components,
    # expressed as a truncated integer percentage
    gc_ratio = int(100 * cc[0]["size"] / sum(component["size"] for component in cc))
    print("{} {} network consists in {} connected components".format(period, net_type, len(cc)))
    print("The largest cc corresponds to {}% of total".format(gc_ratio))

2017 hashtags network consists in 217 connected components
The largest cc corresponds to 77% of total
2018 hashtags network consists in 207 connected components
The largest cc corresponds to 86% of total
2019 hashtags network consists in 189 connected components
The largest cc corresponds to 88% of total


In [8]:
# Replace every period's network with its projection on the largest connected component
for period in networks:
    # The first component returned is taken as the largest one
    components = networks[period].get_connected_components()
    lcc = components[0]["component"]
    networks[period] = networks[period].project_component(lcc)

## Community detection

In [9]:
def getCommunitiesLouvain(network, resolution=1.0, threshold=100, random_state=100):
    """
    Detect communities with community.best_partition and filter out those
    with less than threshold hashtags.

    args:
        network: graph handed to community.best_partition; edge weights are
            read from the 'weight' attribute
        resolution: resolution value forwarded to best_partition
        threshold: minimum number of hashtags a community needs to be kept
            (a community with exactly threshold hashtags is kept)
        random_state: seed forwarded to best_partition; fixed by default so
            that repeated runs give the same partition
    return:
        communities: map community_id -> list of hashtags, only for the
            communities that passed the size filter
        partition: map hashtag -> community_id for every node (unfiltered)
    """
    # compute best partition (fixed random state for reproducibility)
    partition = community.best_partition(
        graph=network, weight='weight', resolution=resolution, random_state=random_state
    )
    # the number of distinct community ids is an integer count: print it as such
    print('There are {} communities'.format(len(set(partition.values()))))

    # partition is a dictionary in the form {'hashtag': community_id, ...}
    # we want to transform it in the form {community_id: [hashtag1, hashtag2, ...]}
    communities = {}
    for hashtag, community_id in partition.items():
        communities.setdefault(community_id, []).append(hashtag)

    # delete small communities (size(community) < threshold)
    communities = {k: v for k, v in communities.items() if len(v) >= threshold}
    print('-> {} communities remaining after filtering'.format(len(communities)))
    return communities, partition

In [11]:
# Detection results per period, filled by the loop below
communities = dict.fromkeys((2017, 2018, 2019))

# Louvain settings used for each period
parameters = {
    2017: {'resolution': 1.0, 'threshold': 100},
    2018: {'resolution': 0.9, 'threshold': 100},
    2019: {'resolution': 0.9, 'threshold': 100},
}

for period in networks:
    print("--- Year: {} ---".format(period))
    # The settings' keys match the keyword arguments of getCommunitiesLouvain
    comm, partitions = getCommunitiesLouvain(networks[period].net, **parameters[period])
    communities[period] = {'communities': comm, 'partitions': partitions}
    print()

--- Year: 2017 ---
There are 30.0 communities
-> 6 communities remaining after filtering

--- Year: 2018 ---
There are 32.0 communities
-> 19 communities remaining after filtering

--- Year: 2019 ---
There are 33.0 communities
-> 19 communities remaining after filtering



In [12]:
def sort_dict(d, descending=True):
    """
    Sort a dictionary by its values.

    args:
        d: dictionary whose values can be compared with each other
        descending: if True (default) the largest values come first,
            otherwise the smallest values come first
    return:
        a new dictionary with the same items as d, inserted in sorted order;
        items with equal values keep their original relative order
    """
    # honor the `descending` flag instead of always sorting in reverse
    return dict(sorted(d.items(), key=lambda item: item[1], reverse=descending))

def _print_top_tags(header, tags, metric, metric_name, showTopK, print_metric):
    """
    Print header, then the showTopK tags with the highest metric value, then a blank line.
    """
    print(header)
    ranked = sort_dict({tag: metric[tag] for tag in tags}, descending=True)
    for tag in list(ranked)[:showTopK]:
        if print_metric:
            print(tag, ',{}: {:.4f}'.format(metric_name, metric[tag]))
        else:
            print(tag)
    print()


def printCommunitiElemnts(
    network, communities, num_communities=5,
    community_id=None, metric_name='centrality',
    showTopK=5, print_metric=True
):
    """
    Print most important nodes of each community based on certain metric

    args:
        network: graph on which the metric is computed
        communities: map community_id -> list of hashtags
        num_communities: stop after printing this many communities
            (only used when community_id is None)
        community_id: if given, print only this community; if it is not a key
            of communities, a notice is printed and nothing else happens
        metric_name: 'centrality' (nx.degree_centrality) or
            'degree' (nx.degree with weight='weight')
        showTopK: number of hashtags printed per community
        print_metric: if True, print the metric value next to each hashtag
    raises:
        ValueError: if metric_name is not one of the supported names
    """
    if metric_name == 'centrality':
        metric = nx.degree_centrality(network)
    elif metric_name == 'degree':
        metric = nx.degree(network, weight='weight')
    else:
        # fail early with a clear message instead of hitting an unbound `metric` later
        raise ValueError(
            "Unknown metric_name {!r}: expected 'centrality' or 'degree'".format(metric_name)
        )

    # find most relevant (based on metric) terms for each community
    if community_id is None:
        for counter, (k, v) in enumerate(communities.items(), start=1):
            _print_top_tags(
                '---Community: {}, Size: {}---'.format(k, len(v)),
                v, metric, metric_name, showTopK, print_metric
            )
            # show only num_communities
            if counter == num_communities:
                break
        return

    if community_id not in communities:
        print("Community {} has been discarded (too small)".format(community_id))
        return
    _print_top_tags(
        '---Community: {}---'.format(community_id),
        communities[community_id], metric, metric_name, showTopK, print_metric
    )

In [13]:
# Top 7 hashtags (by degree centrality) of the first 10 communities of the chosen year
year = 2019

printCommunitiElemnts(
    network=networks[year].net,
    communities=communities[year]['communities'],
    community_id=None,
    num_communities=10,
    metric_name='centrality',
    showTopK=7,
    print_metric=False,
)

---Community: 1, Size: 146---
#business
#oil
#manufacturing
#csr
#cars
#electricvehicles
#emissions

---Community: 2, Size: 289---
#energy
#climatestrike
#greennewdeal
#renewables
#solar
#renewableenergy
#actonclimate

---Community: 3, Size: 226---
#sustainable
#recycling
#recycle
#globalgoals
#australia
#eco
#reuse

---Community: 4, Size: 242---
#nature
#pollution
#conservation
#usa
#savetheplanet
#oceans
#trees

---Community: 5, Size: 215---
#health
#ecofriendly
#vegan
#natural
#organic
#thursdaythoughts
#nutrition

---Community: 6, Size: 317---
#innovation
#ai
#technology
#iot
#news
#circulareconomy
#plasticfree

---Community: 7, Size: 217---
#un
#sustainabledevelopment
#india
#peace
#sdg
#nigeria
#unitednations

---Community: 8, Size: 306---
#climateaction
#fridaysforfuture
#climatecrisis
#extinctionrebellion
#climateactionnow
#schoolstrike4climate
#climateemergency

---Community: 9, Size: 150---
#water
#policy
#politics
#us
#california
#drought
#life

---Community: 10, Size: 173--

In [14]:
# Inspect a single community, selected by id: top 10 hashtags by degree centrality
year = 2019

printCommunitiElemnts(
    network=networks[year].net,
    communities=communities[year]['communities'],
    community_id=8,
    num_communities=10,
    metric_name='centrality',
    showTopK=10,
    print_metric=False,
)

---Community: 8---
#climateaction
#fridaysforfuture
#climatecrisis
#extinctionrebellion
#climateactionnow
#schoolstrike4climate
#climateemergency
#gretathunberg
#iwd2019
#fridayforfuture

