In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from modules.dataset.tweets import Tweets
from modules.dataset.entities import Entities

In [11]:
alpha = 0.9

In [2]:
def get_adjacency_matrix(data):
    """
    Input:
        - data: pandas.DataFrame with columns names = ['index_id', 'index_tag']
    Output:
        - numpy.matrix A - Adjacency matrix
    """
    # Create networkx graph object
    graph = nx.from_pandas_edgelist(data, source='index_id', target='index_tag')
    # Extract adjacency matrix
    A = nx.to_numpy_matrix(graph)

    return A

In [32]:
def get_google_matrix(A, e2i, cluster, alpha):
    """
    Input:
        - A       : numpy.matrix of dimension [n_nodes, n_nodes]
        - i2e     : dictionary that associates each node number to its name
        - cluster : list of strings (node names)
        - alpha   : float between 0 and 1 -- Dumping factor (which is 1 - teleport probability)
    Output:
        - numpy.matrix - Google matrix G = alpha A + (1-alpha) C
    """
    # Normalize A (stochastic on columns)
    A /= A.sum(axis=0)
    # Mask of indices in the cluster
    mask = [ e2i[e] for e in cluster ]
    # Compute google matrix
    G = alpha*A
    G[:, mask] += (1-alpha)/len(cluster)

    return G

In [51]:
def power_iteration(G, max_iter: int, tolerance=1e-3):
    """
    Input:
        - G         : squared numpy.matrix -- Google matrix
        - max_iter  : int -- maximum number of iterations
        - tolerance : float -- maximum accepted error
    Output:
        - approximate eigenvector of G (unique if G is a Google matrix)
    """
    # Choose a random vector to decrease the chance that our vector is orthogonal to the eigenvector
    b_k = np.random.rand(G.shape[1])

    for _ in range(max_iter):
        # Calculate the matrix-by-vector product Ab
        b_k1 = G @ np.reshape(b_k, (-1,1))

        # Calculate the norm
        b_k1_norm = np.linalg.norm(b_k1)

        # Re-normalize the vector
        b_k = b_k1 / b_k1_norm

        # If the precision increment is uniformly lower than the tolerance, break
        if np.allclose(b_k, b_k1, atol=tolerance):
            break

    return b_k

In [3]:
# Load communities data
data_path = "data/communities/"
communities = pd.read_csv(data_path+"hashtags_community_selected.csv", header=0)
communities.head()

Unnamed: 0,hashtag,community,year
0,#1,0,2017
1,#geoengineering,0,2017
2,#srm,0,2017
3,#17goals17days,1,2017
4,#development,1,2017


In [4]:
years = set(communities.year.values)
years

{2017, 2018, 2019}

In [5]:
# Load tweet_id - hashtag map
hashtags = Entities()
hashtags.from_json("data/db/hashtags.json")
hashtags.df = hashtags.df[['tweet_id','entity_text']].rename(columns={'entity_text': 'hashtag'})
hashtags.df.hashtag = hashtags.df.hashtag.apply(lambda x: x.lower())
hashtags.df.head()

Unnamed: 0,tweet_id,hashtag
0,1101574442575167489,#humans
1,1101574442575167489,#climatechange
2,1101574446341607424,#climatechange
3,1101574446341607424,#actonclimate
4,1101574446341607424,#climate


In [6]:
# drop rows with search hashtags
seed_list = ["#climatechange", "#climate", "#sdgs", "#sustainability", "#environment", "#globalwarming"]
hashtags.df = hashtags.df[~hashtags.df.hashtag.isin(seed_list)]
hashtags.df.head()

Unnamed: 0,tweet_id,hashtag
0,1101574442575167489,#humans
3,1101574446341607424,#actonclimate
5,1101574446341607424,#energy
6,1101574446341607424,#climatestrike
7,1101574446341607424,#greennewdeal


In [7]:
# Load tweet data
tweets = Tweets()
tweets.from_json("data/db/tweets.json")
tweets.df.head()

Unnamed: 0,tweet_id,tweet_date,tweet_text
0,836950901495631872,2017-03-01 14:46:59,TT SINGAPORE 22:46\n1.Hong Kong\n2.#JointAddre...
1,836950882528989184,2017-03-01 14:46:54,Letting #snapchat prepare me for the day's uns...
2,836950869639835649,2017-03-01 14:46:51,"""The bill would require the state to get all o..."
3,836950847380668416,2017-03-01 14:46:46,Style-Lead don't Follow #recycledfashion https...
4,836950839101116421,2017-03-01 14:46:44,‘Shell knew’: oil giant's 1991 film warned of ...


In [83]:
# Join with tweet_id
for year in [2017]:
    # Select ids of the year
    curr_ids = list(tweets.df.tweet_id[tweets.df.tweet_date.dt.year == year].values)
    # Select communities of the year
    curr_communities = communities[communities.year == year]
    # Select hashtags of interest
    curr_hashtags = hashtags.df[hashtags.df.tweet_id.isin(curr_ids)]
    # Create edges
    data = curr_communities.merge(curr_hashtags, on="hashtag", how="outer")
    # Drop not-in-cluster hashtags
    data = data.loc[~data.isna().any(axis=1)]
    
    # Map entities in index
    nodes = list(data.hashtag.unique())
    nodes.extend(data.tweet_id.unique())
    e2i = dict(zip(nodes, range(len(nodes))))
    # Map index in entities
    i2e = dict(zip(range(len(nodes)), nodes))
    
    # Add indices to data
    data['index_id'] = data.tweet_id.apply(lambda x: e2i[x])
    data['index_tag'] = data.hashtag.apply(lambda x: e2i[x])
    data = data[['index_id', 'index_tag']]
    # Init metrics container for year
    clusters = curr_communities.community.unique()
    community_similarity = pd.DataFrame(columns=clusters, dtype=float)
    
    # Loop through communities
    for cluster in clusters:
        # Compute adjacency matrix
        A = get_adjacency_matrix(data)
        # Compute Google matrix
        G = get_google_matrix(A, e2i, curr_communities.hashtag[curr_communities.community == cluster], alpha)
        # Compute eigenvector
        v = power_iteration(G, 15)
        # Add eigenvector to metrics container
        community_similarity[cluster] = np.array(v).squeeze()

In [84]:
community_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.013550,0.012325,0.012693,0.013573,0.013210,0.012754,0.013237,0.012953,0.012771,0.012922,...,0.012807,0.013973,0.012541,0.013651,0.013129,0.014197,0.012914,0.012983,0.014254,0.011724
1,0.004678,0.004600,0.004555,0.004797,0.004824,0.004764,0.004122,0.004590,0.004578,0.004098,...,0.004847,0.004752,0.004349,0.004185,0.004767,0.004256,0.004646,0.004635,0.004554,0.004406
2,0.015815,0.014848,0.015374,0.015709,0.015351,0.015087,0.015434,0.015273,0.015208,0.015026,...,0.014999,0.016289,0.014466,0.015673,0.015389,0.016676,0.015064,0.015508,0.016561,0.014833
3,0.015606,0.015332,0.015325,0.015853,0.015577,0.015955,0.014124,0.015074,0.015392,0.013884,...,0.015732,0.016038,0.014944,0.014601,0.015303,0.014209,0.015194,0.015460,0.015791,0.015149
4,0.004692,0.004301,0.004430,0.004725,0.004644,0.004425,0.004542,0.004513,0.004437,0.004408,...,0.004519,0.004784,0.004288,0.004627,0.004646,0.004892,0.004509,0.004525,0.004828,0.004102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4614,0.008276,0.008341,0.008286,0.008382,0.009011,0.008068,0.008334,0.008429,0.008484,0.008203,...,0.008368,0.008127,0.008017,0.007891,0.008738,0.008424,0.008603,0.008335,0.007838,0.008791
4615,0.003922,0.003843,0.003729,0.003999,0.004138,0.003836,0.003703,0.003979,0.003802,0.003606,...,0.004107,0.003779,0.003627,0.003521,0.004147,0.003801,0.004036,0.003805,0.003475,0.003477
4616,0.003571,0.003475,0.003560,0.003662,0.003786,0.003510,0.003351,0.003609,0.003495,0.003300,...,0.003725,0.003406,0.003346,0.003339,0.003860,0.003460,0.003650,0.003488,0.003166,0.003293
4617,0.012812,0.012906,0.012957,0.012932,0.012997,0.012590,0.012964,0.012829,0.012767,0.012113,...,0.012858,0.013096,0.012796,0.012407,0.012882,0.012410,0.013198,0.012368,0.011954,0.012058
