# Script to create network communities overlap for different relationships and save core tweets as described in Section 5.3.1.2:

Run all cells to generate the core tweet files for the four overlap types for a single climate event from the CrisisMMD dataset and save them to disk as CSV files.

##### Note: This step requires the network communities to have been created beforehand using the network_community_detection.R script.

## Initialisations:

In [1]:
# Importing Python libraries
import pandas as pd
import numpy as np
import itertools

In [2]:
# Setting paths to required directories on disk (all relative to this notebook's location)

# Directory holding the community files read by read_communities(), one csv per
# event / algorithm / relationship, named '<event>_<algorithm>_isolates_<relationship>.csv'
network_communities_path = '../../Data/Communities'

# Directory holding the datasets created using create_dataset.ipynb;
# the '<event>_final_data.csv' tweets file is read from here
dataset_store_path = '../../Data/TweetCredibilityDatasets'

# Directory into which the '<event>_<overlap>_tweets.csv' core tweet files are written
overlap_analysis_path = '../../Data/Overlaps'

The following is the list of climate event names, as per the files stored in the annotations folder of the CrisisMMD dataset. Set `event_name` in the next cell to specify the dataset for which the community overlaps are to be generated.

1. 'california_wildfires'
2. 'hurricane_harvey'
3. 'hurricane_irma'
4. 'hurricane_maria'
5. 'iraq_iran_earthquake'
6. 'mexico_earthquake'
7. 'srilanka_floods'

In [3]:
# Set the name of the climate event whose communities are to be overlapped.
# It is used to build the names of every file read and written by this notebook.
event_name = 'california_wildfires'

# Names of the community detection algorithms whose output files are read below.
# NOTE(review): this list is not referenced in the cells shown here; the algorithm
# names are passed to read_communities() as literals instead.
community_algorithms = ['louvain', 'infomap', 'walktrap']

## Reading the communities generated by the three community detection algorithms:

In [4]:
# Method to load the communities file of one network from disk.
def read_communities(algorithm, relationship):
    """Read the communities detected by one algorithm on one relationship network.

    The file read is
    '<network_communities_path>/<event_name>_<algorithm>_isolates_<relationship>.csv'.
    Its header row is replaced by the single column name 'cluster' and its
    first column is used as the row index.

    Parameters
    ----------
    algorithm : str
        Community detection algorithm name used in the file name, e.g. 'louvain'.
    relationship : str
        Network relationship name used in the file name, e.g. 'author'.

    Returns
    -------
    numpy.ndarray
        The values of the loaded DataFrame (index excluded).
    """
    communities_file = f'{network_communities_path}/{event_name}_{algorithm}_isolates_{relationship}.csv'
    communities_frame = pd.read_csv(communities_file, names=['cluster'], header=0, index_col=0)
    return communities_frame.values

In [5]:
# Loading the louvain communities of each of the four relationship networks.
# Every row read from file holds one community as a single comma separated
# string of tweet ids, so row[0] is split on ',' to get the list of ids.
# str() is applied first because a community made of a single tweet id is
# read back as int64 instead of as a string.

louvain_author = [str(row[0]).split(',') for row in read_communities('louvain', 'author').tolist()]

louvain_url = [str(row[0]).split(',') for row in read_communities('louvain', 'urls').tolist()]

louvain_retweet_count = [str(row[0]).split(',') for row in read_communities('louvain', 'retweet_count').tolist()]

louvain_followers = [str(row[0]).split(',') for row in read_communities('louvain', 'followers').tolist()]

In [6]:
# Number of louvain communities found in each of the four networks, one per line
print(
    f'Author: {len(louvain_author)}',
    f'URL: {len(louvain_url)}',
    f'Retweet Counts: {len(louvain_retweet_count)}',
    f'Follower Counts: {len(louvain_followers)}',
    sep='\n',
)

Author: 94
URL: 11
Retweet Counts: 13
Follower Counts: 9


In [7]:
# Loading the infomap communities of each of the four relationship networks.
# Each row holds one community as a comma separated string of tweet ids;
# str() guards against a lone id having been read back as int64.

infomap_author = [str(row[0]).split(',') for row in read_communities('infomap', 'author').tolist()]

infomap_url = [str(row[0]).split(',') for row in read_communities('infomap', 'urls').tolist()]

infomap_retweet_count = [str(row[0]).split(',') for row in read_communities('infomap', 'retweet_count').tolist()]

infomap_followers = [str(row[0]).split(',') for row in read_communities('infomap', 'followers').tolist()]

In [8]:
# Number of infomap communities found in each of the four networks, one per line
print(
    f'Author: {len(infomap_author)}',
    f'URL: {len(infomap_url)}',
    f'Retweet Counts: {len(infomap_retweet_count)}',
    f'Follower Counts: {len(infomap_followers)}',
    sep='\n',
)

Author: 94
URL: 11
Retweet Counts: 13
Follower Counts: 9


In [9]:
# Reading communities generated by walktrap algorithm
# Each row holds one community as a comma separated string of tweet ids.
# str() is applied to every network, including followers, because a community
# consisting of a single tweet id is read back as int64, which has no .split().

walktrap_author = read_communities('walktrap', 'author')
walktrap_author = [str(val[0]).split(',') for val in walktrap_author.tolist()]

walktrap_url = read_communities('walktrap', 'urls')
walktrap_url = [str(val[0]).split(',') for val in walktrap_url.tolist()]

walktrap_retweet_count = read_communities('walktrap', 'retweet_count')
walktrap_retweet_count = [str(val[0]).split(',') for val in walktrap_retweet_count.tolist()]

walktrap_followers = read_communities('walktrap', 'followers')
walktrap_followers = [str(val[0]).split(',') for val in walktrap_followers.tolist()]

In [10]:
# Number of walktrap communities found in each of the four networks, one per line
print(
    f'Author: {len(walktrap_author)}',
    f'URL: {len(walktrap_url)}',
    f'Retweet Counts: {len(walktrap_retweet_count)}',
    f'Follower Counts: {len(walktrap_followers)}',
    sep='\n',
)

Author: 94
URL: 11
Retweet Counts: 13
Follower Counts: 9


> A closer look at the communities generated by all three algorithms shows that all of them form the same communities; hence, moving forward, only the louvain communities are used, as discussed in Section 5.3.1.2.

In [11]:
# Casting every tweet id to int, community by community, for the four louvain networks.
louvain_author = [list(map(int, members)) for members in louvain_author]
louvain_url = [list(map(int, members)) for members in louvain_url]
louvain_retweet_count = [list(map(int, members)) for members in louvain_retweet_count]
louvain_followers = [list(map(int, members)) for members in louvain_followers]

## Method to Get Overlapping Communities:

In [12]:
# Method to extract common tweets from pairwise comparison
# of all communities in two networks at a time
def get_overlap(communities_1, communities_2):
    """Return the non-empty pairwise intersections of two collections of communities.

    Every community of ``communities_1`` is compared with every community of
    ``communities_2``; whenever the two share at least one member, the shared
    members are appended to the result as one list.

    Parameters
    ----------
    communities_1, communities_2 : iterable of iterables of hashable items
        The communities of the two networks, each community being a collection
        of tweet ids.

    Returns
    -------
    list of list
        One list of shared members per overlapping pair, ordered by
        ``communities_1`` and then by ``communities_2``. The order of the
        members inside each list is that of the underlying set and is not
        guaranteed. Empty when either input is empty or nothing overlaps.
    """
    # Build the sets of the second network once instead of once per community
    # of the first network. Materialising them up front also means a one-shot
    # iterable passed as communities_2 is compared against every community_1.
    member_sets_2 = [set(community_2) for community_2 in communities_2]

    all_matches = []
    for community_1 in communities_1:
        members_1 = set(community_1)
        for members_2 in member_sets_2:
            matches = list(members_1 & members_2)
            if matches:
                all_matches.append(matches)
    return all_matches

In [13]:
# Names of the four overlaps required, as discussed in Section 5.3.1.2
overlap_names = [
    'author_url_retweets',
    'followers_url_retweets',
    'author_followers_retweets',
    'author_url_followers',
]

# Loading the tweets data from the csv file created using create_dataset.ipynb.
# It supplies the complete data of the core tweets found in the overlaps,
# which is what gets saved in the overlap tweet files.
# Rows with a repeated tweet id are removed, keeping the first occurrence.
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# 'index' column, which is then carried into the saved overlap files - confirm
# whether that column is wanted.
tweets_data = (
    pd.read_csv(f'{dataset_store_path}/21237189_{event_name}_final_data.csv')
    .drop_duplicates(subset=['id'])
    .reset_index()
)

In [14]:
# Method to generate and save one of the four overlaps
def generate_overlaps(overlap_name):
    """Find the core tweets of one three-network overlap and save them to disk.

    The louvain communities of the first two networks are intersected pairwise,
    and the resulting groups are intersected pairwise with the communities of
    the third network. The rows of ``tweets_data`` whose 'id' appears in any of
    the final groups are written to
    '<overlap_analysis_path>/<event_name>_<overlap_name>_tweets.csv'.

    Parameters
    ----------
    overlap_name : str
        One of 'author_url_retweets', 'followers_url_retweets',
        'author_followers_retweets' or 'author_url_followers'.

    Raises
    ------
    ValueError
        If ``overlap_name`` is not one of the four supported names.
    """
    print(f'\nGetting {overlap_name} overlapping community tweets')

    # Pick the three networks to overlap, in the order they are intersected.
    if overlap_name == 'author_url_retweets':
        first, second, third = louvain_author, louvain_url, louvain_retweet_count
    elif overlap_name == 'followers_url_retweets':
        first, second, third = louvain_followers, louvain_url, louvain_retweet_count
    elif overlap_name == 'author_followers_retweets':
        first, second, third = louvain_followers, louvain_author, louvain_retweet_count
    elif overlap_name == 'author_url_followers':
        first, second, third = louvain_author, louvain_url, louvain_followers
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f'Unknown overlap name {overlap_name!r}; expected one of '
            "'author_url_retweets', 'followers_url_retweets', "
            "'author_followers_retweets', 'author_url_followers'."
        )

    overlap = get_overlap(get_overlap(first, second), third)

    # Flattening the lists of lists obtained into a single list of core tweets
    overlap_tweet_ids = list(itertools.chain.from_iterable(overlap))

    # Saving the core tweets for overlap to disk (row labels are not written)
    tweets_filtered = tweets_data[tweets_data['id'].isin(overlap_tweet_ids)]
    tweets_filtered.to_csv(f'{overlap_analysis_path}/{event_name}_{overlap_name}_tweets.csv', index=False)
    print(f'Core tweets saved for {overlap_name}.')

In [15]:
# Extracting community overlaps for all combinations of network relationships:
# generate_overlaps() is called once per entry of overlap_names and writes
# one core tweets csv file per overlap.
for overlap_name in overlap_names:
    generate_overlaps(overlap_name)


Getting author_url_retweets overlapping community tweets
Core tweets saved for author_url_retweets.

Getting followers_url_retweets overlapping community tweets
Core tweets saved for followers_url_retweets.

Getting author_followers_retweets overlapping community tweets
Core tweets saved for author_followers_retweets.

Getting author_url_followers overlapping community tweets
Core tweets saved for author_url_followers.
