# Working with data

This notebook contains all the code for extracting data from Twitter. Furthermore, it contains code for creating a graph from the extracted data and finding communities in that graph.  
Data, graphs, and communities are all saved to disk.

The [explainer.ipynb][1] loads the saved data and visualises it.

[1]: https://nbviewer.jupyter.org/github/Glorforidor/SocialGraphAssignments/blob/master/explainer.ipynb

---

Load in necessary libraries for extracting and working with data, graphs, and communities.

In [None]:
# Standard libraries
import csv
import collections
from functools import wraps
import os
import os.path
import re
import time

# Third party libraries
import community
import networkx as nx
import tweepy

---

Names of files that will contain data.

In [None]:
# Filenames of all the data files that make up our dataset.
# Columns: screen_name, content, mentions.
tweets_filename = "tweets.csv"
# Columns: id, screen_name.
id_to_screen_name_filename = "id_to_screen_name.csv"
# Columns: screen_name, friends_ids ("|"-separated).
user_and_friends_filename = "user_and_friends_ids.csv"
# Columns: screen_name, friend_screen_names ("|"-separated).
user_to_friend_filename = "user_to_friend_screen_names.csv"
# Columns: screen_name, bio, location.
bios_filename = "bios.csv"
# Columns: screen_name, tweets (one row per tweet).
sentiment_tweets_filename = "sentiment_tweets.csv"
# Columns: community_name, members ("|"-separated).
communities_filename = "communities.csv"
# Same columns as the communities file, restricted to the five largest communities.
top_5_communities_filename = "top_5_communities.csv"

# The saved graph - it is an undirected graph.
graph_filename = "security_network.gml"

---

Helper functions.

The Twitter API has a rate limit of X requests per 15 minutes.  
The retry function is used to wrap tweepy calls: each time a RateLimitError is raised, we wait 15 minutes and then retry the call.

The log function is mostly used to log any error that is not a RateLimitError.

In [None]:
def retry(func=None, wait=900):
    """Decorate a function so it is called again after a Twitter rate limit.

    Works both bare (``@retry``) and with arguments (``@retry(wait=60)``).
    Whenever the wrapped call raises ``tweepy.RateLimitError`` the wrapper
    prints a notice, sleeps for ``wait`` seconds and tries again. Every other
    exception propagates to the caller untouched.
    """
    def decorator_retry(target):
        @wraps(target)
        def wrapper(*args, **kwargs):
            while True:
                try:
                    result = target(*args, **kwargs)
                except tweepy.RateLimitError:
                    print(f"sleeping for {wait/60}min", flush=True)
                    time.sleep(wait)
                else:
                    return result
        return wrapper

    # Called as @retry: func is the decorated function itself.
    # Called as @retry(...): hand back the decorator to be applied next.
    return decorator_retry if func is None else decorator_retry(func)

In [None]:
def log(msg, filename):
    """Append ``msg`` as its own line to the file ``filename``.

    The file is created when it does not exist yet. A newline is added when
    ``msg`` does not already end with one, so that consecutive entries do not
    run together on a single line. The file is written as UTF-8 so messages
    containing non-ASCII text can always be stored.
    """
    line = msg if msg.endswith("\n") else f"{msg}\n"
    with open(filename, "a", encoding="utf-8") as f:
        f.write(line)

## Twitter scraping

---

Setup Tweepy library to authenticate with Twitter API.

In [None]:
# Get the Twitter tokens from the environment.
# Indexing os.environ raises KeyError when one of these variables is not set.
twitter_consumer = os.environ["TWITTER_CONSUMER"]
twitter_consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
twitter_token = os.environ["TWITTER_TOKEN"]
twitter_token_secret = os.environ["TWITTER_TOKEN_SECRET"]

# Use the Twitter tokens to authenticate towards Twitter.
auth = tweepy.OAuthHandler(twitter_consumer, twitter_consumer_secret)
auth.set_access_token(twitter_token, twitter_token_secret)
# The api object is the handle used for every Twitter request in this notebook.
api = tweepy.API(auth)

---

Extracting users and tweets that contain key words related to security, such as infosec.

In [None]:
# Load all known screen names so we can use them as a filter.
# Start from an empty set so the name is always defined, even on the very
# first run when no tweets file exists yet. A set also keeps the membership
# test in the search loop cheap.
known_screen_names = set()
if os.path.exists(tweets_filename):
    with open(tweets_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        known_screen_names = {row["screen_name"] for row in csv_reader}

In [None]:
# Construct the search query for the Twitter API.
query = "(infosec OR cve OR cybersec OR cybersecurity OR ransomware)"  # Match any words in the query string.
twitter_filter = "-filter:retweets"  # Filter out retweets.
mininum_favorites = "min_faves:10"  # Only fetch tweets that have at least 10 likes.

# A regex pattern to find user names in a tweet.
pattern = re.compile(r"@\w+", re.UNICODE | re.MULTILINE)

# The file is opened in append mode, so the header may only be written when the
# file is new or empty. Writing it on every run would insert header rows in the
# middle of the data, which csv.DictReader would later read back as tweets.
write_header = not os.path.exists(tweets_filename) or os.path.getsize(tweets_filename) == 0

# Append new tweets if found.
# For a standard user, the Twitter API only returns tweets from up to 7 days in the past.
with open(tweets_filename, "a", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    header = ["screen_name", "content", "mentions"]
    if write_header:
        csv_writer.writerow(header)
    # Fetch 100 pages with 100 tweets per page.
    search_cursor = tweepy.Cursor(
        api.search,
        q=f"{query} {twitter_filter} {mininum_favorites}",
        count=100,
    )
    for public_tweets in search_cursor.pages(100):
        for tweet in public_tweets:
            screen_name = f"@{tweet.user.screen_name}"
            # Skip screen names we have seen before.
            if screen_name in known_screen_names:
                continue
            # If the tweet text mentions someone, extract that screen name.
            mentions = pattern.findall(tweet.text)
            # A user's tweet can contain newlines, which would mess up the csv file.
            # Therefore, the newlines are escaped.
            csv_writer.writerow([screen_name, tweet.text.replace("\n", "\\n"), "|".join(mentions)])

---

From the users extracted before, we search their profiles for all their friends' ids.

In [None]:
# Load all the screen names (tweet authors and everyone they mention) and store
# them in a set to remove duplicates.
screen_names = set()
with open(tweets_filename, newline="") as twitter_file:
    csv_reader = csv.DictReader(twitter_file)
    for row in csv_reader:
        screen_names.add(row["screen_name"])
        screen_names.update(row["mentions"].split("|"))

# A row without mentions contributes an empty string; drop it.
# discard() is used instead of remove() so this does not raise KeyError when
# every row happened to contain at least one mention.
screen_names.discard("")

In [None]:
# Map the screen name of a Twitter user to that user's friend ids.
# Starts empty; later cells fill it from disk and from the Twitter API.
friend_by_screen_name = {}

In [None]:
# Reload the friend ids saved by an earlier run, if there are any.
# The ids are stored as one "|"-separated string per user, so they have to be
# split back into a list; otherwise later code would treat the string itself
# as the collection of ids and iterate over its characters.
if os.path.exists(user_and_friends_filename):
    with open(user_and_friends_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        friend_by_screen_name = {
            # Empty fields (a user without friends) yield no ids at all.
            row["screen_name"]: [int(id_) for id_ in row["friends_ids"].split("|") if id_]
            for row in csv_reader
        }

In [None]:
@retry
def friends_ids(screen_name):
    """Fetch the friend ids of the given screen name.

    Returns a single-entry dict that maps ``screen_name`` to the result of
    ``api.friends_ids`` for a request of at most 5000 ids, ready to be merged
    into a larger mapping with ``dict.update``.
    """
    # The key must be the parameter itself; any other name is undefined here.
    return {screen_name: api.friends_ids(screen_name, count=5000)}

print("Extract friends ids")
for idx, screen_name in enumerate(screen_names):
    # Report progress once every 1000 names.
    if not idx % 1000:
        print(str(idx) + " number of name processed")

    # Only names without stored friend ids need a request.
    if screen_name not in friend_by_screen_name:
        try:
            fetched = friends_ids(screen_name)
        except Exception as e:
            # Best effort: note the failure and carry on with the next name.
            log(str(e), "friends_ids.log")
        else:
            friend_by_screen_name.update(fetched)

In [None]:
# Save every user together with their friend ids.
with open(user_and_friends_filename, "w", newline="") as twitter_file:
    header = ["screen_name", "friends_ids"]
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(header)
    # The loop variable is called `ids` so that it does not overwrite the
    # friends_ids() function defined in an earlier cell.
    for screen_name, ids in friend_by_screen_name.items():
        # The request is capped at 5000 ids, so a full result means the profile
        # has 5000 friends or more. Discard those - no one can have that many friends!
        if len(ids) >= 5000:
            continue
        csv_writer.writerow([screen_name, "|".join(str(id_) for id_ in ids)])

---

With the friend ids, extract the screen name associated with each id.

In [None]:
# Collect every distinct friend id across all users; the set comprehension removes duplicates.
unique_friend_ids = list({id_ for ids in friend_by_screen_name.values() for id_ in ids})

In [None]:
# Map a Twitter user id to that user's screen name.
screen_name_by_id = {}

In [None]:
# Reload the id -> screen name mapping saved by an earlier run, if there is one.
if os.path.exists(id_to_screen_name_filename):
    with open(id_to_screen_name_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        # The csv module returns every field as a string. Convert the id back to
        # an int so that the same id never ends up in the mapping twice, once
        # as "123" and once as 123.
        screen_name_by_id = {int(row["id"]): row["screen_name"] for row in csv_reader}

In [None]:
# Remove all ids whose screen name is already known, so they are not looked up again.
# Ids are compared as strings: ids read back from a CSV file are strings, whereas
# ids that come from the API may be integers, and "123" != 123 would otherwise
# keep every id in the list.
known_ids = {str(id_) for id_ in screen_name_by_id}
# One pass with a set lookup instead of list.remove() for every known id.
unique_friend_ids = [id_ for id_ in unique_friend_ids if str(id_) not in known_ids]

In [None]:
@retry
def lookup_users(ids):
    """Return the users that ``api.lookup_users`` finds for the given ids."""
    users = api.lookup_users(ids)
    return users

print("Extract Users from friends ids", flush=True)
# Walk through the ids in batches of 100. Starting at 0 and slicing forward
# makes sure the last, possibly shorter, batch is looked up as well.
for i in range(0, len(unique_friend_ids), 100):
    if i % 1000 == 0:
        print(f"{str(i)} number of id processed")
    try:
        users = lookup_users(unique_friend_ids[i:i + 100])
    except Exception as e:
        # Best effort: note the failure and continue with the next batch.
        log(str(e), "lookup_users.log")
    else:
        screen_name_by_id.update({user.id: user.screen_name for user in users})

In [None]:
# Save the id -> screen name mapping, one pair per row, below a header row.
with open(id_to_screen_name_filename, "w", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    header = ["id", "screen_name"]
    csv_writer.writerow(header)
    csv_writer.writerows([id_, screen_name] for id_, screen_name in screen_name_by_id.items())

---

Create file with a user and their friends' names.

In [None]:
# Map a screen name to the list of its friends' screen names.
# The names come from screen_name_by_id (screen_names is a plain set of names
# and cannot translate ids). Ids are normalised to strings for the lookup,
# because ids read back from a CSV file are strings whereas ids from the API
# may be integers.
name_by_id = {str(id_): name for id_, name in screen_name_by_id.items()}
friend_names = {
    # Ids without a known screen name are left out rather than written as "@None".
    screen_name: [f"@{name_by_id[str(id_)]}" for id_ in ids if str(id_) in name_by_id]
    for screen_name, ids in friend_by_screen_name.items()
}

In [None]:
# Store each screen name next to the "|"-joined screen names of its friends.
with open(user_to_friend_filename, "w", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    header = ["screen_name", "friend_screen_names"]
    csv_writer.writerow(header)
    csv_writer.writerows(
        [screen_name, "|".join(friend_screen_names)]
        for screen_name, friend_screen_names in friend_names.items()
    )

## Graph creation

---

Construct the graph and add edges between users that follow each other.

In [None]:
# Create an undirected graph as the representation of the Security People Network.
g = nx.Graph()

In [None]:
# Read back every user together with the list of their friends' screen names.
with open(user_to_friend_filename, newline="") as f:
    csv_reader = csv.DictReader(f)
    loaded_friends = {}
    for row in csv_reader:
        loaded_friends[row["screen_name"]] = row["friend_screen_names"].split("|")
    screen_name_to_friends = loaded_friends

In [None]:
# Go through the screen names and their friends' screen names and add an edge
# iff both users are friends with each other.
# In the Twitter world that means they both follow each other.

# Sets make the "does the friend follow back?" test a constant-time lookup
# instead of a scan through a list of up to thousands of names.
friend_sets = {name: set(friends) for name, friends in screen_name_to_friends.items()}

for screen_name, friends_list in screen_name_to_friends.items():
    for friend in friends_list:
        # A friend that is not itself a key has no known friends: no edge.
        if screen_name in friend_sets.get(friend, ()):
            g.add_edge(screen_name, friend)

In [None]:
# Save the graph to disk in GML format.
nx.write_gml(g, graph_filename)

## Community creation

---

From the graph, find the communities by the best partition.

In [None]:
# This lovely code is from our Assignment 2:
# https://github.com/Glorforidor/SocialGraphAssignments/blob/master/Assignment2.ipynb
# easily viewed here:
# https://nbviewer.jupyter.org/github/Glorforidor/SocialGraphAssignments/blob/master/Assignment2.ipynb

def communities(graph):
    """Find the communities in the graph and return them as a list of node lists.

    The partition is computed by ``community.best_partition``, which maps each
    node to a community number. This function inverts that mapping, so every
    returned list holds the nodes of one community.
    """
    partition = community.best_partition(graph)

    # Create one empty list per community number up front; this fixes the order
    # in which the communities are returned.
    members = {label: [] for label in set(partition.values())}

    # A single pass over the nodes is enough to invert the mapping - there is
    # no need to scan all nodes again for every community.
    for node, label in partition.items():
        members[label].append(node)

    return list(members.values())

In [None]:
# Each entry is the list of nodes that belong to one community of the graph.
security_communities = communities(g)

In [None]:
# Save all communities; the community name is simply its position in the list.
with open(communities_filename, "w", newline="") as f:
    csv_writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    header = ["community_name", "members"]
    csv_writer.writerow(header)

    csv_writer.writerows(
        [number, "|".join(members)] for number, members in enumerate(security_communities)
    )

---

From the communities, take the top 5 largest communities, so we only work with a subset of the communities.

In [None]:
# Sort the communities by member count, largest first, and keep the first five.
top_5_largest_communites = sorted(security_communities, key=len, reverse=True)[:5]

In [None]:
# Save the five largest communities; the name is the rank, with 0 being the largest.
with open(top_5_communities_filename, "w", newline="") as f:
    csv_writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    header = ["community_name", "members"]
    csv_writer.writerow(header)

    csv_writer.writerows(
        [rank, "|".join(members)] for rank, members in enumerate(top_5_largest_communites)
    )

In [None]:
# Map a community name to the list of its members' screen names.
members_by_communities = {}

In [None]:
# Read the five largest communities back from disk.
with open(top_5_communities_filename, newline="") as twitter_file:
    csv_reader = csv.DictReader(twitter_file)
    loaded_members = {}
    for row in csv_reader:
        loaded_members[row["community_name"]] = row["members"].split("|")
    members_by_communities = loaded_members

## Sentiment data

---

For the members of the top 5 largest communities, extract their description (bio) and their location, to pinpoint which type of community they belong to and where they are mostly based.

In [None]:
# Map a screen name to a (bio, location) tuple.
bio_by_name = {}

In [None]:
# Reload the bios and locations saved by an earlier run, if there are any.
if os.path.exists(bios_filename):
    with open(bios_filename, newline="") as f:
        csv_reader = csv.DictReader(f)
        loaded_bios = {}
        for row in csv_reader:
            loaded_bios[row["screen_name"]] = (row["bio"], row["location"])
        bio_by_name = loaded_bios

In [None]:
@retry
def get_user(member):
    """Return the Twitter user identified by ``member``.

    member: id, user_id or screen_name; it is passed on to ``api.get_user``.
    """
    user = api.get_user(member)
    return user

# Fetch the bio and location of every community member that is not stored yet.
for members in members_by_communities.values():
    for member in members:
        if member not in bio_by_name:
            try:
                user = get_user(member)
            except tweepy.TweepError as e:
                # Report the failing member and move on to the next one.
                print(f"This member: {member} caused an error! Shame on thee {e}")
            else:
                bio_by_name[member] = (user.description, user.location)

In [None]:
# Save the bio and location of every member.
with open(bios_filename, "w", newline="") as twitter_file:
    header = ["screen_name", "bio", "location"]
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(header)
    for screen_name, (bio, location) in bio_by_name.items():
        # A user's bio can contain newlines, which would mess up the csv file.
        # Therefore, the newlines are escaped.
        # A profile without a description may give None instead of a string;
        # treat that as an empty bio rather than failing on None.replace().
        csv_writer.writerow([screen_name, (bio or "").replace("\n", "\\n"), location])

---

Extract recent tweets from community members, which will then be used to calculate some sentiment values.

In [None]:
# Map a screen name to the list of that user's tweet texts.
# A defaultdict creates the empty list on first access to a new screen name.
tweets_by_screen_name = collections.defaultdict(list)

In [None]:
# Reload the tweets saved by an earlier run. On the very first run the file
# does not exist yet, so - like the other data files - it is only read when
# it is present.
if os.path.exists(sentiment_tweets_filename):
    with open(sentiment_tweets_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        for row in csv_reader:
            tweets_by_screen_name[row["screen_name"]].append(row["tweets"])

In [None]:
@retry
def get_user_timeline(member):
    """Return the timeline of the Twitter user identified by ``member``.

    member: id, user_id or screen_name; it is passed on to ``api.user_timeline``.
    """
    timeline = api.user_timeline(member)
    return timeline

# Fetch the timeline of every community member whose tweets are not stored yet.
for members in members_by_communities.values():
    for member in members:
        if member not in tweets_by_screen_name:
            try:
                statuses = get_user_timeline(member)
            except tweepy.TweepError as e:
                # Report the failing member and move on to the next one.
                print(f"This member: {member} caused an error! Shame on thee {e}")
            else:
                tweets_by_screen_name[member] = [status.text for status in statuses]

In [None]:
# Save every tweet on its own row, next to the screen name of its author.
with open(sentiment_tweets_filename, "w", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    header = ["screen_name", "tweets"]
    csv_writer.writerow(header)
    for screen_name, tweets in tweets_by_screen_name.items():
        # Tweets can contain newlines, which would mess up the csv file,
        # so the newlines are escaped before writing.
        csv_writer.writerows([screen_name, tweet.replace("\n", "\\n")] for tweet in tweets)