In [17]:
import csv
import collections
from functools import wraps
import os
import os.path
import re
import time

import community
import networkx as nx
import tweepy

In [2]:
# Read the four Twitter credentials from the environment (a missing variable
# raises KeyError) and build the authenticated tweepy client used by all cells.
(
    twitter_consumer,
    twitter_consumer_secret,
    twitter_token,
    twitter_token_secret,
) = (
    os.environ[variable]
    for variable in (
        "TWITTER_CONSUMER",
        "TWITTER_CONSUMER_SECRET",
        "TWITTER_TOKEN",
        "TWITTER_TOKEN_SECRET",
    )
)

auth = tweepy.OAuthHandler(twitter_consumer, twitter_consumer_secret)
auth.set_access_token(twitter_token, twitter_token_secret)
api = tweepy.API(auth)

In [11]:
# Paths of the files this notebook reads and writes (all relative to the
# working directory).

# Raw search results: screen_name, content, mentions.
tweets_filename = "tweets.csv"
# Cache of Twitter user id -> screen name.
id_to_screen_name_filename = "id_to_screen_name.csv"
# Cache of screen name -> "|"-joined friend ids.
user_and_friends_filename = "user_and_friends_ids.csv"
# Screen name -> "|"-joined friend screen names (input of the graph).
user_to_friend_filename = "user_to_friend_screen_names.csv"
# Cache of screen name -> bio and location.
bios_filename = "bios.csv"
# Cache of screen name -> timeline tweets, one tweet per row.
sentiment_tweets_filename = "sentiment_tweets.csv"
# All detected communities and the five largest ones.
communities_filename = "communities.csv"
top_5_communities_filename = "top_5_communities.csv"
# The graph, written in GML format.
graph_filename = "security_network.gml"

In [None]:
# Load in all known screen names so we can use it as a filter.
# Start from an empty set so the name is always defined, even on the very
# first run when the tweets file does not exist yet; a set also makes the
# "have we seen this author" membership test constant-time.
known_screen_names = set()
if os.path.exists(tweets_filename):
    with open(tweets_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        known_screen_names = {row["screen_name"] for row in csv_reader}

In [None]:
# Search for original (non-retweet) tweets matching the security keywords with
# at least 10 favourites, and append author, text and mentions to the tweets file.
query = "(infosec OR cve OR cybersec OR cybersecurity OR ransomware)"
twitter_filter = "-filter:retweets"
mininum_favorites = "min_faves:10"

# Matches every @mention inside a tweet text.
pattern = re.compile(r"@\w+", re.UNICODE | re.MULTILINE)

# The file is opened in append mode, so the header may only be written when the
# file is new or empty; otherwise every run would add another header row that
# csv.DictReader later reads back as if it were a tweet.
needs_header = (
    not os.path.exists(tweets_filename)
    or os.path.getsize(tweets_filename) == 0
)

with open(tweets_filename, "a", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    header = ["screen_name", "content", "mentions"]
    if needs_header:
        csv_writer.writerow(header)
    # fetch 100 pages with 100 tweets per page.
    search_pages = tweepy.Cursor(
        api.search,
        q=f"{query} {twitter_filter} {mininum_favorites}",
        count=100,
    ).pages(100)
    for public_tweets in search_pages:
        for tweet in public_tweets:
            screen_name = f"@{tweet.user.screen_name}"
            # skip screen names we have seen before.
            if screen_name in known_screen_names:
                continue
            mentions = pattern.findall(tweet.text)
            # Newlines are escaped so that every tweet stays on one CSV line.
            csv_writer.writerow([screen_name, tweet.text.replace("\n", "\\n"), "|".join(mentions)])

In [4]:
def retry(func=None, wait=900):
    """Decorate a function so it is called again after a Twitter rate limit.

    Works both bare (``@retry``) and with arguments (``@retry(wait=60)``).
    When the wrapped call raises ``tweepy.RateLimitError`` a notice is printed,
    the wrapper sleeps for ``wait`` seconds and then calls the function again.
    Every other exception propagates to the caller unchanged."""
    def decorate(target):
        @wraps(target)
        def retrying(*args, **kwargs):
            # Loop until the call either returns or fails with a non rate-limit error.
            while True:
                try:
                    return target(*args, **kwargs)
                except tweepy.RateLimitError:
                    print(f"sleeping for {wait/60}min", flush=True)
                    time.sleep(wait)
        return retrying

    # Bare usage passes the function directly; parametrised usage wants the decorator.
    return decorate if func is None else decorate(func)

In [5]:
def log(msg, filename):
    """Append a message to the given log file as a line of its own.

    The file is created when it does not exist. A newline is added when the
    message does not already end with one, so that consecutive messages do not
    run together. The file is written as UTF-8 so messages with non-ASCII
    characters do not fail on platforms with a narrower default encoding.

    Args:
        msg: the text to record.
        filename: path of the log file to append to.
    """
    line = msg if msg.endswith("\n") else msg + "\n"
    with open(filename, "a", encoding="utf-8") as f:
        f.write(line)

In [None]:
# Collect every screen name we know about: tweet authors plus mentioned accounts.
names = set()
with open(tweets_filename, newline="") as twitter_file:
    csv_reader = csv.DictReader(twitter_file)
    for row in csv_reader:
        names.add(row["screen_name"])
        names.update(row["mentions"].split("|"))

# remove empty screen name
# A tweet without mentions contributes "" (since "".split("|") == [""]).
# discard() is used instead of remove() so this does not raise KeyError when
# every tweet happens to contain at least one mention.
names.discard("")

In [None]:
# Maps a screen name to the friend ids collected for it; filled by the cells below.
friend_list = {}

In [None]:
# read in all known data about screen name and their friends
# The ids are stored as one "|"-joined string per user. Split them back into a
# list of integers (the type the API cell stores) so that the cells below do
# not iterate over the raw string character by character.
if os.path.exists(user_and_friends_filename):
    with open(user_and_friends_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        friend_list = {
            # "if id_" drops the empty piece produced by a user without friends.
            row["screen_name"]: [int(id_) for id_ in row["friends_ids"].split("|") if id_]
            for row in csv_reader
        }

In [None]:
@retry
def friends_ids(name):
    """Ask the API for the friend ids of ``name`` (count=5000) and return ``{name: ids}``."""
    ids = api.friends_ids(name, count=5000)
    return {name: ids}

print("Extract friends ids", flush=True)
for position, name in enumerate(names):
    if position % 1000 == 0:
        print(str(position) + " number of name processed", flush=True)
    # Only query the API for names that have no friend list yet.
    if name not in friend_list:
        try:
            fetched = friends_ids(name)
        except Exception as error:
            # Best effort: record the failure and carry on with the next name.
            log(str(error), "friends_ids.log")
        else:
            friend_list.update(fetched)

In [None]:
# Persist screen name -> friend ids so later runs can skip the API calls.
with open(user_and_friends_filename, "w", newline="") as twitter_file:
    header = ["screen_name", "friends_ids"]
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(header)
    # The loop variable is deliberately not called ``friends_ids``: that name
    # belongs to the API helper function, and rebinding it here would make a
    # re-run of the fetching cell fail.
    for screen_name, ids in friend_list.items():
        # discard twitter profiles with over 5000 friends - no one can have that many friends!
        # (the fetch asks for at most 5000 ids, so exactly 5000 means "5000 or more")
        if len(ids) == 5000:
            continue
        csv_writer.writerow([screen_name, "|".join(str(id_) for id_ in ids)])

In [None]:
# Every distinct friend id found across all collected friend lists.
unique_friend_ids = list({friend_id for ids in friend_list.values() for friend_id in ids})

In [None]:
# Maps a Twitter user id to its screen name; filled by the cells below.
screen_names = {}

In [None]:
# read in all known data of id to screen_names
# CSV values are strings, but the lookup cell stores ``user.id`` as the key.
# Convert the cached ids to int so cached and freshly fetched entries share one
# key type and id lookups hit the cache.
if os.path.exists(id_to_screen_name_filename):
    with open(id_to_screen_name_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        screen_names = {int(row["id"]): row["screen_name"] for row in csv_reader}

In [None]:
# remove all known ids
# Ids are compared by their string form, because ids read from a CSV file are
# strings while ids coming from the API are integers. One pass over a set
# replaces the repeated list.remove() calls, each of which scanned the list.
known_ids = {str(key) for key in screen_names}
unique_friend_ids = [id_ for id_ in unique_friend_ids if str(id_) not in known_ids]

In [None]:
@retry
def lookup_users(ids):
    """Return the user objects the API reports for the given list of user ids."""
    return api.lookup_users(ids)

print("Extract Users from friends ids", flush=True)
# Ids are resolved in batches of 100 per request.
batch_size = 100
# Start at 0 and slice forward so the final, possibly shorter, batch is
# processed too; iterating from 100 with a backwards slice skipped it (and
# skipped everything when there were 100 ids or fewer).
for start in range(0, len(unique_friend_ids), batch_size):
    if start % 1000 == 0:
        print(str(start) + " number of id processed", flush=True)
    try:
        users = lookup_users(unique_friend_ids[start:start + batch_size])
    except Exception as e:
        # Best effort: record the failure and continue with the next batch.
        log(str(e), "lookup_users.log")
    else:
        screen_names.update({user.id: user.screen_name for user in users})

In [None]:
# Persist the id -> screen name mapping, one pair per row.
with open(id_to_screen_name_filename, "w", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(["id", "screen_name"])
    csv_writer.writerows([user_id, name] for user_id, name in screen_names.items())

In [None]:
# Translate every friend id into its "@screen_name". Ids without a known
# screen name are skipped; formatting the missing value would otherwise add
# the bogus friend "@None" to the list.
friendly_friend_list = {
    screen_name: [f"@{screen_names[id_]}" for id_ in ids if id_ in screen_names]
    for screen_name, ids in friend_list.items()
}

In [None]:
# Persist screen name -> "|"-joined friend screen names.
with open(user_to_friend_filename, "w", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(["screen_name", "friend_screen_names"])
    csv_writer.writerows(
        [screen_name, "|".join(friend_names)]
        for screen_name, friend_names in friendly_friend_list.items()
    )

In [7]:
# Undirected graph; the cells below add an edge between screen names.
g = nx.Graph()

In [8]:
# Read the stored friend lists back: screen name -> list of friend screen names.
with open(user_to_friend_filename, newline="") as f:
    screen_name_to_friends = {}
    for row in csv.DictReader(f):
        screen_name_to_friends[row["screen_name"]] = row["friend_screen_names"].split("|")

In [9]:
# Connect two accounts only when each one lists the other as a friend.
# Membership is tested against a set per user (built once) instead of scanning
# the friend list for every pair; the lists are still iterated in their
# original order, so the same edges are added in the same order.
friend_sets = {name: set(friends) for name, friends in screen_name_to_friends.items()}
for screen_name, friends_list in screen_name_to_friends.items():
    for friend in friends_list:
        # A friend we hold no list for yields an empty tuple, i.e. no edge.
        if screen_name in friend_sets.get(friend, ()):
            g.add_edge(screen_name, friend)

In [12]:
# Save the graph to disk in GML format.
nx.write_gml(g, graph_filename)

In [13]:
def communities(graph, partition=None):
    """Group the nodes of ``graph`` by the community they were assigned to.

    Args:
        graph: the graph handed to ``community.best_partition`` when no
            partition is supplied.
        partition: optional precomputed mapping of node -> community number.
            When omitted it is computed with ``community.best_partition(graph)``.

    Returns:
        A list holding one list of nodes per community.
    """
    if partition is None:
        partition = community.best_partition(graph)
    # The partition maps each node to a community number; invert it into
    # community number -> nodes. The groups are created up front, in the
    # iteration order of the set of community numbers, and then filled in a
    # single pass over the nodes.
    grouped = {com: [] for com in set(partition.values())}
    for node, com in partition.items():
        grouped[com].append(node)

    return list(grouped.values())

In [35]:
# Split the graph into communities: a list with one list of nodes per community.
security_communities = communities(g)

In [55]:
# Store every community as one row: its index and its "|"-joined members.
with open(communities_filename, "w", newline="") as f:
    csv_writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(["community_name", "members"])
    csv_writer.writerows(
        [number, "|".join(group)] for number, group in enumerate(security_communities)
    )

In [36]:
# Keep the five communities with the most members, largest first.
top_5_largest_communites = sorted(security_communities, key=len, reverse=True)[:5]

In [37]:
# Store the five largest communities, one row each: rank index and "|"-joined members.
with open(top_5_communities_filename, "w", newline="") as f:
    csv_writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(["community_name", "members"])
    csv_writer.writerows(
        [rank, "|".join(group)] for rank, group in enumerate(top_5_largest_communites)
    )

In [38]:
# Maps a community name to the list of its members; filled by the next cell.
members_by_communities = {}

In [39]:
# Read the five largest communities back: community name -> list of members.
with open(top_5_communities_filename, newline="") as twitter_file:
    members_by_communities = {}
    for row in csv.DictReader(twitter_file):
        members_by_communities[row["community_name"]] = row["members"].split("|")

In [56]:
# Maps a screen name to its (bio, location) pair; filled by the cells below.
bio_by_name = {}

In [41]:
# Read in all bios collected by earlier runs. The existence check matches the
# other cache-loading cells and lets a first run, where no bios file has been
# written yet, continue with the current (empty) mapping.
if os.path.exists(bios_filename):
    with open(bios_filename, newline="") as f:
        csv_reader = csv.DictReader(f)
        bio_by_name = {row["screen_name"]: (row["bio"], row["location"]) for row in csv_reader}

In [57]:
@retry
def get_user(member):
    """Return the API's user object for the given screen name."""
    return api.get_user(member)

# Walk over every member of every community, fetching only the missing bios.
for member in (name for group in members_by_communities.values() for name in group):
    if member not in bio_by_name:
        try:
            user = get_user(member)
        except tweepy.TweepError as e:
            print(f"This member: {member} caused an error! Shame on thee {e}")
        else:
            bio_by_name[member] = (user.description, user.location)

This member: @akolsuoicauqol caused an error! Shame on thee [{'code': 50, 'message': 'User not found.'}]
This member: @ColoradoWinds caused an error! Shame on thee [{'code': 63, 'message': 'User has been suspended.'}]
This member: @HormetcAesthetc caused an error! Shame on thee [{'code': 50, 'message': 'User not found.'}]
sleeping for 15.0min
This member: @Michael27588252 caused an error! Shame on thee [{'code': 50, 'message': 'User not found.'}]
This member: @twirlinggoddess caused an error! Shame on thee [{'code': 50, 'message': 'User not found.'}]


In [58]:
# Persist screen name -> bio and location so later runs can skip the API calls.
with open(bios_filename, "w", newline="") as twitter_file:
    header = ["screen_name", "bio", "location"]
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(header)
    for screen_name, (bio, location) in bio_by_name.items():
        # A profile without a description may come back as None; treat it as an
        # empty bio instead of failing on None.replace(). Newlines are escaped
        # so that every bio stays on one CSV line.
        csv_writer.writerow([screen_name, (bio or "").replace("\n", "\\n"), location])

In [51]:
# Maps a screen name to the list of tweet texts stored for it; a missing name
# starts with an empty list on first access.
tweets_by_screen_name = collections.defaultdict(list)

In [52]:
# Read in all tweets collected by earlier runs. The existence check matches the
# other cache-loading cells and lets a first run, where no tweets file has been
# written yet, continue with the current (empty) mapping.
if os.path.exists(sentiment_tweets_filename):
    with open(sentiment_tweets_filename, newline="") as twitter_file:
        csv_reader = csv.DictReader(twitter_file)
        for row in csv_reader:
            tweets_by_screen_name[row["screen_name"]].append(row["tweets"])

In [53]:
@retry
def get_user_timeline(member):
    """Return the statuses the API reports for the timeline of the given screen name."""
    return api.user_timeline(member)

# Walk over every member of every community, fetching only the missing timelines.
for member in (name for group in members_by_communities.values() for name in group):
    if member not in tweets_by_screen_name:
        try:
            statuses = get_user_timeline(member)
        except tweepy.TweepError as e:
            print(f"This member: {member} caused an error! Shame on thee {e}")
        else:
            tweets_by_screen_name[member] = [status.text for status in statuses]

This member: @ra6bit caused an error! Shame on thee Not authorized.
This member: @ColoradoWinds caused an error! Shame on thee Not authorized.
This member: @ZyzzRespecter caused an error! Shame on thee Not authorized.
This member: @Michael27588252 caused an error! Shame on thee [{'code': 34, 'message': 'Sorry, that page does not exist.'}]


In [54]:
# Persist the collected tweets, one row per tweet, with newlines escaped so
# that every tweet stays on one CSV line.
with open(sentiment_tweets_filename, "w", newline="") as twitter_file:
    csv_writer = csv.writer(twitter_file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(["screen_name", "tweets"])
    csv_writer.writerows(
        [screen_name, text.replace("\n", "\\n")]
        for screen_name, texts in tweets_by_screen_name.items()
        for text in texts
    )