In [None]:
import collections
import csv
import math
import os
import os.path

import community
from fa2 import ForceAtlas2
import networkx as nx
import nltk
from nltk import word_tokenize
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

In [None]:
# Undirected graph of the network; its nodes and edges are added in a later cell.
g = nx.Graph()

In [None]:
# Load the friends CSV into a mapping of screen name -> list of friend screen
# names. The "friend_screen_names" column holds the names joined with "|".
with open("./user_to_friend_screen_names.csv", newline="") as f:
    csv_reader = csv.DictReader(f)
    screen_name_to_friends = {}
    for row in csv_reader:
        screen_name_to_friends[row["screen_name"]] = row["friend_screen_names"].split("|")

In [None]:
# Connect two accounts only when the relationship is mutual, i.e. each of them
# lists the other one as a friend.
#
# The friend lists are converted to sets once up front so that every
# "does the friend list me back?" check is a constant-time lookup instead of a
# linear scan of a list.
friend_sets = {name: set(friends) for name, friends in screen_name_to_friends.items()}

for screen_name, friends_list in screen_name_to_friends.items():
    for friend in friends_list:
        # A friend that has no row of its own cannot be mutual; the empty
        # default covers that case.
        if screen_name in friend_sets.get(friend, ()):
            g.add_edge(screen_name, friend)

In [None]:
# The ten nodes with the highest degree, largest first.
# Soeren finds it very plausible that these accounts rank highest; he knows some of them.
sorted(g.degree, key=lambda name_and_degree: name_and_degree[1], reverse=True)[:10]

In [None]:
# Marker size of every node is its degree, listed in the graph's node order.
node_sizes = [degree for _name, degree in g.degree]

In [None]:
# Draw the whole network. No positions are passed, so networkx computes a
# layout itself; node markers are sized by degree and labels are hidden.
fig, ax = plt.subplots(figsize=(20, 10))
nx.draw_networkx(g, ax=ax, node_size=node_sizes, with_labels=False, width=0.1)
ax.set_title("Security People Network")
ax.set_axis_off()
# plt.show() instead of fig.show(): Figure.show() relies on a running GUI event
# loop and may only emit a "non-GUI backend" warning under a notebook's inline
# backend.
plt.show()

In [None]:
# Configure the ForceAtlas2 layout engine.
forceatlas2 = ForceAtlas2(
    # Behavior alternatives
    outboundAttractionDistribution=True,  # Dissuade hubs
    edgeWeightInfluence=0.5,
    # Performance
    jitterTolerance=0.2,  # Tolerance
    barnesHutOptimize=True,
    barnesHutTheta=0.6,
    # Tuning
    # NOTE(review): a scalingRatio of 0.0 looks unusual for a layout parameter
    # - confirm against the fa2 documentation that it is intended.
    scalingRatio=0.0,
    strongGravityMode=False,
    gravity=1,
    # Log
    verbose=True,
)

# Compute node positions for `g`: no starting positions are supplied
# (pos=None) and the layout runs for 1000 iterations.
positions = forceatlas2.forceatlas2_networkx_layout(g, pos=None, iterations=1000)

In [None]:
# Draw the network using the precomputed ForceAtlas2 positions. Nodes and
# edges are drawn separately so each can get its own transparency.
fig, ax = plt.subplots(figsize=(20, 10))
nx.draw_networkx_nodes(g, positions, ax=ax, node_size=node_sizes, alpha=0.4)
nx.draw_networkx_edges(g, positions, ax=ax, edge_color="black", alpha=0.05, width=0.5)
ax.set_title("Security People Network")
ax.set_axis_off()
# plt.show() instead of fig.show(): Figure.show() relies on a running GUI event
# loop and may only emit a "non-GUI backend" warning under a notebook's inline
# backend.
plt.show()

In [None]:
# Size of the graph: node count on the first line, edge count on the second.
print(g.number_of_nodes(), g.number_of_edges(), sep="\n")

In [None]:
def communities(graph, random_state=None):
    """Partition ``graph`` into communities and return them as lists of nodes.

    ``community.best_partition`` returns a mapping of node -> community id;
    this function inverts that mapping into one list of nodes per community.

    Args:
        graph: The graph handed to ``community.best_partition``.
        random_state: Optional seed forwarded to ``community.best_partition``
            so a partition can be reproduced. When left as ``None`` the call
            is made without the argument, exactly as before.

    Returns:
        A list of communities ordered by ascending community id. Every
        community is a list of nodes, in the order the nodes appear in the
        partition mapping.
    """
    if random_state is None:
        partition = community.best_partition(graph)
    else:
        partition = community.best_partition(graph, random_state=random_state)

    # Group the nodes in a single pass over the partition instead of
    # rescanning every node once per community.
    members_by_community = collections.defaultdict(list)
    for node, community_id in partition.items():
        members_by_community[community_id].append(node)

    return [members_by_community[community_id] for community_id in sorted(members_by_community)]

In [None]:
# All detected communities. There are a lot of them, so only the largest ones are kept later on.
security_communities = communities(g)

In [None]:
# Report how many communities were detected.
print(f"The number of security communities: {len(security_communities)}")

In [None]:
# Histogram of community sizes with numpy's default binning. Every bar is
# drawn (and ticked) at the rounded midpoint of its bin.
hist, bin_edges = np.histogram([len(com) for com in security_communities])
center = ((bin_edges[:-1] + bin_edges[1:]) / 2).round()
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(center, hist)
ax.set_title("Security community sizes")
ax.set_ylabel("Count")
ax.set_xlabel("Community size")
ax.set_xticks(center)
# plt.show() instead of fig.show(): Figure.show() relies on a running GUI event
# loop and may only emit a "non-GUI backend" warning under a notebook's inline
# backend.
plt.show()

In [None]:
# Keep the five communities with the most members, largest first.
top_5_largest_communites = sorted(security_communities, key=len, reverse=True)[:5]

In [None]:
# Save the top communities to communities.csv: one row per community holding
# its index in the top-5 list and its members joined with "|". Every field is
# quoted.
with open("communities.csv", "w", newline="") as f:
    header = ["community_name", "members"]
    csv_writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(header)
    csv_writer.writerows(
        [index, "|".join(members)]
        for index, members in enumerate(top_5_largest_communites)
    )

In [None]:
# Screen name -> bio lookup. Starts empty; the following cell rebinds it to the contents of bios.csv.
bio_by_name = {}

In [None]:
# Read bios.csv into a screen name -> bio lookup. If a screen name occurs
# more than once, the last row wins.
with open("bios.csv", newline="") as f:
    csv_reader = csv.DictReader(f)
    bio_by_name = dict((row["screen_name"], row["bio"]) for row in csv_reader)

# Communities!

In [None]:
# Community index -> list of its members' bios. A member without an entry in
# bio_by_name contributes an empty string.
bios_by_community = dict(
    enumerate(
        [bio_by_name.get(name, "") for name in members]
        for members in top_5_largest_communites
    )
)

In [None]:
# all this beautiful code is from Assignment 2

def bag_of_words(document):
    """Tokenize ``document`` and return its lemmatized, lower-cased words.

    The text is split with the regular expression ``\\w+(?:-\\w+)*``. Only
    tokens made up entirely of alphabetic characters are kept, so numbers and
    hyphenated tokens are dropped. The remaining tokens are lower-cased and
    passed through the WordNet lemmatizer.

    Returns:
        A list of lemmas in document order (duplicates are kept).
    """
    lemmatizer = nltk.WordNetLemmatizer()
    raw_tokens = nltk.RegexpTokenizer(r"\w+(?:-\w+)*").tokenize(document)

    return [
        lemmatizer.lemmatize(token.lower())
        for token in raw_tokens
        if token.isalpha()
    ]


def number_of_words(bag_of_words, unique_words):
    """Count how often every vocabulary word occurs in one document.

    Args:
        bag_of_words: The tokens of a single document.
        unique_words: All unique words of the corpus. Every token in
            ``bag_of_words`` must be one of them.

    Returns:
        A ``collections.Counter`` with one entry per word in ``unique_words``
        (words absent from the document have a count of 0).

    Raises:
        KeyError: If a token is not contained in ``unique_words``.
    """
    tally = collections.Counter({word: 0 for word in unique_words})
    for token in bag_of_words:
        # A Counter would silently create missing keys, so the vocabulary
        # requirement is enforced explicitly.
        if token not in tally:
            raise KeyError(token)
        tally[token] += 1

    return tally


def compute_TF(number_of_words, bag_of_words):
    """Return the term-frequency weight of every word as a Counter.

    The weight is the raw count: no normalization by document length is
    applied, so the result holds the same word -> count pairs as
    ``number_of_words``, copied into a new ``collections.Counter``.

    Args:
        number_of_words: Mapping of word -> number of occurrences in the
            document.
        bag_of_words: The tokens of the document. Not used for the raw-count
            weighting; kept so existing callers do not have to change.

    Returns:
        A new ``collections.Counter`` mapping each word to its count.
    """
    # Going through dict() makes Counter copy the pairs as they are
    # (including zero counts) instead of treating the input as an iterable of
    # elements to count.
    return collections.Counter(dict(number_of_words.items()))


def compute_IDF(documents):
    """Return the inverse document frequency of every word as a Counter.

    Weighting scheme: ``log(N / n_t)``, where ``N`` is the number of documents
    and ``n_t`` the number of documents in which the word has a count greater
    than zero.

    Args:
        documents: A sequence of word -> count mappings, one per document.

    Returns:
        A ``collections.Counter`` mapping each word that occurs in at least
        one document to its IDF weight.
    """
    total_documents = len(documents)

    # Number of documents each word actually occurs in.
    document_frequency = collections.Counter(
        word
        for document in documents
        for word, count in document.items()
        if count > 0
    )

    return collections.Counter({
        word: math.log(total_documents / float(frequency))
        for word, frequency in document_frequency.items()
    })


def compute_TFIDF(tfs, idfs):
    """Return the computed TF-IDF weight of every word as a Counter.

    Args:
        tfs: Mapping of word -> term-frequency weight for one document.
        idfs: Mapping of word -> inverse-document-frequency weight; it is
            indexed with every word of ``tfs``.

    Returns:
        A ``collections.Counter`` mapping each word of ``tfs`` to
        ``tf * idf``.
    """
    return collections.Counter({
        word: weight * idfs[word]
        for word, weight in tfs.items()
    })

In [None]:
# One bag of words per document, where a document is all bios of one community
# joined with spaces.
list_bag_of_words = [
    bag_of_words(" ".join(bios)) for bios in bios_by_community.values()
]

# The vocabulary: every distinct word across all documents.
unique_words = set().union(*list_bag_of_words)

In [None]:
# Per-document word counts over the whole vocabulary, and the term-frequency
# weights derived from them. Both lists follow the order of list_bag_of_words.
documents = [number_of_words(bag, unique_words) for bag in list_bag_of_words]
tfs = [
    compute_TF(word_counts, bag)
    for word_counts, bag in zip(documents, list_bag_of_words)
]

In [None]:
# Inverse document frequencies computed over all documents (communities) in the corpus.
idfs = compute_IDF(documents)

In [None]:
# TF-IDF weights per document, in the same order as `tfs`.
tfidfs = [compute_TFIDF(term_frequencies, idfs) for term_frequencies in tfs]

In [None]:
# Shared word-cloud renderer, capped at 100 words per cloud.
# NOTE(review): collocations presumably only matters when a cloud is generated
# from raw text rather than from frequencies - confirm against the wordcloud docs.
wordcloud = WordCloud(max_words=100, collocations=False)

In [None]:
# One word cloud per community, stacked vertically and generated from that
# community's TF-IDF weights.
#
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without
# it a single community would yield a bare Axes object and indexing would
# fail. `axs` is then reduced to the single column so it stays a 1-D array.
fig, axes_grid = plt.subplots(nrows=len(tfidfs), ncols=1, figsize=(20, 20), squeeze=False)
axs = axes_grid[:, 0]
for i, tfidf in enumerate(tfidfs):
    wordcloud.generate_from_frequencies(tfidf)
    # Zero-based label, matching the community index written to
    # communities.csv and printed with the sentiment values.
    axs[i].set_title(f"Community {i}")
    axs[i].imshow(wordcloud, interpolation="bilinear")
    axs[i].axis("off")

# plt.show() instead of fig.show(): Figure.show() relies on a running GUI event
# loop and may only emit a "non-GUI backend" warning under a notebook's inline
# backend.
plt.show()

# Sentimentality

Are security people sentimental? Let's find out!

In [None]:
# Download the tab-separated word list with average happiness scores. The
# first three lines of the file are skipped; the "word" and
# "happiness_average" columns are the ones used further down.
# NOTE(review): this needs network access on every run - consider caching the file locally.
url = "https://ndownloader.figstatic.com/files/360592"
words_of_happiness = pd.read_csv(url, sep="\t", skiprows=3)

In [None]:
# This lovely code is from Assignment 2
def compute_average_sentiment(tokens):
    """Return the average happiness score of the words found in ``tokens``.

    The score is the mean of the "happiness_average" column over those rows
    of ``words_of_happiness`` whose "word" is contained in ``tokens``. Every
    matching row counts once, no matter how often the word occurs in
    ``tokens``.

    Each token in ``tokens`` must be in lowercase.

    Returns:
        The mean score, or 0.0 when ``tokens`` is empty or none of the tokens
        is present in ``words_of_happiness``.
    """
    if len(tokens) == 0:
        return 0.0

    is_known_word = words_of_happiness["word"].isin(tokens)
    scores = words_of_happiness.loc[is_known_word, "happiness_average"]
    # The mean of an empty selection is NaN; report that as 0.0.
    return np.nan_to_num(scores.mean())

In [None]:
# Community index -> set of member screen names, for fast membership tests.
# NOTE(review): this rebinds `communities`, which earlier in the notebook is
# the community-detection function; once this cell has run, that function is
# no longer reachable under this name.
communities = dict(enumerate(map(set, top_5_largest_communites)))

In [None]:
# Community index -> concatenated tweet text; a missing key starts out as an empty string.
text_of_communities = collections.defaultdict(str)

In [None]:
# Append the tweets of every account to the text of the community (or
# communities) it belongs to. Each piece is prefixed with a single space.
#
# The pieces are gathered in lists and joined once per community at the end;
# growing a string with += on every row would copy the accumulated text again
# and again.
with open("sentiment_tweets.csv", newline="") as f:
    csv_reader = csv.DictReader(f)
    # Never populated in this cell; kept in case another cell refers to the name.
    tweets_by_screen_name = collections.defaultdict(list)
    tweet_parts = collections.defaultdict(list)
    for row in csv_reader:
        for i, members in communities.items():
            if row["screen_name"] in members:
                tweet_parts[i].append(f" {row['tweets']}")

for i, parts in tweet_parts.items():
    text_of_communities[i] += "".join(parts)

In [None]:
# Community index -> average sentiment, computed from the bag of words of the community's concatenated tweet text.
sentiment_of_communities = {k: compute_average_sentiment(bag_of_words(v)) for k, v in text_of_communities.items()}

In [None]:
# Print the sentiment value of every community, one line each.
for community_index, score in sentiment_of_communities.items():
    print(f"Community {community_index} have a sentiment value of {score}")