In [None]:
import json
import re
from collections import Counter
from itertools import chain, combinations
from json import JSONDecodeError

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from community import community_louvain
from matplotlib.ticker import PercentFormatter
from pyvis.network import Network
from tqdm.notebook import tqdm

In [None]:
# One palette specification drives both the discrete palette and the colormap.
palette_spec = "ch:s=2.5,rot=.15"

sns.set_style("whitegrid")
sns.set_palette(sns.color_palette(palette_spec))
cmap = sns.color_palette(palette_spec, as_cmap=True)

In [None]:
# Read the tweet table from parquet. Columns such as `_id`, `source`, `author_id`,
# `text`, `entities` and `mymodel2_label` are read from it by the cells below.
df = pd.read_parquet("data/slapgate_twitter_mymodel2")

In [None]:
# Preview the first two rows, restricted to the first 20 columns.
df.iloc[:2, :20]

In [None]:
# Preview the first five rows for the columns from position 20 onward.
df.iloc[:5, 20:]

In [None]:
# Bar chart of how many tweets received each predicted label, with the count written
# above every bar.
fig, ax = plt.subplots(figsize=(15, 6), dpi=90)
sns.countplot(data=df, x="mymodel2_label", ax=ax)
ax.set_xlabel("Emotions")
ax.set_ylabel("Count")
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        # Nothing to annotate for a bar without a finite height.
        continue
    # The "d" format code rejects floats, and bar heights may be reported as floats
    # depending on the plotting library version, so cast to int before formatting.
    ax.annotate("{:>5,d}".format(int(height)), (p.get_x() + 0.3, height + 100))
ax.xaxis.set_tick_params(which="both", labelleft=True)
ax.set_title("Predictions using 'my distilbert-model2'")
plt.show()

In [None]:
# Rank the sources by tweet count; everything after the five most frequent is pooled.
source_counts = df.groupby("source")["_id"].count().sort_values(ascending=False)
others = source_counts.index[5:]
# Rewrite only the `source` column. A frame-wide `df.replace` would also overwrite any
# other cell (in any column) that happens to equal one of these source names.
df_source = df.assign(source=df["source"].where(~df["source"].isin(others), "Other"))
# Tweets per (source, predicted label). `_id` is selected by name rather than by column
# position so the result does not depend on the column order of `df`.
df_src_grp = df_source.groupby(["source", "mymodel2_label"])["_id"].count().reset_index()

In [None]:
# Per-source "count" and "sum" of every remaining column; the two-level column labels
# are then flattened to "<column>_<aggregation>" (e.g. "_id_sum").
df_src_grp_sum = df_src_grp.groupby("source").agg(["count", "sum"])
df_src_grp_sum.columns = df_src_grp_sum.columns.map("_".join)

In [None]:
# Attach each source's total to its per-label rows, then express every
# (source, label) count as a fraction of that total.
source_totals = df_src_grp_sum.reset_index()[["source", "_id_sum"]]
df_src_grp_t = df_src_grp.merge(source_totals, on="source")
df_src_grp_t["_id_perc"] = df_src_grp_t["_id"].div(df_src_grp_t["_id_sum"])

In [None]:
# Share of each predicted label within every source, one bar group per label.
fig, ax = plt.subplots(figsize=(15, 6), dpi=90)
sns.barplot(data=df_src_grp_t, x="mymodel2_label", y="_id_perc", hue="source", ax=ax)
ax.set_xlabel("Emotions")
ax.set_ylabel("%-Emotion")
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
ax.set_title("Predicted Sentiment per source")
# Hand-placed call-outs: the text and line coordinates below are fixed data coordinates,
# so they only point at the intended bars for the data this was tuned on.
ax.text(1, 0.45, "Instagram Tweets")
ax.plot([0.7, 0.99], [0.5, 0.455], "k-", lw=1)
ax.plot([1.53, 1.65], [0.455, 0.06], "k-", lw=1)
ax.text(3, 0.21, "Smartphones")
ax.plot([3.15, 3.05], [0.2, 0.06], "k-", lw=1)
ax.plot([3.25, 3.35], [0.2, 0.06], "k-", lw=1)
plt.show()

### Preprocessing

For our preprocessing steps we will correct two aspects. The first is to identify and remove potential spam accounts, and the second is to make the tweets' text more machine-readable. For the possible spam accounts, we remove accounts that have posted an excessive number of tweets. Below we can see that we have accounts with more than 100 tweets in our short time frame. Although some power users might send this many tweets, I will remove those frequent posters in an attempt to get rid of possible spam tweets posted by (probably) bots. Below are some example spam tweets. Those 149 accounts are responsible for ~3,500 tweets. More than half of these tweets have been posted via the Web App, which points to a possible bot.

In [None]:
# Non-null cell counts per author, most active authors first.
# NOTE(review): the cutoff used here is 10 tweets, while the text above talks about
# accounts with 100 tweets — confirm which threshold is intended.
df_grouped = df.groupby("author_id").count().sort_values(by="_id", ascending=False)
is_frequent_poster = df_grouped["_id"].ge(10)
spammer_ids = df_grouped.loc[is_frequent_poster].reset_index()["author_id"]

In [None]:
# Keep only tweets whose author is not flagged. A boolean mask is used instead of
# dropping by index label, so the result stays correct even if `df` carries repeated
# index labels (label-based drop would remove every row sharing a flagged label).
# NOTE(review): the cells that follow keep operating on `df`, not on `df_nobots` —
# confirm whether the filtered frame was meant to be used downstream.
df_nobots = df[~df["author_id"].isin(spammer_ids)].copy()

Next we will edit the tweets themselves. Many users have used the special character `&`, which is encoded as `&amp;` and will be converted to the word "and". Additionally, we remove all '@' and '#' symbols to make the text more readable. The example below shows how many users use hashtags and @-mentions as part of their sentences. Therefore, completely removing the hashtags would remove some of the meaning of the original text. We also remove all hyperlinks from the tweets because those produce many false positives.

In [None]:
def preproc_tweets(txt: str) -> str:
    """Normalize a tweet's text for the downstream NLP steps.

    The HTML entity ``&amp;`` becomes the word ``and``, the ``@`` and ``#`` characters
    are dropped (the mention / hashtag word itself is kept), and http(s)/ftp(s) links
    are removed.

    Args:
        txt: Raw tweet text.

    Returns:
        The cleaned text. Whitespace around a removed link is left untouched.
    """
    new_txt = txt.replace("&amp;", "and").replace("@", "").replace("#", "")
    # `{2,}` instead of `{2,3}` for the last host label: with the shorter bound a link
    # such as "https://example.info/page" was only matched up to ".inf" and the rest of
    # the link stayed in the text.
    return re.sub(
        r"(?:https?|ftps?)://[a-zA-Z0-9\-.]+\.[a-zA-Z]{2,}(?:/\S*)?", "", new_txt
    )

In [None]:
# Raw text of the first tweet in the index-label slice 7763:7764, shown before cleaning.
df["text"].loc[7763:7764].iloc[0]

In [None]:
# The same tweet after `preproc_tweets`, for comparison; `df` itself is not modified here.
df["text"].loc[7763:7764].apply(preproc_tweets).iloc[0]

We apply our preprocessing steps to the tweets.

In [None]:
# Overwrites the `text` column of `df` with the cleaned text.
# NOTE(review): this runs on `df`, not on the spam-filtered `df_nobots` — confirm intent.
df["text"] = df["text"].apply(preproc_tweets)

## Network Analysis

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; its `pos_` token tags are used to pick
# verbs, adjectives and proper nouns in the extraction loop below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Collect [tweet id, token text] pairs for the three part-of-speech classes of interest.
verbs = list()
adj = list()
propn = list()
pos_buckets = {"VERB": verbs, "ADJ": adj, "PROPN": propn}
# Iterate the two columns directly instead of materialising a full row with `df.iloc`
# twice per tweet, and let spaCy process the texts as a stream via `nlp.pipe`, which
# yields the documents in input order.
for _id, doc in tqdm(zip(df["_id"], nlp.pipe(df["text"])), total=len(df)):
    for token in doc:
        bucket = pos_buckets.get(token.pos_)
        if bucket is not None:
            bucket.append([_id, token.text])

In [None]:
# One row per extracted token: `ids` holds the tweet `_id`, the second column the token text.
df_verbs = pd.DataFrame(verbs, columns=["ids", "verbs"])
df_adj = pd.DataFrame(adj, columns=["ids", "adjectives"])
# The column name "porper_noun" (sic) is the key the later cells look up, so renaming it
# here would require changing those cells as well.
df_propn = pd.DataFrame(propn, columns=["ids", "porper_noun"])

### Clean the POSs

In [None]:
# Discard tokens that are not wanted as verb nodes: tokens starting with a typographic
# apostrophe, tokens whose first character is upper case, tokens containing "smith",
# "chris" or "oscar", and tokens of at most two characters.
verb_tokens = df_verbs["verbs"]
conditions = (
    verb_tokens.str.startswith("’")
    | verb_tokens.str[0].str.isupper()
    | verb_tokens.str.contains("smith|chris|oscar")
    | verb_tokens.str.len().le(2)
)
df_verbs = df_verbs.drop(index=df_verbs[conditions].index)

In [None]:
# Discard tokens that are not wanted as adjective nodes: tokens starting with a
# typographic apostrophe, tokens whose first character is upper case, tokens containing
# "smith", "chris" or "oscar", and tokens of at most two characters.
adj_tokens = df_adj["adjectives"]
conditions = (
    adj_tokens.str.startswith("’")
    | adj_tokens.str[0].str.isupper()
    | adj_tokens.str.contains("smith|chris|oscar")
    | adj_tokens.str.len().le(2)
)
df_adj = df_adj.drop(index=df_adj[conditions].index)

In [None]:
# Drop proper nouns of at most two characters, then lower-case the remaining ones.
conditions = df_propn["porper_noun"].str.len().le(2)
df_propn = df_propn.drop(index=df_propn[conditions].index)
df_propn = df_propn.assign(porper_noun=df_propn["porper_noun"].str.lower())

### Create Nodes and Edges

In [None]:
def create_nodes(df: pd.DataFrame, tag: str, count: int = 100) -> pd.DataFrame:
    """Build the node table: the most frequent values of column ``tag``.

    Args:
        df: Long-format token table containing a column named ``tag``.
        tag: Name of the column whose values become nodes.
        count: Highest rank position to keep.

    Returns:
        A frame with the columns ``tag``, ``n`` (number of occurrences) and
        ``position`` (frequency rank, 1 = most frequent), limited to rows with
        ``position <= count``. Ties share pandas' default "average" rank, truncated
        to an integer, so more or fewer than ``count`` rows can be returned.
    """
    # Name the counts explicitly: the default name of a `value_counts` result differs
    # between pandas versions ("count" since 2.0), so renaming a column called `0`
    # silently does nothing there and the later `nodes["n"]` lookup fails.
    nodes = df.value_counts(tag).rename("n").reset_index()
    nodes["position"] = nodes["n"].rank(ascending=False).astype(int)
    return nodes[nodes["position"] <= count]

In [None]:
def create_edges(
    df: pd.DataFrame,
    nodes: pd.DataFrame,
    tag: str,
    mention_n: int = 2,
    min_share: float = 0.05,
) -> pd.DataFrame:
    """Build the co-occurrence edge table between node values.

    Two values are connected when they occur in the same tweet (same ``ids``). Pairs
    are formed in order of appearance, so ``(a, b)`` and ``(b, a)`` are counted as
    separate rows.

    Args:
        df: Long-format token table with the columns ``ids`` and ``tag``.
        nodes: Node table with at least the columns ``tag`` and ``n``.
        tag: Name of the token column.
        mention_n: Minimum number of node tokens a tweet must contain to contribute pairs.
        min_share: An edge is kept when its count divided by the occurrence count of
            either endpoint exceeds this value.

    Returns:
        A frame with the columns ``{tag}1``, ``{tag}2``, ``value`` (pair count),
        ``{tag}1_share`` and ``{tag}2_share``. It is empty, with these columns, when
        no pair qualifies.
    """
    first, second = f"{tag}1", f"{tag}2"
    first_share, second_share = f"{tag}1_share", f"{tag}2_share"

    # Only tokens that made it into the node table take part in the pairing.
    kept = df.merge(nodes[[tag]], how="inner", on=tag)

    # Gather every tweet's tokens as a real list. Joining them with spaces and splitting
    # again would tear multi-word values (e.g. "Will Smith") into separate tokens that no
    # longer match any node.
    tokens_by_id = {}
    for tweet_id, token in zip(kept["ids"], kept[tag].astype(str)):
        tokens_by_id.setdefault(tweet_id, []).append(token)

    pair_counts = Counter(
        chain.from_iterable(
            combinations(tokens, 2)
            for tokens in tokens_by_id.values()
            if len(tokens) >= mention_n
        )
    )

    # Building the frame from explicit rows also works when there are no pairs at all.
    edges = pd.DataFrame(
        [(a, b, n_pairs) for (a, b), n_pairs in pair_counts.items()],
        columns=[first, second, "value"],
    ).astype({"value": "int64"})
    edges = edges.sort_values("value", ascending=False)

    # Look up how often each endpoint occurs overall.
    node_counts = nodes[[tag, "n"]]
    edges = edges.merge(
        node_counts.rename(columns={tag: first, "n": "source_n"}), how="inner", on=first
    ).merge(
        node_counts.rename(columns={tag: second, "n": "target_n"}), how="inner", on=second
    )

    edges[first_share] = edges["value"] / edges["source_n"]
    edges[second_share] = edges["value"] / edges["target_n"]

    # A boolean mask (instead of a query string) also works when `tag` is not a valid
    # Python identifier.
    keep = (edges[first_share] > min_share) | (edges[second_share] > min_share)
    return edges.loc[keep, [first, second, "value", first_share, second_share]]

In [None]:
def create_partition(G, tag: str, nodes: pd.DataFrame, random_state=None) -> pd.DataFrame:
    """Attach a Louvain community id to every node row.

    Args:
        G: Graph whose node identifiers are the values of column ``tag``.
        tag: Name of the node column in ``nodes``.
        nodes: Node table to extend.
        random_state: Forwarded to ``community_louvain.best_partition``. Pass a fixed
            value to get the same communities on every run; the default ``None``
            leaves the call unseeded, as before.

    Returns:
        ``nodes`` inner-joined with the partition, i.e. with an extra ``louvain``
        column holding the community id of each node.
    """
    partition = community_louvain.best_partition(G, random_state=random_state)
    louvain_communities = pd.DataFrame(
        {tag: list(partition.keys()), "louvain": list(partition.values())}
    )
    return nodes.merge(louvain_communities, how="inner", on=tag)

In [None]:
# `tag` names the token column handed to the node/edge helpers. It is a notebook-level
# variable that later cells reassign.
tag = "verbs"
# Keep the verbs whose frequency rank position is at most 100.
verb_nodes = create_nodes(df_verbs, tag, 100)
verb_nodes.head()

In [None]:
# Co-occurrence edges between the kept verbs, built from tweets that contain at least
# two of them.
verb_edges = create_edges(df_verbs, verb_nodes, tag, mention_n=2)
verb_edges.head()

### Create Network Graph

In [None]:
# Undirected verb graph: node attribute "nodesize" is the occurrence count, edge
# attribute "weight" is the co-occurrence count.
verb_G = nx.Graph()
verb_G.add_nodes_from(
    (name, {"nodesize": size}) for name, size in zip(verb_nodes[tag], verb_nodes["n"])
)
verb_G.add_weighted_edges_from(
    zip(verb_edges[f"{tag}1"], verb_edges[f"{tag}2"], verb_edges["value"])
)
verb_nodes_l = create_partition(verb_G, tag, verb_nodes)
verb_nodes_l

In [None]:
def draw_graph(G, size, nodes, k=3, iterations=60, seed=None):
    """Draw ``G`` with a spring layout, colored by Louvain community.

    Args:
        G: Graph whose nodes carry a ``nodesize`` attribute and whose edges carry a
            ``weight`` attribute.
        size: Figure size, passed to ``plt.figure(figsize=...)``.
        nodes: Node table with a ``louvain`` column. The colors are passed as a plain
            sequence, so the rows are expected to be in the same order as ``G.nodes``.
        k: ``k`` argument of the spring layout.
        iterations: Number of spring-layout iterations.
        seed: Seed for the spring layout. Pass a fixed value for a reproducible
            picture; the default ``None`` leaves the layout unseeded, as before.
    """
    plt.figure(figsize=size)
    pos = nx.spring_layout(G, k=k, iterations=iterations, seed=seed)
    node_size = [d["nodesize"] * 0.7 for _, d in G.nodes(data=True)]
    edge_width = [np.sqrt(d["weight"] * 0.05) for _, _, d in G.edges(data=True)]
    # `matplotlib.cm.get_cmap` is deprecated and no longer available in recent Matplotlib
    # releases; `pyplot.get_cmap` takes the same (name, number of colors) arguments.
    cmap = plt.get_cmap("coolwarm", nodes.louvain.nunique())

    # Draw Network
    nx.draw_networkx(
        G,
        pos=pos,
        node_color=nodes.louvain,
        cmap=cmap,
        node_size=node_size,
        width=edge_width,
        edge_color="grey",
        font_size=16,
        alpha=0.8,
    )
    plt.show()

In [None]:
# Render the verb co-occurrence graph; node colors come from `verb_nodes_l.louvain`.
draw_graph(verb_G, size=(20, 12), nodes=verb_nodes_l, k=4, iterations=50)

In [None]:
# Same pipeline as for the verbs, applied to the adjectives: nodes, edges, graph,
# Louvain communities.
tag = "adjectives"
adj_nodes = create_nodes(df_adj, tag, 100)
adj_edges = create_edges(df_adj, adj_nodes, tag, 2)
adj_G = nx.Graph()
adj_G.add_nodes_from(
    (name, {"nodesize": size}) for name, size in zip(adj_nodes[tag], adj_nodes["n"])
)
adj_G.add_weighted_edges_from(
    zip(adj_edges[f"{tag}1"], adj_edges[f"{tag}2"], adj_edges["value"])
)
adj_nodes_l = create_partition(adj_G, tag, adj_nodes)

In [None]:
# Render the adjective co-occurrence graph; node colors come from `adj_nodes_l.louvain`.
draw_graph(adj_G, size=(20, 12), nodes=adj_nodes_l, k=3, iterations=50)

In [None]:
# Same pipeline for the proper nouns, keeping rank positions up to 75.
tag = "porper_noun"
propn_nodes = create_nodes(df_propn, tag, 75)
propn_edges = create_edges(df_propn, propn_nodes, tag, 2)
propn_G = nx.Graph()
propn_G.add_nodes_from(
    (name, {"nodesize": size}) for name, size in zip(propn_nodes[tag], propn_nodes["n"])
)
propn_G.add_weighted_edges_from(
    zip(propn_edges[f"{tag}1"], propn_edges[f"{tag}2"], propn_edges["value"])
)
propn_nodes_l = create_partition(propn_G, tag, propn_nodes)

In [None]:
# Render the proper-noun co-occurrence graph; node colors come from `propn_nodes_l.louvain`.
draw_graph(propn_G, size=(20, 12), nodes=propn_nodes_l, k=3, iterations=60)

In [None]:
# Interactive pyvis rendering of the proper-noun network. This relies on `tag` still
# holding the column name set by the cell that built the proper-noun graph.
nt = Network(height="600px", width="1200px", notebook=True)

# One dot per node: sized by occurrence count, grouped by Louvain community.
node_fields = zip(
    propn_nodes_l[tag],
    propn_nodes_l["n"],
    propn_nodes_l["louvain"],
    propn_nodes_l["position"],
)
for name, mention_count, community_id, rank in node_fields:
    nt.add_node(
        n_id=name,
        value=mention_count,
        group=community_id,
        shape="dot",
        title=f"Tag: {name} \n Rank: {rank}",
    )
edge_fields = zip(propn_edges[f"{tag}1"], propn_edges[f"{tag}2"], propn_edges["value"])
for first_node, second_node, pair_count in edge_fields:
    nt.add_edge(first_node, second_node, weight=pair_count)

# nt.show_buttons(filter_=["physics"])
nt.barnes_hut(central_gravity=0, spring_length=400, damping=0.2)
nt.set_options(
    """
var options = {
  "physics": {
        "maxVelocity": 28,
        "minVelocity": 0.75},
  "nodes": {"font": {"size": 30}}
}"""
)
nt.show("Slapgate.html")

## Twitter's POS

In [None]:
def flatten_annotations(row: list, threshold: float = 0.9):
    """Reduce a list of annotation dicts to ``[type, normalized_text]`` pairs.

    Only entries whose ``probability`` is strictly greater than ``threshold`` are
    kept. Anything that is not a list yields an empty list.
    """
    if isinstance(row, list):
        return [
            [entry["type"], entry["normalized_text"]]
            for entry in row
            if entry["probability"] > threshold
        ]
    return []


def flatten_mentions(row: list):
    """Return the ``username`` of every mention dict in ``row``.

    Anything that is not a list yields an empty list.
    """
    return [entry["username"] for entry in row] if isinstance(row, list) else []

In [None]:
# Re-read the same parquet file into `df`. This replaces the frame whose `text` column
# was cleaned earlier, so the cells below work on the data as stored on disk.
df = pd.read_parquet("data/slapgate_twitter_mymodel2")

In [None]:
# Find the rows whose `entities` value cannot be parsed as JSON once single quotes are
# swapped for double quotes, report how many there are, and drop them.
bad_rows = list()
for idx, entity in enumerate(df["entities"]):
    try:
        # The quote swap sits inside the `try` so that a value without `.replace`
        # (i.e. not a string) is recorded as a bad row instead of aborting the loop
        # with an uncaught AttributeError.
        json.loads(entity.replace("'", '"'))
    except (AttributeError, JSONDecodeError):
        bad_rows.append(idx)
counter = len(bad_rows)
print(counter)
df = df.drop(df.index[bad_rows])

In [None]:
# Parse every `entities` string and spread the resulting dicts into columns.
parsed_entities = df["entities"].str.replace("'", '"').apply(json.loads)
df_entities = pd.json_normalize(parsed_entities.tolist())
# `json_normalize` numbers its rows 0..n-1, while `df` keeps its own labels (with gaps
# once unparseable rows were dropped). Reusing `df`'s labels keeps the two frames
# aligned row by row for the index-based merge that follows.
df_entities.index = df.index
df_entities["annotations"] = df_entities["annotations"].apply(flatten_annotations)
df_entities["mentions"] = df_entities["mentions"].apply(flatten_mentions)

In [None]:
# Add the flattened annotations and mentions to `df` (joined on the index) and drop the
# raw `entities` column. The two columns are picked by name: the cells below read
# `df["annotations"]` and `df["mentions"]`, and a positional pick would depend on the
# column order that `json_normalize` happened to produce.
df = pd.merge(
    df,
    df_entities[["annotations", "mentions"]],
    left_index=True,
    right_index=True,
).drop("entities", axis=1)

In [None]:
# Flatten the per-tweet lists into long format: one [tweet id, type, text] row per
# annotation and one [tweet id, username] row per mention.
annotations = list()
mentions = list()
# Iterating the three columns side by side avoids building a full row with `df.iloc`
# three times per tweet.
tweet_rows = zip(df["_id"], df["annotations"], df["mentions"])
for _id, anns, mens in tqdm(tweet_rows, total=len(df)):
    annotations.extend([_id, an[0], an[1]] for an in anns)
    mentions.extend([_id, men] for men in mens)

In [None]:
# Long-format tables using the `ids` column name that `create_nodes` / `create_edges`
# expect; `pnoun` holds the annotation's normalized text.
df_annotations = pd.DataFrame(annotations, columns=["ids", "type", "pnoun"])
df_mentions = pd.DataFrame(mentions, columns=["ids", "mentions"])

In [None]:
# How many kept annotations there are per annotation type.
df_annotations["type"].value_counts()

In [None]:
# Co-mention network: usernames that are mentioned together in the same tweet.
tag = "mentions"
mentions_nodes = create_nodes(df_mentions, tag, 75)
mentions_edges = create_edges(df_mentions, mentions_nodes, tag, 2)
mentions_G = nx.Graph()
mentions_G.add_nodes_from(
    (name, {"nodesize": size}) for name, size in zip(mentions_nodes[tag], mentions_nodes["n"])
)
mentions_G.add_weighted_edges_from(
    zip(mentions_edges[f"{tag}1"], mentions_edges[f"{tag}2"], mentions_edges["value"])
)
mentions_nodes_l = create_partition(mentions_G, tag, mentions_nodes)

In [None]:
# Render the co-mention graph; node colors come from `mentions_nodes_l.louvain`.
draw_graph(mentions_G, size=(20, 12), nodes=mentions_nodes_l, k=3, iterations=60)

In [None]:
# Co-occurrence network of the annotation texts (`pnoun`) found in the same tweet.
tag = "pnoun"
annotations_nodes = create_nodes(df_annotations, tag, 75)
annotations_edges = create_edges(df_annotations, annotations_nodes, tag, 2)
annotations_G = nx.Graph()
annotations_G.add_nodes_from(
    (name, {"nodesize": size})
    for name, size in zip(annotations_nodes[tag], annotations_nodes["n"])
)
annotations_G.add_weighted_edges_from(
    zip(
        annotations_edges[f"{tag}1"],
        annotations_edges[f"{tag}2"],
        annotations_edges["value"],
    )
)
annotations_nodes_l = create_partition(annotations_G, tag, annotations_nodes)

In [None]:
# Render the annotation co-occurrence graph; node colors come from `annotations_nodes_l.louvain`.
draw_graph(annotations_G, size=(20, 12), nodes=annotations_nodes_l, k=3, iterations=60)