In [2]:
from typing import List, Any, Dict, Tuple, Set, Iterable, Sequence
from operator import itemgetter
from itertools import combinations, starmap, groupby, product, chain, islice

import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import networkx as nx
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from conversant.conversation import Conversation
from conversant.conversation.parse import DataFrameConversationReader
from conversant.interactions import InteractionsGraph
from conversant.interactions.interactions_graph import PairInteractionsData
from conversant.interactions.reply_interactions_parser import get_reply_interactions_parser
from stance_classification.classifiers.base_stance_classifier import BaseStanceClassifier
from stance_classification.classifiers.greedy_stance_classifier import MSTStanceClassifier
from stance_classification.data.iac.convinceme_conversation_parser import ConvinceMeConversationParser
from stance_classification.draw_utils import new_figure
%matplotlib inline

from stance_classification.classifiers.maxcut_stance_classifier import MaxcutStanceClassifier

In [5]:
# Load the ConvinceMe posts file into the working frame `df`.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
data_path = "/home/dev/data/stance/convinceme/posts.txt"

df = pd.read_csv(data_path)
# Disabled: casting the id columns to strings.
# df["post_id"] = df["post_id"].astype(str)
# df["discussion_id"] = df["discussion_id"].astype(str)
# df["author_id"] = df["author_id"].astype(str)
df

Unnamed: 0,discussion_id,post_id,parent_post_id,parent_missing,author_id,username,timestamp,text,discussion_stance,topic,stance,votes
0,1,1,,0,10,king,2007-01-02 00:00:00,"I have never used iMovie before, but I have us...","Mac's iMovie is easier to use, and produces a ...",,,58
1,1,2,,0,12,mknorpp,2007-01-04 00:00:00,EVERYTHING is difficult to do on a PC...Mac's ...,"Mac's iMovie is easier to use, and produces a ...",,,59
2,1,3,,0,1,kittycatmeowmeow,2007-02-01 00:00:00,Everything may be hard to do on a PC if you ar...,"Windows Media Maker is easier to use, and prod...",,,1
3,1,4,2.0,0,1,kittycatmeowmeow,2007-02-01 00:00:00,I have a website.\nI do not have a mac.\nI hav...,"Windows Media Maker is easier to use, and prod...",,,0
4,1,5,6.0,0,1,kittycatmeowmeow,2007-02-01 00:00:00,Maybe you should use better software for the m...,"Windows Media Maker is easier to use, and prod...",,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
65363,5413,1,,0,5782,Rich Dellinger,2012-07-28 00:00:00,TV is just a tool. It is how you use it that m...,No,,,1
65364,5413,2,,0,3747,Leonardo Carvalho,2012-08-05 00:00:00,"The concept of TV and media is brilliant, incr...",No,,,1
65365,5413,3,,0,5783,Samuel Hermansson,2012-08-06 00:00:00,I think that TV has helped us with many things...,Yes,,,0
65366,5413,4,,0,5649,Hany Mohammed,2012-08-16 00:00:00,I will use analogy to explain…\nIf I present a...,No,,,1


In [10]:
# Load the discussions file; the frame is only displayed and counted by topic below.
# NOTE(review): absolute, machine-specific path.
discussion_path = "/home/dev/data/stance/convinceme/discussions.csv"
discussions_df = pd.read_csv(discussion_path)
discussions_df

Unnamed: 0,discussion_id,discussion_url,title,topic
0,1,http://www.convinceme.net/debates/1/Windows-Me...,Windows Media Maker 2.0 vs. Mac iMovie,
1,2,http://www.convinceme.net/debates/2/The-Beatle...,The Beatles vs. The Rolling Stones,
2,3,http://www.convinceme.net/debates/3/Is-Sufjan-...,Is Sufjan Stevens Gay?,
3,4,http://www.convinceme.net/debates/4/2008-Presi...,2008 Presidential Elections: Rudy Giuliani vs....,
4,5,http://www.convinceme.net/debates/5/Ninjas-ver...,Ninjas versus Pirates,
...,...,...,...,...
5408,5409,http://www.convinceme.net/debates/2069181831/I...,Is is right that Marlins manager Ozzie Guillen...,
5409,5410,http://www.convinceme.net/debates/2090410726/J...,JESUS CHRIST VS MUHAMMAD,
5410,5411,http://www.convinceme.net/debates/2095956490/s...,should people get victimised for putting a sta...,
5411,5412,http://www.convinceme.net/debates/2103454116/S...,Should (Norwegian) schools have mandatory trip...,


In [11]:
# Count how many discussions carry each value of the `topic` column
# (missing topics are counted under NaN).
# `Counter` is imported here because, in notebook order, this cell comes before
# the cell that imports it; without this line a fresh kernel raises NameError.
from collections import Counter

Counter(discussions_df["topic"])

Counter({nan: 5226,
         'gay marriage': 18,
         'climate change': 11,
         'evolution': 17,
         'communism vs capitalism': 8,
         'marijuana legalization': 18,
         'gun control': 20,
         'abortion': 28,
         'Israel/Palestine': 2,
         'existence of God': 18,
         'immigration': 6,
         'death penalty': 27,
         'legalized prostitution': 5,
         'vegetarianism': 3,
         'women in the military': 1,
         'minimum wage: pro or con': 1,
         'obamacare': 4})

In [9]:
# Count how many posts carry each value of the `topic` column.
# NOTE(review): imports normally belong in the first cell of the notebook.
from collections import Counter
Counter(df["topic"])

Counter({nan: 58752,
         'gay marriage': 971,
         'climate change': 368,
         'evolution': 882,
         'communism vs capitalism': 231,
         'marijuana legalization': 455,
         'gun control': 391,
         'abortion': 1077,
         'Israel/Palestine': 67,
         'existence of God': 998,
         'immigration': 195,
         'death penalty': 600,
         'legalized prostitution': 145,
         'vegetarianism': 58,
         'women in the military': 34,
         'minimum wage: pro or con': 17,
         'obamacare': 127})

In [3]:
def load_to_dict(path: str) -> dict:
    """Load a two-column, tab-separated text file into a dictionary.

    Every non-blank line must look like ``key<TAB>value``. Only the first tab
    splits the line, so a value may itself contain tabs. Whitespace around the
    whole line is stripped before splitting. Blank lines (for example a
    trailing empty line at the end of the file) are skipped instead of making
    the load fail.

    :param path: path of the file to read.
    :return: mapping from the first column to the second; when a key repeats,
        the last occurrence wins.
    :raises ValueError: if a non-blank line contains no tab separator.
    """
    mapping = {}
    with open(path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines rather than failing inside dict().
                continue
            key, separator, value = stripped.partition('\t')
            if not separator:
                raise ValueError(
                    f"{path}:{line_number}: expected 'key<TAB>value', got {stripped!r}"
                )
            mapping[key] = value
    return mapping


# Build the discussion -> topic-name lookup from two tab-separated files.
# NOTE(review): absolute, machine-specific paths.
topic_path = "/home/dev/data/stance/IAC/alternative/convinceme/topic.txt"
topics_map = load_to_dict(topic_path)
# topics_map["unknown"] = "unknown"
# topics_map["other"] = "other"

discussion_topic_path = "/home/dev/data/stance/IAC/alternative/convinceme/discussion_topic.txt"
# Keys become integer discussion ids; each value of the second file is resolved through
# topics_map, so a value missing from topics_map raises KeyError here.
discussion_topic_map = {int(k): topics_map[v] for k, v in load_to_dict(discussion_topic_path).items()}
len(discussion_topic_map)


187

In [4]:
# Attach the topic name to every post via its discussion id; discussions that
# are absent from the lookup get None.
df["topic"] = df["discussion_id"].apply(discussion_topic_map.get)
df["topic"].unique()

array([None, 'gay marriage', 'climate change', 'evolution',
       'communism vs capitalism', 'marijuana legalization', 'gun control',
       'abortion', 'Israel/Palestine', 'existence of God', 'immigration',
       'death penalty', 'legalized prostitution', 'vegetarianism',
       'women in the military', 'minimum wage: pro or con', 'obamacare'],
      dtype=object)

In [5]:
def _parent_or_root_id(row):
    """Return the row's parent id, or "<discussion_id>.0" when the parent is missing."""
    parent = row["parent_post_id"]
    if pd.isna(parent):
        return str(row["discussion_id"]) + ".0"
    return parent


# Posts without a parent are attached to a per-discussion root id.
df["parent_post_id"] = df.apply(_parent_or_root_id, axis=1)
df["parent_post_id"]

0           1.0
1           1.0
2           1.0
3             2
4             6
          ...  
65363    5413.0
65364    5413.0
65365    5413.0
65366    5413.0
65367    5413.0
Name: parent_post_id, Length: 65368, dtype: object

#### Add the first post to the dataframe
Add one record per discussion for the post that started it, so that the conversation parser includes it as a node of the conversation.


In [6]:
# One synthetic root record per discussion. Its post_id is "<discussion_id>.0"
# and it has no parent; the remaining fields are placeholder values.
new_records = [
    {
        "topic": discussion_topic_map.get(discussion_id, None),
        "discussion_id": discussion_id,
        "post_id": str(discussion_id) + ".0",
        "author_id": "!UNK",
        "creation_date": "00:00",
        "parent_post_id": None,
        "parent_missing": 0,
        "text_id": -1,
        "points": 0,
        "discussion_stance_id": 0.5,
        "is_rebuttal": None,
    }
    for discussion_id in df["discussion_id"].unique()
]

In [7]:
# Append the synthetic root records to the posts frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# NOTE: this cell is not idempotent; re-running it appends the root records again.
df = pd.concat([df, pd.DataFrame(new_records)], ignore_index=True)
df

Unnamed: 0,discussion_id,post_id,author_id,creation_date,parent_post_id,parent_missing,text_id,points,discussion_stance_id,is_rebuttal,topic
0,1,1,10,2007-01-02 00:00:00,1.0,0,1,58,1.0,1,
1,1,2,12,2007-01-04 00:00:00,1.0,0,2,59,1.0,1,
2,1,3,1,2007-02-01 00:00:00,1.0,0,3,1,0.0,1,
3,1,4,1,2007-02-01 00:00:00,2,0,4,0,0.0,1,
4,1,5,1,2007-02-01 00:00:00,6,0,5,0,0.0,1,
...,...,...,...,...,...,...,...,...,...,...,...
70776,5409,5409.0,!UNK,00:00,,0,-1,0,0.5,,
70777,5410,5410.0,!UNK,00:00,,0,-1,0,0.5,,
70778,5411,5411.0,!UNK,00:00,,0,-1,0,0.5,,
70779,5412,5412.0,!UNK,00:00,,0,-1,0,0.5,,


In [8]:
# Maps each field the conversation reader needs to the dataframe column holding it.
# (`pasre_strategy` is a misspelling of "parse"; the name is kept unchanged.)
pasre_strategy = {
    "node_id": "post_id",
    "author": "author_id",
    "timestamp": "creation_date",
    "parent_id": "parent_post_id",
}
parser = DataFrameConversationReader(pasre_strategy)
gb = df.groupby("discussion_id")
# Parse one conversation per discussion; each group is the sub-frame of that discussion.
convs: List[Conversation] = list(tqdm(parser.parse(posts) for _, posts in gb))
len(convs)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




5413

In [9]:
# pasre_strategy = {
#     "node_id": "post_id",
#     "author": "author_id",
#     "timestamp": "creation_date",
#     "parent_id": "parent_post_id"
#     }
#
# def build_conversations(data: pd.DataFrame) -> Iterable[Conversation]:
#     parser = ConvinceMeConversationParser()
#     for discussion_id, posts in data.groupby("discussion_id"):
#         conversation = parser.parse((discussion_id, posts))
#         yield conversation

In [10]:
# convs = list(tqdm(build_conversations(df)))
# len(convs)

In [11]:
# Every child of a conversation root becomes the root of its own sub-conversation.
# NOTE(review): the id is a plain concatenation of two stringified ids with no
# separator, so distinct (conversation, child) pairs can in principle collide.
sub_convs = [
    Conversation(reply, conversation_id=str(parent.id) + str(reply.node_id))
    for parent in convs
    for reply in parent.root.children
]
len(sub_convs)

32220

### create post labels dictionary

In [12]:
# Gold stance per post, keyed by (sub-conversation id, post id). Posts whose
# stance is 0.5 are left out.
post_labels = {
    (sub_conv.id, post.node_id): post.data["discussion_stance_id"]
    for sub_conv in sub_convs
    for _depth, post in sub_conv.iter_conversation()
    if post.data["discussion_stance_id"] != 0.5
}
len(post_labels)


65368

In [13]:
def get_majority_vote(labels: List[int]) -> int:
    """Return 1 when the mean of ``labels`` is at least 0.5 (ties go to 1), else 0."""
    mean_label = np.mean(labels)
    return 1 if mean_label >= 0.5 else 0

def create_author_labels(c: Conversation) -> Dict[Any, int]:
    """Derive one stance label per author of conversation ``c``.

    Post labels are read from the module-level ``post_labels`` dictionary under
    the key ``(c.id, node_id)``; posts without a label are ignored. Each author
    receives the majority vote over the labels of their own posts.

    :param c: conversation whose nodes are iterated.
    :return: mapping author -> 0/1 label, containing only authors that have at
        least one labeled post.
    """
    conv_id = c.id
    votes_by_author = {}
    for _depth, node in c.iter_conversation():
        author = node.author
        label = post_labels.get((conv_id, node.node_id))
        if label is not None:
            votes_by_author.setdefault(author, []).append(label)

    return {author: get_majority_vote(votes) for author, votes in votes_by_author.items()}

# Author labels for every sub-conversation, then drop the conversations that
# have no labeled author or whose only entry is keyed by None.
author_labels_per_conversation = {c.id: create_author_labels(c) for c in sub_convs}
author_labels_per_conversation = {
    conv_id: labels
    for conv_id, labels in author_labels_per_conversation.items()
    if labels and (len(labels) != 1 or None not in labels)
}
print(len(author_labels_per_conversation))
print(sum(map(len, author_labels_per_conversation.values())))

32220
49930


In [14]:
def get_ordered_candidates_for_pivot(graph: nx.Graph, weight_field: str = "weight") -> Sequence[Any]:
    """Rank the nodes of ``graph`` from most to least central.

    Each edge payload gets an ``"inv_weight"`` entry equal to ``1 / weight``,
    which is then used as the distance for closeness centrality, so heavier
    edges count as shorter. The edge payloads are modified in place.

    :param graph: graph whose edge payloads expose a ``.data`` mapping.
    :param weight_field: key of the edge weight inside that mapping.
    :return: list of nodes sorted by descending closeness centrality.
    """
    inv_weight_field = "inv_weight"
    for _source, _target, pair_data in graph.edges(data=True):
        pair_data.data[inv_weight_field] = 1 / pair_data.data[weight_field]

    centrality_by_node = nx.closeness_centrality(graph, distance=inv_weight_field)
    return sorted(centrality_by_node, key=centrality_by_node.get, reverse=True)

def get_pivot_node(graph: nx.Graph, labeled_authors: Set[Any], weight_field: str = "weight") -> Any:
    """Return the most central node of ``graph`` that is in ``labeled_authors``.

    :param graph: interaction graph to rank.
    :param labeled_authors: container supporting ``in`` with the labeled authors.
    :param weight_field: edge weight key forwarded to the ranking.
    :return: the first labeled candidate in centrality order, or None if there is none.
    """
    for candidate in get_ordered_candidates_for_pivot(graph, weight_field=weight_field):
        if candidate in labeled_authors:
            return candidate
    return None

In [15]:
def extend_preds(graph: nx.Graph, seed_node: Any, core_authors_preds: Dict[Any, int]) -> Dict[Any, int]:
    """Propagate predictions from the core to the nodes reachable from ``seed_node``.

    The graph is walked breadth-first from ``seed_node``. A node without a
    prediction receives the opposite label (``1 - label``) of the node it was
    reached from; existing predictions are never overwritten.

    :param graph: full interaction graph.
    :param seed_node: node the traversal starts from.
    :param core_authors_preds: predictions to start from (left unmodified).
    :return: a new dictionary holding the original and the propagated predictions.
    """
    extended_results = dict(core_authors_preds)
    for parent, child in nx.bfs_edges(graph, source=seed_node):
        if child in extended_results:
            continue
        extended_results[child] = 1 - extended_results[parent]

    return extended_results

def get_authors_labels_in_conv(conv: Conversation) -> Dict[Any, int]:
    """Look up the author labels of ``conv`` in ``author_labels_per_conversation``.

    :return: the author -> label mapping, or None when the conversation has no entry.
    """
    return author_labels_per_conversation.get(conv.id)

def get_author_preds(clf: BaseStanceClassifier, pivot: Any) -> Dict[Any, int]:
    """Turn a classifier's two-way partition into 0/1 predictions per author.

    The supporters side receives the gold label of ``pivot`` and the complement
    side receives the opposite label.

    NOTE: the pivot's label is read from the module-level name ``authors_labels``
    (it is not a parameter), so that name must be bound before calling.

    :param clf: classifier exposing ``get_supporters()`` and ``get_complement()``.
    :param pivot: author whose gold label anchors the partition.
    :return: mapping author -> predicted label.
    """
    support_label = authors_labels[pivot]
    opposer_label = 1 - support_label
    supporters, opposers = clf.get_supporters(), clf.get_complement()

    preds = {author: support_label for author in supporters}
    # Applied second, so an author listed on both sides ends with the opposer label.
    preds.update((author, opposer_label) for author in opposers)
    return preds

def get_maxcut_results(graph: InteractionsGraph, op: Any) -> MaxcutStanceClassifier:
    """Run the max-cut stance classifier on ``graph`` and return the classifier.

    :param graph: interactions graph; its ``WEIGHT_FIELD`` names the edge weight.
    :param op: node passed to ``set_input`` alongside the graph.
    :return: the classifier after ``classify_stance()`` has been called.
    """
    classifier = MaxcutStanceClassifier(weight_field=graph.WEIGHT_FIELD)
    classifier.set_input(graph.graph, op)
    classifier.classify_stance()
    return classifier

def get_greedy_results(graph: InteractionsGraph, op: Any) -> BaseStanceClassifier:
    """Run the MST stance classifier on ``graph`` and return the classifier.

    The classifier is built with its default arguments; passing
    ``weight_field=graph.WEIGHT_FIELD`` is deliberately not done here.

    :param graph: interactions graph to classify.
    :param op: node passed to ``classify_stance``.
    :return: the classifier after classification.
    """
    classifier = MSTStanceClassifier()
    classifier.set_input(graph.graph)
    classifier.classify_stance(op)
    return classifier

def align_gs_with_predictions(authors_labels: Dict[Any, int], author_preds: Dict[Any, int]) -> Tuple[List[int], List[int]]:
    """Pair gold labels with predictions for the keys that have a prediction.

    Keys of ``authors_labels`` that are missing from ``author_preds`` (or whose
    prediction is None) are dropped. Order follows ``authors_labels``.

    :return: ``(y_true, y_pred)`` as two lists of equal length.
    """
    aligned = [
        (true_label, author_preds.get(author))
        for author, true_label in authors_labels.items()
    ]
    y_true = [gold for gold, pred in aligned if pred is not None]
    y_pred = [pred for _gold, pred in aligned if pred is not None]
    return y_true, y_pred

def predict_for_partition(true: List[int], preds: List[int]) -> Tuple[List[int], List[int]]:
    """Return ``true`` with whichever labeling of the partition scores better.

    When the accuracy of ``preds`` against ``true`` is below 0.5, every
    prediction is flipped (``1 - label``); otherwise ``preds`` is returned as is.
    """
    needs_flip = accuracy_score(true, preds) < 0.5
    chosen = [1 - label for label in preds] if needs_flip else preds
    return true, chosen

def get_best_preds(true_labels: Dict[Any, int], pred_labels: Dict[Any, int]) -> Dict[Any, int]:
    """Return the predictions under the better of the two label assignments.

    Accuracy is measured on the keys shared by both mappings. If it is below
    0.5, a new mapping with every prediction flipped (``1 - label``) is
    returned; otherwise ``pred_labels`` itself is returned.
    """
    y_true, y_pred = align_gs_with_predictions(true_labels, pred_labels)
    should_flip = accuracy_score(y_true, y_pred) < 0.5
    if not should_flip:
        return pred_labels

    return {key: 1 - label for key, label in pred_labels.items()}

def get_posts_preds(conv: Conversation, post_labels: Dict[Any, int], author_preds: Dict[Any, int]) -> Tuple[Dict[Any, int], Dict[Any, int]]:
    """Project author-level predictions onto the posts of ``conv``.

    A post is included only when it has a gold label under the key
    ``(conv.id, node_id)`` and its author has a prediction.

    :param conv: conversation whose nodes are iterated.
    :param post_labels: gold labels keyed by ``(conversation id, post id)``.
    :param author_preds: predicted label per author.
    :return: ``(posts_true, posts_pred)``, both keyed by post id.
    """
    posts_true, posts_pred = {}, {}
    conv_id = conv.id
    for _depth, node in conv.iter_conversation():
        post_id = node.node_id
        label = post_labels.get((conv_id, post_id))
        if label is not None:
            pred = author_preds.get(node.author)
            if pred is not None:
                posts_true[post_id] = label
                posts_pred[post_id] = pred

    return posts_true, posts_pred


In [16]:
# Parser that turns a conversation into an interactions graph.
interactions_parser = get_reply_interactions_parser()

# Per-conversation artifacts, all keyed by conversation id and filled by the loop below.
convs_by_id: Dict[Any, Conversation] = {}
full_graphs: Dict[Any, InteractionsGraph] = {}
core_graphs: Dict[Any, InteractionsGraph] = {}
maxcut_results: Dict[Any, MaxcutStanceClassifier] = {}
pivot_nodes = {}

# conversation id -> predictor name -> {key: predicted label}
author_predictions: Dict[Any, Dict[str, Dict[Any, int]]] = {}
posts_predictions: Dict[Any, Dict[str, Dict[Any, int]]] = {}



# Indices (positions in sub_convs) of conversations skipped for each reason.
empty_core = []
unlabeled_conversations = []
unlabeled_op = []
insufficient_author_labels = []
too_small_cut_value = []
op_not_in_core = []
large_graphs = []
single_author_conv = []

# Run switches; naive_results=True stops after the MST predictions.
extend_results = False
naive_results = False

def calc_weight(interactions: PairInteractionsData) -> float:
    """Return the weight of an author pair: its ``"replies"`` entry.

    The ``"quotes"`` entry is an alternative weight that is currently not used.
    """
    return interactions["replies"]

"""abortion = 3
   evolution = 7
   gay marriage = 8
   gun control = 9
   """
# convs[0].root.data["topic"]
# conv: Conversation
# Classify author stance in every sub-conversation.
# Each conversation is either skipped (its index is recorded in one of the
# reason lists) or gets an entry in author_predictions with up to three
# predictors: "mst" (always), "core" (max-cut on the core graph) and "full"
# (core predictions propagated over the whole interaction graph).
count_conv = 0
for i, conv in tqdm(enumerate(sub_convs)):
    # topic = conv.root.data["topic"]
    # if topic not in relevant_topics: continue

    count_conv += 1
    # NOTE: the name `authors_labels` is load-bearing: get_author_preds() reads
    # it as a module-level variable to look up the pivot's label.
    authors_labels = get_authors_labels_in_conv(conv)
    if authors_labels is None:
        unlabeled_conversations.append(i)
        continue

    if len(authors_labels) == 0:
        insufficient_author_labels.append(i)
        continue

    # Checked before the graph is built: single-author conversations are
    # discarded anyway, so parsing their interactions first is wasted work.
    if len(conv.participants) <= 1:
        single_author_conv.append(i)
        continue

    interaction_graph = interactions_parser.parse(conv)
    interaction_graph.set_interaction_weights(calc_weight)
    # Drop zero-weight edges; the pivot ranking divides by the edge weight.
    zero_edges = [(v, u) for v, u, d in interaction_graph.graph.edges(data=True) if d["weight"] == 0]
    interaction_graph.graph.remove_edges_from(zero_edges)

    pivot_node = get_pivot_node(interaction_graph.graph, authors_labels, weight_field="weight")
    if pivot_node is None:
        # get_pivot_node returns None when no node of the graph is labeled;
        # get_author_preds would then fail with KeyError on the pivot lookup.
        unlabeled_op.append(i)
        continue

    convs_by_id[conv.id] = conv
    full_graphs[conv.id] = interaction_graph
    pivot_nodes[conv.id] = pivot_node

    mst = get_greedy_results(interaction_graph, pivot_node)
    preds = get_author_preds(mst, pivot_node)
    author_predictions[conv.id] = {"mst": preds}

    if naive_results:
        continue

    core_interactions = interaction_graph.get_core_interactions()
    core_graphs[conv.id] = core_interactions
    if core_interactions.graph.size() == 0:
        empty_core.append(i)
        continue

    # Only the first connected component of the core is classified.
    components = list(nx.connected_components(core_interactions.graph))
    core_interactions = core_interactions.get_subgraph(components[0])
    pivot_node = get_pivot_node(core_interactions.graph, authors_labels, weight_field="weight")
    if pivot_node is None:
        # No labeled author inside the selected core component: keep only the
        # "mst" predictions for this conversation.
        op_not_in_core.append(i)
        continue

    maxcut = get_maxcut_results(core_interactions, pivot_node)
    if maxcut.cut_value < 3:
        too_small_cut_value.append(i)
        continue

    maxcut_results[conv.id] = maxcut

    # if core_interactions.graph.order() > 120:
    #     large_graphs.append(conv)
    #     continue

    preds = get_author_preds(maxcut, pivot_node)
    author_predictions[conv.id]["core"] = preds

    # get extended results
    preds = extend_preds(interaction_graph.graph, pivot_node, preds)
    author_predictions[conv.id]["full"] = preds

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [17]:
# Summary of the classification run.
# `count_conv` counts the sub-conversations the loop iterated over, while
# `len(convs)` counts the original discussions, so the first two lines are
# not measured in the same unit.
print(f"total number of conversations (in all topics): {len(convs)}")
print(f"total number of conversations (in the relevant topics): {count_conv}")
print(f"total number of conversations with labeled authors (in all topics): {len(author_labels_per_conversation)}")
print(f"total number of conversations with labeled authors (in the relevant topics): {count_conv - len(unlabeled_conversations)}")

print(f"number of conversations in eval: {len(convs_by_id)}")
# Distinct authors that received an "mst" prediction / a "core" prediction in any conversation.
all_authors_in_eval = set(chain(*[predictions["mst"].keys() for predictions in author_predictions.values()]))
print(f"number of unique authors in eval: {len(all_authors_in_eval)}")
all_authors_in_core_eval = set(chain(*[predictions.get("core", {}).keys() for predictions in author_predictions.values()]))
print(f"number of unique authors in core: {len(all_authors_in_core_eval)}")

# Sum over conversations, so an author active in several conversations is counted once per conversation.
labeled_authors = sum(len(v) for v in author_labels_per_conversation.values())
print(f"total number of labeled authors: {labeled_authors}")
print("=========")
# Sizes of the skip-reason lists filled by the classification loop.
print(f"number of conversations with single author: {len(single_author_conv)}")
print(f"number of conversations with empty core: {len(empty_core)}")
print(f"number of conversations with op not in core: {len(op_not_in_core)}")
print(f"number of conversations with too large core: {len(large_graphs)}")
print(f"number of conversations with too small cut value: {len(too_small_cut_value)}")
print(f"number of unlabeled conversations: {len(unlabeled_conversations)}")
print(f"number of conversations with unlabeled op: {len(unlabeled_op)}")
print(f"number of conversations with insufficient labeled authors: {len(insufficient_author_labels)}")

total number of conversations (in all topics): 5413
total number of conversations (in the relevant topics): 32220
total number of conversations with labeled authors (in all topics): 32220
total number of conversations with labeled authors (in the relevant topics): 32220
number of conversations in eval: 9521
number of unique authors in eval: 3641
number of unique authors in core: 490
total number of labeled authors: 49930
number of conversations with single author: 22699
number of conversations with empty core: 8970
number of conversations with op not in core: 0
number of conversations with too large core: 0
number of conversations with too small cut value: 51
number of unlabeled conversations: 0
number of conversations with unlabeled op: 0
number of conversations with insufficient labeled authors: 0


In [40]:
# Collect per-topic accuracies at author level and at post level.
# NOTE(review): recall_score, precision_score and `others` are not used in this cell.
from sklearn.metrics import recall_score, precision_score
relevant_topics = set(discussion_topic_map.values())
# topic -> (author accuracies, post accuracies, author counts, post counts), one entry per conversation
results = {}
others = {}
for conv_id, prediction in author_predictions.items():
    conv = convs_by_id[conv_id]
    topic = conv.root.data["topic"]
    # Conversations whose topic is not one of the mapped topics are pooled under "other".
    if  topic not in relevant_topics:
        topic = "other"

    author_labels = get_authors_labels_in_conv(conv)
    # Prefer the "full" predictions and fall back to "mst" when they are absent.
    author_preds = prediction.get("full", prediction["mst"])
    # get_best_preds flips the predictions when their accuracy against the gold labels is below 0.5.
    best_preds = get_best_preds(author_labels, author_preds)
    y_true, y_pred = align_gs_with_predictions(author_labels, best_preds)
    acc = accuracy_score(y_true, y_pred)

    # Post level: each post inherits its author's prediction, then the same flip rule is applied.
    posts_true, posts_preds = get_posts_preds(conv, post_labels, author_preds)
    best_preds = get_best_preds(posts_true, posts_preds)
    p_true, p_pred = align_gs_with_predictions(posts_true, best_preds)
    p_acc = accuracy_score(p_true, p_pred)

    accuracies, p_accuracies, sizes, psizes = results.setdefault(topic, ([], [], [], []))
    accuracies.append(acc)
    p_accuracies.append(p_acc)
    sizes.append(len(y_true))
    psizes.append(len(p_true))
    # print(f"{topic} {len(sizes)}\t{sum(sizes)}\t{macro_acc}\t{micro_acc}")
len(results)

17

In [43]:
# Print one LaTeX table row per topic: name, number of evaluated posts and the
# macro-averaged post accuracy (mean of the per-conversation accuracies).
for topic, (accuracies, p_accuracies, sizes, psizes) in results.items():
    macro_acc = np.mean(accuracies)
    p_acc = np.mean(p_accuracies)
    micro_acc = np.dot(accuracies, sizes) / sum(sizes)
    # Post accuracies are weighted by post counts, so they must be normalised by
    # the total number of posts (psizes), not by the number of authors (sizes).
    micro_p_acc = np.dot(p_accuracies, psizes) / sum(psizes)
    print(f" {topic.title()} & {sum(psizes)} & {p_acc:.2f} & - \\\\")

 Other & 37537 & 0.95 & - \\
 Gay Marriage & 708 & 0.98 & - \\
 Evolution & 688 & 0.99 & - \\
 Communism Vs Capitalism & 185 & 0.99 & - \\
 Marijuana Legalization & 261 & 0.98 & - \\
 Gun Control & 314 & 0.95 & - \\
 Abortion & 834 & 0.96 & - \\
 Climate Change & 255 & 1.00 & - \\
 Israel/Palestine & 36 & 1.00 & - \\
 Existence Of God & 842 & 0.98 & - \\
 Immigration & 166 & 0.87 & - \\
 Death Penalty & 474 & 0.98 & - \\
 Legalized Prostitution & 108 & 0.88 & - \\
 Vegetarianism & 43 & 1.00 & - \\
 Women In The Military & 22 & 1.00 & - \\
 Minimum Wage: Pro Or Con & 14 & 0.95 & - \\
 Obamacare & 101 & 0.98 & - \\


### Evaluate Results

In [None]:
# Distinct authors that received an "mst" prediction in any conversation.
all_authors_in_eval = {
    author
    for predictions in author_predictions.values()
    for author in predictions["mst"].keys()
}
print(len(all_authors_in_eval))

In [None]:
# Distinct authors that received a "core" prediction in any conversation.
all_authors_in_core_eval = {
    author
    for predictions in author_predictions.values()
    for author in predictions.get("core", {}).keys()
}
len(all_authors_in_core_eval)

In [None]:
# Number of conversations that have a "core" prediction.
len([p for p in author_predictions.values() if "core" in p])

In [None]:
# Author-level evaluation, pooled over all conversations, for each predictor.
# Two variants are reported: the predictions as produced, and the "best partition",
# where each conversation's predictions are flipped if that raises its accuracy.
for predictor in ["core", "full", "mst"]:
    all_true, all_pred = [], []
    all_true_best, all_pred_best = [], []

    for conv_id, predictions in author_predictions.items():
        conv = convs_by_id[conv_id]
        author_labels = get_authors_labels_in_conv(conv)
        author_preds = predictions.get(predictor, None)
        # Conversations without this predictor are left out of its evaluation.
        if author_preds is None: continue

        y_true, y_pred = align_gs_with_predictions(author_labels, author_preds)
        all_true.extend(y_true)
        all_pred.extend(y_pred)

        best_preds = get_best_preds(author_labels, author_preds)
        y_true, y_pred = align_gs_with_predictions(author_labels, best_preds)
        all_true_best.extend(y_true)
        all_pred_best.extend(y_pred)

    acc = accuracy_score(all_true, all_pred)
    acc_best = accuracy_score(all_true_best, all_pred_best)
    print(f"accuracy: {acc}")
    print(f"accuracy-best: {acc_best}")
    print(f"Showing results of predictor: {predictor}")
    print(classification_report(all_true, all_pred))
    print(f"\n\t\tResults for best partition (regardless for stance assignment")
    print(classification_report(all_true_best, all_pred_best))
    print("-----------------------------------------------------------------------------")


In [None]:
# Post-level evaluation, pooled over all conversations, for each predictor.
# Each post inherits the prediction of its author; the "best partition" variant
# flips a conversation's post predictions if that raises its accuracy.
for predictor in ["core", "full", "mst"]:
    all_true, all_pred = [], []
    all_true_best, all_pred_best = [], []

    for conv_id, predictions in author_predictions.items():
        conv = convs_by_id[conv_id]
        # NOTE(review): author_labels is assigned but not used in this loop.
        author_labels = get_authors_labels_in_conv(conv)
        author_preds = predictions.get(predictor, None)
        # Conversations without this predictor are left out of its evaluation.
        if author_preds is None: continue

        posts_true, posts_preds = get_posts_preds(conv, post_labels, author_preds)

        y_true, y_pred = align_gs_with_predictions(posts_true, posts_preds)
        all_true.extend(y_true)
        all_pred.extend(y_pred)

        best_preds = get_best_preds(posts_true, posts_preds)
        y_true, y_pred = align_gs_with_predictions(posts_true, best_preds)
        all_true_best.extend(y_true)
        all_pred_best.extend(y_pred)

    print(f"Showing results of predictor: {predictor}")
    print(classification_report(all_true, all_pred))
    print(f"\n\tResults for best partition (regardless for stance assignment")
    print(classification_report(all_true_best, all_pred_best))
    print("-----------------------------------------------------------------------------")

In [None]:
# Mean of a hard-coded list of accuracy values.
# NOTE(review): the origin of these numbers is not shown in this notebook.
accs = [82,80,64,70,35,82,75,76,84,37,73,33,88,85,19,73,63,50]
print(len(accs))
np.mean(accs)

In [None]:

# Draw the reply-interaction graph of one example conversation.
# The parser is created here because no earlier cell binds the name
# `reply_interactions_parser`; without this line the cell raises NameError.
reply_interactions_parser = get_reply_interactions_parser()
interaction_graph = reply_interactions_parser.parse(convs[4])
layout = nx.spring_layout(interaction_graph.graph)
nx.draw(interaction_graph.graph, layout)

In [None]:
def get_graphs():
    """Lazily yield the interaction graphs that are large enough to inspect.

    A graph is yielded when it has more than 10 nodes and its core interactions
    graph has more than 5 nodes; the index of the conversation is printed just
    before the graph is yielded.

    NOTE(review): reads the module-level names ``large_convs`` and
    ``reply_interactions_parser``, which must exist in the kernel before the
    generator is consumed.
    """
    for index, conversation in enumerate(large_convs):
        candidate = reply_interactions_parser.parse(conversation)
        if candidate.graph.order() <= 10:
            continue
        if candidate.get_core_interactions().graph.order() <= 5:
            continue
        print(index)
        yield candidate

# Create the generator once; the following cells keep pulling from it.
graphs = get_graphs()

# Draw the first qualifying interaction graph with a spring layout.
i = next(graphs)
layout = nx.spring_layout(i.graph)
nx.draw(i.graph, layout)

In [None]:
# Advance the `graphs` generator and draw the next qualifying graph.
i = next(graphs)
layout = nx.spring_layout(i.graph)
nx.draw(i.graph, layout)

In [None]:
# Same as the previous cell: draw the next graph yielded by `graphs`.
i = next(graphs)
layout = nx.spring_layout(i.graph)
nx.draw(i.graph, layout)

In [None]:
# Draw one more graph from `graphs`; next() raises StopIteration once the generator is exhausted.
i = next(graphs)
layout = nx.spring_layout(i.graph)
nx.draw(i.graph, layout)

In [None]:
# Print the first entry of `large_convs`.
# NOTE(review): `large_convs` must already exist in the kernel; confirm where it is defined.
print(large_convs[0])

In [None]:
# Cosine distances within and between the two stance groups, per entry of `results`.
# NOTE(review): this cell indexes `results` by integer position and reads r[0], r[1], r[2];
# presumably r[0] maps node -> embedding and r[1] / r[2] are the supporter / opposer nodes.
# That is a different shape from the per-topic `results` dict built earlier — confirm which
# `results` this cell expects before running it.
supporters_avg_angles = []
opposers_avg_angles = []
mean_cross_angle = []

for i in range(len(results)):
    r = results[i]
    # Pairwise cosine distances between members of the same group.
    supporters_distances = list(starmap(lambda i, j: cosine(r[0][i], r[0][j]), combinations(r[1], 2)))
    opposers_distances = list(starmap(lambda i, j: cosine(r[0][i], r[0][j]), combinations(r[2], 2)))
    supporters_avg_angle = np.mean(supporters_distances)
    opposers_avg_angle = np.mean(opposers_distances)

    supporters_avg_angles.append(supporters_avg_angle)
    opposers_avg_angles.append(opposers_avg_angle)

    # Cosine distance between the mean embeddings of the two groups.
    supporters_mean_embedding = np.mean([np.array(r[0][i]) for i in r[1]], axis=0)[0]
    opposers_mean_embedding = np.mean([np.array(r[0][i]) for i in r[2]], axis=0)[0]
    cross_angle = cosine(supporters_mean_embedding, opposers_mean_embedding)
    mean_cross_angle.append(cross_angle)

# NOTE(review): all four prints output len(results) under a "support" label; they look like placeholders.
print(f"support {len(results)}")
print(f"support cosine {len(results)}")
print(f"support {len(results)}")
print(f"support {len(results)}")
len(results), np.nanmean(supporters_avg_angles), np.nanmean(opposers_avg_angles), np.mean(mean_cross_angle)

In [None]:
# Show the 20 entries of `results` with the largest r[0], as (index, size) pairs.
# NOTE(review): max_i and max_shape are assigned but not used in this cell.
# NOTE(review): treats `results` as a sequence of tuples — confirm against the cell that builds it.
max_i = 0
max_shape = 0
sizes = [(i, len(r[0])) for i, r in enumerate(results)]
sorted_sized = sorted(sizes, key=itemgetter(1), reverse=True)
sorted_sized[:20]



In [None]:
# Project the values of r[0] for the first entry of `results` to 2-D with PCA and with t-SNE.
# NOTE(review): TSNE and PCA are already imported in the first cell.
# NOTE(review): results[0] assumes an integer-indexable `results` — confirm its shape.
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
r = results[0]
X = np.vstack([np.array(x) for x in r[0].values()])
X_pca = PCA(n_components=2).fit_transform(X)
X_tsne = TSNE(n_components=2).fit_transform(X)

In [None]:
# Scatter plot of the PCA projection: nodes that are in r[1] are red, all others green.
nodes = r[0].keys()
colors = ['r' if i in r[1] else 'g' for i in nodes]
plt.scatter(X_pca[:, 0], X_pca[:, 1], color=colors)
#

In [None]:
# plt.scatter(X_pca[:, 0], X_pca[:, 1], color=colors)


In [None]:
# Summary statistics of the `is_rebuttal` column of the posts frame.
df.is_rebuttal.describe()