In [1]:
import matplotlib.pyplot as plt
from stance_classification.draw_utils import new_figure
%matplotlib inline

import csv
from itertools import combinations, starmap, groupby, product, chain, islice
from itertools import groupby, chain, product, starmap
from operator import itemgetter
from typing import Any, List, Sequence
import logging

from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from conversant.interactions import InteractionsGraph
from conversant.interactions.interactions_graph import PairInteractionsData
from stance_classification.classifiers.maxcut_stance_classifier import MaxcutStanceClassifier
from stance_classification.user_interaction.cmv_stance_interactions_graph_builder import CMVStanceBasedInteractionGraphBuilder
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

from conversant.conversation.examples.controversial_feature_extraction import *

from tqdm.notebook import tqdm

In [2]:
# Absolute, machine-specific location of the CMV trees dump.
trees_file_path = '/home/dev/data/stance/cmv/trees_2.0.txt'
# First pass over the file: only counts the trees so the progress bar below can show a total.
total_trees = sum(1 for _ in iter_trees_from_lines(trees_file_path))

print(f'loaded total of {total_trees}')

# Second pass: a fresh iterator wrapped in a progress bar. It is single-use and is
# consumed by the cell that parses the conversations.
trees = tqdm(iter_trees_from_lines(trees_file_path), total=total_trees)

loaded total of 16306


HBox(children=(FloatProgress(value=0.0, max=16306.0), HTML(value='')))

In [3]:
def load_labels(path) -> Dict[Tuple[str, str], bool]:
    """Load gold-standard stance labels from a CSV file.

    The first row is treated as a header and skipped. Every other non-blank row
    is read as ``(column 0, column 1, label)``; the rest of this notebook uses
    the first two columns as conversation id and author id. Rows whose label is
    ``0`` are dropped.

    Args:
        path: location of the CSV file.

    Returns:
        Mapping ``(column 0, column 1) -> bool(1 + label)``: label ``-1`` becomes
        ``False`` and any other non-zero label becomes ``True``. If a key occurs
        more than once the last row wins. An empty file yields an empty dict.

    Raises:
        ValueError: if the third column of a row is not an integer.
        IndexError: if a non-blank row has fewer than three columns.
    """
    nodes_labels_mapping: Dict[Tuple[str, str], bool] = {}
    # newline='' is what the csv module asks for, so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(path, 'r', newline='') as labels_f:
        reader = csv.reader(labels_f)
        next(reader, None)  # skip header; tolerate a completely empty file
        for record in reader:
            if not record:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            label = int(record[2])
            if label != 0:
                nodes_labels_mapping[tuple(record[0:2])] = bool(1 + label)
    return nodes_labels_mapping

In [4]:
# Parse every tree into a conversation object. This exhausts the single-use `trees`
# iterator, so re-running this cell without re-running the loading cell yields an empty list.
conv_reader = CMVConversationReader()
conversations = list(map(conv_reader.parse, trees))
# Cell output: number of parsed conversations.
len(conversations)




16306

In [5]:
# Alias (same list object, no copy); the evaluation loop iterates over `convs`.
convs = conversations

In [1]:
# NOTE(review): scratch arithmetic; nothing else in this notebook uses the result and the
# meaning of the operands is not documented here - consider removing this cell.
336 * 19 * 4

25536

In [17]:
# labeled_trees_path = "/home/<user>/data/bgu/labeled/61019_notcut_trees.txt"
# Gold-standard labels, keyed by the first two CSV columns (used below as
# conversation id and author id).
author_labels = load_labels("/home/dev/data/stance/cmv/stance-gs-extended.csv")
# author_labels = load_labels("/home/dev/data/stance/cmv/stance-mturk-gs-v1.7.0.csv")
# Flatten to (conversation id, author, label) triples sorted by conversation and author,
# which is the order itertools.groupby needs. NOTE: this rebinds `author_labels`
# from a dict to a sorted list.
author_labels = sorted(
    ((key[0], key[1], label) for key, label in author_labels.items()),
    key=itemgetter(0, 1),
)

def create_author_labels_dict(records: Iterable[Tuple[str, str, str]]) -> Dict[Any, int]:
    """Map element 1 of every record (the author) to element 2 (its label).

    Element 0 of each record is ignored. When the same author appears more than
    once, the last record wins.
    """
    return dict(map(itemgetter(1, 2), records))

# Group the sorted (conversation id, author, label) triples by conversation id and build
# one {author: label} dict per conversation. Each group is fully consumed before groupby
# advances to the next one.
grouped_by_conversation = groupby(author_labels, key=itemgetter(0))
author_labels_per_conversation = {
    cid: labels
    for cid, labels in (
        (cid, create_author_labels_dict(records)) for cid, records in grouped_by_conversation
    )
    if len(labels) > 0
}
# Number of labeled conversations, then the total number of labeled authors.
print(len(author_labels_per_conversation))
print(sum(map(len, author_labels_per_conversation.values())))

13
152


In [18]:
# author_labels_per_conversation

In [19]:
def get_author_labels(conv: Conversation) -> Dict[Any, int]:
    """Look up the gold author labels of `conv` by its id.

    Returns the {author: label} dict stored in `author_labels_per_conversation`
    for `conv.id`, or None when the conversation has no entry there.
    """
    return author_labels_per_conversation.get(conv.id)

def get_maxcut_results(graph: InteractionsGraph, op: Any) -> MaxcutStanceClassifier:
    """Run the max-cut stance classifier on an interactions graph.

    Args:
        graph: interactions graph; its `WEIGHT_FIELD` names the edge weight
            attribute and its `graph` attribute is handed to the classifier.
        op: the conversation's original poster, passed to `classify_stance`.

    Returns:
        The classifier instance after `classify_stance(op)` has been called.
    """
    classifier = MaxcutStanceClassifier(weight_field=graph.WEIGHT_FIELD)
    classifier.set_input(graph.graph)
    classifier.classify_stance(op)
    return classifier

def align_gs_with_predictions(maxcut: MaxcutStanceClassifier, authors_labels: Dict[Any, int]) -> Tuple[List[int], List[int]]:
    """Pair gold labels with the classifier's predictions for the labeled authors.

    Members of `maxcut.get_supporters()` are predicted as 1 and members of
    `maxcut.get_complement()` as 0. Authors missing from `authors_labels`
    (``.get`` returns None) are skipped.

    Returns:
        ``(y_true, y_pred)`` - two parallel lists, supporters first, then
        opposers, each in the iteration order of its group.
    """
    # (group members, predicted label); both groups are fetched before iterating.
    predicted_groups = (
        (maxcut.get_supporters(), 1),
        (maxcut.get_complement(), 0),
    )

    y_true, y_pred = [], []
    for members, predicted_label in predicted_groups:
        for author in members:
            gold_label = authors_labels.get(author)
            if gold_label is None:
                continue
            y_true.append(gold_label)
            y_pred.append(predicted_label)

    return y_true, y_pred

def predict_for_partition(true: List[int], preds: List[int]) -> Tuple[List[int], List[int]]:
    """Score the two-way partition regardless of which side was named "supporters".

    If the predictions agree with the gold labels less than half of the time,
    every prediction is flipped (``1 - label``); otherwise they are returned
    unchanged. `true` is always returned as is.
    """
    if accuracy_score(true, preds) < 0.5:
        return true, [1 - label for label in preds]
    return true, preds

In [20]:
# Builder used in the evaluation loop to turn a conversation into an interactions graph.
interactions_parser = CMVStanceBasedInteractionGraphBuilder()
# Per-conversation (gold, predicted) author label lists, as produced by the classifier.
author_true, author_pred = [], []
# Same, after predict_for_partition has possibly flipped the predictions.
author_true_partition, author_pred_partition = [], []
# Post-level results; the code that would fill these is commented out in the loop,
# so they stay empty.
posts_true, posts_pred = [], []
post_true_partition, post_pred_partition = [], []
# Parallel lists, one entry per conversation that reached the end of the loop.
filtered_convs = []
full_graphs = []
core_graphs = []
maxcut_results: List[MaxcutStanceClassifier] = []
classification_results: List[Tuple[List[int], List[int]]] = []
# Buckets of conversations skipped by the loop, one list per skip reason.
empty_core = []
unlabeled_conversations = []
# Never appended to: the corresponding check in the loop is commented out.
unlabeled_op = []
insufficient_author_labels = []
too_small_cut_value = []
op_not_in_core = []
# Never appended to: the corresponding check in the loop is commented out.
large_graphs = []

def calc_weight(interactions: PairInteractionsData) -> float:
    """Edge weight of a pair of users: replies + quotes + deltas.

    Reads the "replies", "quotes" and "deltas" entries of `interactions`
    (an earlier variant, replies + quotes only, ignored deltas).
    """
    counted_kinds = ("replies", "quotes", "deltas")
    return sum(interactions[kind] for kind in counted_kinds)

# """abortion = 3
#    evolution = 7
#    gay marriage = 8
#    gun control = 9
#    """
# convs[0].root.data["topic"]
# conv: Conversation
# Main evaluation loop: for every conversation with gold author labels, build its
# interactions graph, run max-cut on the OP-connected core and collect gold vs. predicted
# author labels. Conversations that fail a check are appended to the matching skip bucket.
for conv in tqdm(convs):
    # topic = conv.root.data["topic"]
    # if topic != 9: continue
    authors_labels = get_author_labels(conv)
    if authors_labels is None:
        # No gold labels for this conversation id.
        unlabeled_conversations.append(conv)
        continue

    # The original poster is the author of the root node.
    op = conv.root.author
    # if op not in authors_labels:
    #     unlabeled_op.append(conv)
    #     continue

    # Require at least three labeled authors.
    if len(authors_labels) < 3:
        insufficient_author_labels.append(conv)
        continue

    interaction_graph = interactions_parser.build(conv)

    # Weight every pair with calc_weight, then drop the edges whose weight is 0.
    # NOTE(review): the attribute name "weight" is hard-coded here while get_maxcut_results
    # uses graph.WEIGHT_FIELD - presumably the same value; confirm.
    interaction_graph.set_interaction_weights(calc_weight)
    zero_edges = [(v, u) for v, u, d in interaction_graph.graph.edges(data=True) if d["weight"] == 0]
    interaction_graph.graph.remove_edges_from(zero_edges)

    core_interactions = interaction_graph.get_core_interactions()
    if op not in core_interactions.graph.nodes:
        op_not_in_core.append(conv)
        continue

    # Restrict the core further via get_op_connected_components().
    core_interactions = core_interactions.get_op_connected_components()
    # NOTE(review): if `graph` is a networkx graph, size() is the number of edges - confirm.
    if core_interactions.graph.size() < 2:
            empty_core.append(conv)
            continue

    # if core_interactions.graph.order() > 120:
    #     large_graphs.append(conv)
    #     continue

    maxcut = get_maxcut_results(core_interactions, op)
    # Skip conversations whose cut value is below 3.
    if maxcut.cut_value < 3:
        too_small_cut_value.append(conv)
        continue

    # Raw predictions: supporters -> 1, opposers -> 0, labeled authors only.
    true, preds = align_gs_with_predictions(maxcut, authors_labels)
    author_true.append(true)
    author_pred.append(preds)

    # Partition-level predictions: flipped when raw accuracy is below 0.5.
    # `true` and `preds` are rebound here, so classification_results below stores
    # the partition-adjusted pair.
    true, preds = predict_for_partition(true, preds)
    author_true_partition.append(true)
    author_pred_partition.append(preds)

    # true, preds = predict_post_labels(conv, post_labels, maxcut.get_supporters(), maxcut.get_complement())
    # posts_true.append(true)
    # posts_pred.append(preds)

    # true, preds = predict_for_partition(true, preds)
    # post_true_partition.append(true)
    # post_pred_partition.append(preds)

    # Keep the per-conversation artifacts in parallel lists (same index = same conversation).
    filtered_convs.append(conv)
    full_graphs.append(interaction_graph)
    core_graphs.append(core_interactions)
    maxcut_results.append(maxcut)
    classification_results.append((true, preds))

HBox(children=(FloatProgress(value=0.0, max=16306.0), HTML(value='')))




In [21]:
# Overall counts.
print(f"total number of conversations: {len(convs)}")
print(f"total number of conversations with labeled authors: {len(author_labels_per_conversation)}")
print(f"number of conversations in eval: {len(filtered_convs)}")
labeled_authors = sum(map(len, author_labels_per_conversation.values()))
print(f"total number of labeled authors: {labeled_authors}")
print(f"number of authors in eval: {sum(len(labels) for labels in author_true)}")
print(f"number of posts in eval: {sum(len(labels) for labels in posts_true)}")
print("=========")
# One line per skip bucket filled (or left empty) by the evaluation loop.
skip_reasons = (
    ("conversations with empty core", empty_core),
    ("conversations with op not in core", op_not_in_core),
    ("conversations with too large core", large_graphs),
    ("conversations with too small cut value", too_small_cut_value),
    ("unlabeled conversations", unlabeled_conversations),
    ("conversations with unlabeled op", unlabeled_op),
    ("conversations with insufficient labeled authors", insufficient_author_labels),
)
for description, skipped in skip_reasons:
    print(f"number of {description}: {len(skipped)}")



total number of conversations: 16306
total number of conversations with labeled authors: 13
number of conversations in eval: 13
total number of labeled authors: 152
number of authors in eval: 143
number of posts in eval: 0
number of conversations with empty core: 0
number of conversations with op not in core: 0
number of conversations with too large core: 0
number of conversations with too small cut value: 0
number of unlabeled conversations: 16293
number of conversations with unlabeled op: 0
number of conversations with insufficient labeled authors: 0


In [16]:
# Pool the raw per-conversation author results into flat lists and report on them.
y_true = list(chain.from_iterable(author_true))
y_pred = list(chain.from_iterable(author_pred))
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       False       0.72      0.79      0.75        67
        True       0.63      0.53      0.58        45

    accuracy                           0.69       112
   macro avg       0.67      0.66      0.67       112
weighted avg       0.68      0.69      0.68       112



In [12]:
# Same report for the partition-adjusted predictions (flipped per conversation when
# the raw accuracy was below 0.5).
y_true = list(chain.from_iterable(author_true_partition))
y_pred = list(chain.from_iterable(author_pred_partition))
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       False       0.72      0.79      0.75        67
        True       0.63      0.53      0.58        45

    accuracy                           0.69       112
   macro avg       0.67      0.66      0.67       112
weighted avg       0.68      0.69      0.68       112



In [13]:

def compute_pairs_average_distance(
        pairs: Iterable[Tuple[int, int]],
        embeddings: Sequence[np.ndarray]
) -> float:
    """Average cosine distance over pairs of embeddings.

    Args:
        pairs: iterable of ``(i, j)`` keys into `embeddings`.
        embeddings: anything indexable by those keys. The callers in this
            notebook pass `maxcut.embeddings`, which is used elsewhere through
            ``.keys()`` / ``.values()``, i.e. a mapping keyed by node rather
            than a positional sequence.

    Returns:
        The mean of ``scipy.spatial.distance.cosine`` over all pairs as a
        Python float, or NaN when `pairs` is empty (e.g. the within-group
        pairs of a group with fewer than two members).

    Note:
        The value is a cosine *distance* (1 - cosine similarity), not an angle,
        despite the "angle" naming used by the callers.
    """
    distances = [cosine(embeddings[i], embeddings[j]) for i, j in pairs]
    if not distances:
        # np.mean([]) also evaluates to NaN but emits a "Mean of empty slice"
        # RuntimeWarning; return the same value without the warning.
        return float("nan")
    return float(np.mean(distances))


def compute_average_angle_from_node(
        node_index: int,
        group_indices: Sequence[int],
        embeddings: Sequence[np.ndarray]
) -> float:
    """Average cosine distance between one node and every member of a group.

    Pairs `node_index` with each element of `group_indices` (including itself,
    if it is a member) and delegates to `compute_pairs_average_distance`.
    """
    return compute_pairs_average_distance(product((node_index,), group_indices), embeddings)


def compute_group_average_angle(
        group_indices: Sequence[int],
        embeddings: Sequence[np.ndarray]
) -> float:
    """Average cosine distance over all unordered pairs inside one group.

    Delegates to `compute_pairs_average_distance` with every 2-combination of
    `group_indices`; a group with fewer than two members produces no pairs.
    """
    return compute_pairs_average_distance(combinations(group_indices, 2), embeddings)


def compute_cross_groups_average_angle(
        group1: Sequence[int],
        group2: Sequence[int],
        embeddings: Sequence[np.ndarray]
) -> float:
    """Average cosine distance between the members of two groups.

    Delegates to `compute_pairs_average_distance` with the full cartesian
    product ``group1 x group2``.
    """
    return compute_pairs_average_distance(product(group1, group2), embeddings)

In [None]:

# Per-conversation average cosine distances; index k refers to maxcut_results[k].
supporters_avg_angles = []
opposers_avg_angles = []
mean_cross_angle = []
op2supporters = []
op2opposers = []
for i, maxcut in enumerate(maxcut_results):
    op = maxcut.op
    all_embeddings = maxcut.embeddings
    supporters = maxcut.get_supporters()
    opposers = maxcut.get_complement()

    # Distance from the OP to each side of the cut.
    op2supporters.append(compute_average_angle_from_node(op, supporters, all_embeddings))
    op2opposers.append(compute_average_angle_from_node(op, opposers, all_embeddings))

    # Cohesion inside each side.
    supporters_avg_angles.append(compute_group_average_angle(supporters, all_embeddings))
    opposers_avg_angles.append(compute_group_average_angle(opposers, all_embeddings))

    # Separation between the two sides.
    mean_cross_angle.append(compute_cross_groups_average_angle(supporters, opposers, all_embeddings))

print(f"total conversations {len(maxcut_results)}")
# nanmean: the within-group averages can be NaN (no pairs to average).
print(f"supporters avg. cosine {np.nanmean(supporters_avg_angles)}")
print(f"opposers avg. cosine {np.nanmean(opposers_avg_angles)}")
print(f"cross groups avg. cosine {np.mean(mean_cross_angle)}")
print(f"op to supporters avg. cosine {np.mean(op2supporters)}")
print(f"op to opposers avg. cosine {np.mean(op2opposers)}")

In [None]:
# Indices of conversations where the OP is much closer to the supporters than to the
# opposers: distance ratio above 2 AND distance difference above 1.
strong_convs_indices = []
for i in range(len(filtered_convs)):
    op2s, op2o = op2supporters[i], op2opposers[i]
    # A zero distance on either side is skipped; this also guards the division below.
    if op2s * op2o != 0:
        diff = op2o - op2s
        ratio = op2o / op2s
        if diff > 1 and ratio > 2:
            strong_convs_indices.append(i)

len(strong_convs_indices)


In [None]:
# strong_true, strong_preds = zip(*[classification_results[i] for i in strong_convs_indices])
# strong_true = list(chain(*strong_true))
# strong_preds = list(chain(*strong_preds))
# Pool the raw author results of the "strong" conversations only and report on them.
strong_true = list(chain.from_iterable(author_true[i] for i in strong_convs_indices))
strong_preds = list(chain.from_iterable(author_pred[i] for i in strong_convs_indices))
print(classification_report(strong_true, strong_preds))

In [None]:
# NOTE(review): max_i and max_shape are assigned but never read in this notebook.
max_i = 0
max_shape = 0
# sizes = [(i, g.graph.order()) for i, g  in enumerate(core_graphs)]
# (index, graph.order()) of every "strong" conversation's core graph, largest first.
sizes = [(idx, core_graphs[idx].graph.order()) for idx in strong_convs_indices]
sorted_sized = sorted(sizes, key=lambda index_and_order: index_and_order[1], reverse=True)
# Cell output: the 20 largest.
sorted_sized[:20]

In [None]:
# Cell output: indices (into filtered_convs / maxcut_results) of the selected conversations.
strong_convs_indices

In [None]:
# Index (into maxcut_results / filtered_convs) of the conversation inspected by this
# and the following cells.
result_index = 0

maxcut = maxcut_results[result_index]
op, emb, supporters, opposers = maxcut.op, maxcut.embeddings, maxcut.get_supporters(), maxcut.get_complement()

s_cosine = compute_group_average_angle(supporters, emb)
o_cosine = compute_group_average_angle(opposers, emb)
cross_cosine = compute_cross_groups_average_angle(supporters, opposers, emb)
op2support = compute_average_angle_from_node(op, supporters, emb)
op2oppose = compute_average_angle_from_node(op, opposers, emb)
# The distances are plain Python floats, so dividing by an OP-to-supporters distance of
# exactly 0 would raise ZeroDivisionError and abort the cell; report NaN instead. The
# selection cell guards the same case by skipping zero distances.
op_distance_ratio = op2oppose / op2support if op2support != 0 else float("nan")
print(f"num supporters: {len(supporters)}")
print(f"num opposers: {len(opposers)}")
print(f"supporters avg. cosine: {s_cosine}")
print(f"opposers avg. cosine: {o_cosine}")
print(f"cross-groups avg. cosine: {cross_cosine}")
print(f"op <-> supporters avg. cosine: {op2support}")
print(f"op <-> opposers avg. cosine: {op2oppose}")
print(f"supporters - opposers diff cosine with op: {op2oppose - op2support}")
print(f"supporters - opposers ratio cosine with op: {op_distance_ratio}")

#### Author classification results
For the current conversation

In [None]:
# Raw (not partition-adjusted) gold vs. predicted author labels of the inspected conversation.
true = author_true[result_index]
preds = author_pred[result_index]
print(classification_report(true, preds))

#### Post classification results
For the current conversation

In [None]:
# true = posts_true[result_index]
# preds = posts_pred[result_index]
# print(classification_report(true, preds))

#### Post partition classification results
For the current conversation

In [None]:
# true = post_true_partition[result_index]
# preds = post_pred_partition[result_index]
# print(classification_report(true, preds))


In [None]:
conv = filtered_convs[result_index]
# NOTE: rebinds `author_labels` (a sorted list of triples in the loading cell) to the
# {author: label} dict of the inspected conversation.
author_labels = get_author_labels(conv)
true_supporters = [author for author, label in author_labels.items() if label == 1]
true_opposers = [author for author, label in author_labels.items() if label == 0]
# Labeled authors that ended up on neither side of the cut.
unknown_labels = set(author_labels.keys()).difference(supporters, opposers)
# Cell output: (#labeled, #gold opposers, #gold supporters, #labeled but not in the cut).
len(author_labels), len(true_opposers), len(true_supporters), len(unknown_labels)

In [None]:
plt.figure(figsize=(8,6))


# One row per node embedding, in the iteration order of `emb`; projected to 2-D with PCA.
X = np.vstack([np.array(vector) for vector in emb.values()])
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)
# X_2d = TSNE(n_components=2).fit_transform(X)
print(pca.explained_variance_)
op = maxcut.op
nodes = emb.keys()
# Row indices into X_2d, split by gold label (true_*) and predicted side (supporters / opposers).
tp_supporters_indices = [idx for idx, node in enumerate(nodes) if node in true_supporters and node in supporters]
fn_supporters_indices = [idx for idx, node in enumerate(nodes) if node in true_supporters and node in opposers]
tp_opposers_indices = [idx for idx, node in enumerate(nodes) if node in true_opposers and node in opposers]
fn_opposers_indices = [idx for idx, node in enumerate(nodes) if node in true_opposers and node in supporters]
unlabeled_supporters = [idx for idx, node in enumerate(nodes) if node not in author_labels and node in supporters]
unlabeled_opposers = [idx for idx, node in enumerate(nodes) if node not in author_labels and node in opposers]

op_index = [idx for idx, node in enumerate(nodes) if node == op]

# (row indices, color, marker) per group, drawn in this order.
scatter_groups = (
    (tp_supporters_indices, 'g', '+'),
    (fn_supporters_indices, 'r', '+'),
    (tp_opposers_indices, 'r', 'x'),
    (fn_opposers_indices, 'g', 'x'),
    (unlabeled_supporters, 'g', '_'),
    (unlabeled_opposers, 'r', '_'),
)
for group_indices, group_color, group_marker in scatter_groups:
    plt.scatter(X_2d[group_indices, 0], X_2d[group_indices, 1], color=group_color, marker=group_marker)
# The OP is drawn last, as a blue dot.
plt.scatter([X_2d[op_index, 0]], [X_2d[op_index, 1]], color='b', marker='o')

# colors = ['b' if i == op else 'g' if i in supporters else 'r' for i in nodes]
# markers = ['o' if i ==op else 'x' if i in supporters else '+' for i in nodes]
# plt.scatter(X_2d[:, 0], X_2d[:, 1], color=colors)
# op_index = [i for i, n  in enumerate(nodes) if n == op][0]

In [None]:
# Draw the inspected conversation's graph: node color/shape encodes gold label vs.
# predicted side, cut edges are black with weight-dependent width, other edges gray.
new_figure()
graph = maxcut.graph
pos = nx.spring_layout(graph)

# Translate the row indices computed for the PCA plot back into node ids.
all_nodes = list(nodes)
tps = [all_nodes[i] for i in tp_supporters_indices]
fns = [all_nodes[i] for i in fn_supporters_indices]
fno = [all_nodes[i] for i in fn_opposers_indices]
tpo = [all_nodes[i] for i in tp_opposers_indices]
unks = [all_nodes[i] for i in unlabeled_supporters]
unko = [all_nodes[i] for i in unlabeled_opposers]
# Kept under its own name so that `op` (the OP's node id) is not overwritten with a list.
op_nodes = [all_nodes[i] for i in op_index]

nx.draw_networkx_nodes(graph, pos, nodelist=tps, node_color='g', node_shape='s', edgecolors="black")
nx.draw_networkx_nodes(graph, pos, nodelist=fns, node_color='g', node_shape='^', edgecolors="black")
nx.draw_networkx_nodes(graph, pos, nodelist=fno, node_color='r', node_shape='s', edgecolors="black")
nx.draw_networkx_nodes(graph, pos, nodelist=tpo, node_color='r', node_shape='^', edgecolors="black")
nx.draw_networkx_nodes(graph, pos, nodelist=unks, node_color='grey', node_shape="s", edgecolors="black")
nx.draw_networkx_nodes(graph, pos, nodelist=unko, node_color='grey', node_shape="^", edgecolors="black")
nx.draw_networkx_nodes(graph, pos, nodelist=op_nodes, node_color='b', node_shape='o', edgecolors="black")

node_labels = {n: str(n) for n in graph.nodes}
nx.draw_networkx_labels(graph, pos, labels=node_labels, font_color="tab:brown")

# Draw the edges that are in the cut. Width is 1 + log2(weight): plain log2 maps a
# weight of 1 to width 0, which hides the edge entirely.
edge_weights = [1 + np.log2(graph[e[0]][e[1]]['weight']) for e in maxcut.cut]
nx.draw_networkx_edges(graph, pos, edgelist=maxcut.cut, edge_color="black", width=edge_weights)

# Draw the edges that are not in the cut, with the default width.
# NOTE(review): this membership test assumes maxcut.cut lists edges in the same (u, v)
# orientation as graph.edges - confirm, otherwise cut edges are also redrawn in gray.
leave = [e for e in graph.edges if e not in maxcut.cut]
nx.draw_networkx_edges(graph, pos, edgelist=leave, edge_color="darkgray")

In [None]:
# Show the gold {author: label} dict of the inspected conversation and let the classifier
# draw itself against it. NOTE: rebinds `author_labels` to that conversation's dict.
conv_id = filtered_convs[result_index].id
author_labels = author_labels_per_conversation[conv_id]
print(author_labels)
maxcut.draw(true_labels=author_labels)
