In [32]:
from typing import List, Any, Dict, Tuple, Set
from operator import itemgetter
from itertools import combinations, starmap


import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import networkx as nx
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline

from conversant.conversation import Conversation
from conversant.interactions import InteractionsGraph
from conversant.conversation.parse import DataFrameConversationReader
from conversant.interactions.reply_interactions_parser import get_reply_interactions_parser
from stance_classification.classifiers.maxcut_stance_classifier import MaxcutStanceClassifier

In [76]:
# Location of the CreateDebate posts dump and the column names to assign to it
# (the names are supplied explicitly through `names=` below).
data_path = "/home/dev/data/stance/IAC/alternative/createdebate_released/post.txt"
header = [
    "discussion_id",
    "post_id",
    "author_id",
    "creation_date",
    "parent_post_id",
    "parent_missing",
    "text_id",
    "points",
    "discussion_stance_id",
    "is_rebuttal",
]

# Tab-separated input; the literal `\N` token is read as a missing value.
df = pd.read_csv(
    data_path,
    sep="\t",
    names=header,
    na_values=["\\N"],
)
df

Unnamed: 0,discussion_id,post_id,author_id,creation_date,parent_post_id,parent_missing,text_id,points,discussion_stance_id,is_rebuttal
0,878,3557,27083,2008-05-07 23:06:11,,0,513135,-3,0.0,
1,878,3563,5901,2008-05-08 01:25:25,,0,513160,-8,0.0,
2,878,3623,11010,2008-05-08 15:03:14,,0,513211,11,1.0,
3,878,3627,27083,2008-05-08 15:21:42,3623.0,0,513212,8,0.0,disputed
4,878,3632,8705,2008-05-08 15:35:54,,0,513276,4,1.0,
...,...,...,...,...,...,...,...,...,...,...
3046,56718,558884,1488,2014-05-09 05:30:18,,0,771300,1,0.0,
3047,56718,558914,2827,2014-05-09 13:40:52,558884.0,0,771301,1,1.0,disputed
3048,56718,558915,2827,2014-05-09 13:41:09,,0,771304,1,1.0,
3049,56718,558927,2496,2014-05-09 16:26:02,558914.0,0,771302,1,1.0,supported


In [77]:
# Number of distinct discussions; dropna=False keeps a missing id counted as one value,
# exactly as len(unique()) would.
df["discussion_id"].nunique(dropna=False)

63

In [78]:
# Count the posts whose parent_post_id is missing.
df["parent_post_id"].isna().sum()

993

In [79]:
# Mapping handed to DataFrameConversationReader; the values are column names of `df`.
# Presumably the keys are the reader-side field names -- verify against the reader.
# NOTE(review): the variable name is misspelt ("pasre"); kept as-is because other
# cells may reference it.
pasre_strategy = dict(
    node_id="post_id",
    author="author_id",
    timestamp="creation_date",
    parent_id="parent_post_id",
)
parser = DataFrameConversationReader(pasre_strategy)

In [80]:
# Group the posts by discussion so each group can be parsed as one conversation.
gb = df.groupby("discussion_id")

In [81]:
# Manual iterator over the (discussion_id, sub-frame) pairs of the groupby.
it = iter(gb)

In [82]:
# Pull the next (discussion_id, sub-frame) pair from the iterator and parse it
# into a conversation for visual inspection.
name, ddf = next(it)
# NOTE(review): this prints the iterator object itself (a generator repr);
# `name` may have been the intended argument -- confirm.
print(it)
conv = parser.parse(ddf)
conv

<generator object BaseGrouper.get_iterator at 0x7f8106002510>


27083 - 3557
├── 24659 - 3641
│   ├── 3346 - 3645
│   │   ├── 17754 - 3716
│   │   │   ├── 24659 - 3723
│   │   │   └── 3346 - 3853
│   │   │       └── 17754 - 3937
│   │   └── 21084 - 30475
│   ├── 27083 - 3662
│   │   ├── 24659 - 3669
│   │   │   └── 27784 - 3711
│   │   └── 16689 - 5282
│   ├── 28180 - 4710
│   └── 28608 - 10247
│       └── 23602 - 40757
├── 16689 - 4929
├── 26420 - 28947
├── 8448 - 74263
│   └── 27680 - 84949
└── 28296 - 225691

In [83]:
# Parse every per-discussion sub-frame of the groupby into a conversation.
convs = [parser.parse(discussion_df) for _, discussion_df in gb]

In [84]:
# Conversation sizes, followed by their count, mean and median (one value per line).
sizes = [conversation.size for conversation in convs]
print(len(sizes), np.mean(sizes), np.median(sizes), sep="\n")

63
7.380952380952381
3.0


In [85]:
# Same statistics restricted to conversations with at least 5 nodes.
filtered_sizes = list(filter(lambda size: size >= 5, sizes))
print(len(filtered_sizes), np.mean(filtered_sizes), np.median(filtered_sizes), sep="\n")



22
17.681818181818183
14.0


In [86]:
# Conversations whose size is at least 10.
large_convs = list(filter(lambda conversation: conversation.size >= 10, convs))

In [87]:
def get_majority_vote(labels: List[int]) -> int:
    """Return the majority label (0 or 1) among binary ``labels``.

    Missing labels (``NaN`` / ``None``) are ignored, so one unlabeled post no
    longer forces the result to 0. Ties resolve to 1, because the mean is
    compared with ``>= 0.5``.

    :param labels: binary stance labels, possibly containing missing values.
    :return: 1 if at least half of the known labels are 1, otherwise 0.
        Returns 0 when there is no known label at all (empty or all-missing
        input), which matches the previous result without the NumPy
        "mean of empty slice" warning.
    """
    votes = np.asarray(labels, dtype=float)
    known_votes = votes[~np.isnan(votes)]
    if known_votes.size == 0:
        return 0
    return int(known_votes.mean() >= 0.5)

def get_author_labels(conv: Conversation) -> Dict[Any, int]:
    """Give every author of ``conv`` a single stance label.

    The ``discussion_stance_id`` of each node is collected per author, and
    each author's labels are reduced with ``get_majority_vote``.

    :param conv: conversation whose nodes expose ``author`` and ``data``.
    :return: mapping from author to that author's majority stance label.
    """
    labels_by_author = {}
    for _, post in conv.iter_conversation():
        if post.author not in labels_by_author:
            labels_by_author[post.author] = []
        labels_by_author[post.author].append(post.data["discussion_stance_id"])

    return {author: get_majority_vote(votes) for author, votes in labels_by_author.items()}

def get_maxcut_results(graph: InteractionsGraph, op: Any) -> MaxcutStanceClassifier:
    """Run a fresh ``MaxcutStanceClassifier`` on ``graph.graph`` for ``op``.

    :param graph: interactions graph whose underlying ``graph`` attribute is
        given to the classifier as input.
    :param op: node passed to ``classify_stance``.
    :return: the classifier after ``classify_stance`` has been called.
    """
    classifier = MaxcutStanceClassifier()
    classifier.set_input(graph.graph)
    classifier.classify_stance(op)
    # classifier.draw()  # uncomment to visualise the result
    return classifier

def align_gs_with_predictions(maxcut: "MaxcutStanceClassifier", authors_labels: Dict[Any, int], op: Any = None) -> Tuple[List[int], List[int]]:
    """Pair gold author labels with the labels implied by the maxcut partition.

    Authors returned by ``maxcut.get_supporters()`` are predicted to share the
    gold label of ``op``; authors returned by ``maxcut.get_complement()`` are
    predicted to have the opposite (``1 - label``) label.

    :param maxcut: classifier exposing ``get_supporters()`` and ``get_complement()``.
    :param authors_labels: gold binary label per author; must contain ``op``
        and every author returned by the classifier.
    :param op: author whose gold label defines the "supporter" side. When it
        is omitted, the module-level ``op`` variable is used, which is what
        this function previously read implicitly.
    :return: ``(y_true, y_pred)`` lists, supporters first and then opposers.
    :raises NameError: if ``op`` is neither passed nor defined at module level.
    :raises KeyError: if an author has no entry in ``authors_labels``.
    """
    if op is None:
        # Backward compatibility with callers that rely on the notebook-global `op`.
        try:
            op = globals()["op"]
        except KeyError:
            raise NameError("name 'op' is not defined; pass it explicitly via the `op` argument") from None

    support_label = authors_labels[op]
    opposer_label = 1 - support_label

    y_true, y_pred = [], []
    for members, predicted_label in (
        (maxcut.get_supporters(), support_label),
        (maxcut.get_complement(), opposer_label),
    ):
        for author in members:
            y_true.append(authors_labels[author])
            y_pred.append(predicted_label)

    return y_true, y_pred

def align_posts_gs_with_predictions(conv: Conversation, authors_labels: Dict[Any, int]) -> Tuple[List[int], List[int]]:
    """Pair each post's own stance label with the label assigned to its author.

    :param conv: conversation whose nodes expose ``author`` and ``data``.
    :param authors_labels: label per author, looked up for every post.
    :return: ``(y_true, y_pred)`` in conversation iteration order, where
        ``y_true`` holds each post's ``discussion_stance_id`` and ``y_pred``
        holds the label of the post's author.
    """
    posts = [post for _, post in conv.iter_conversation()]
    y_true = [post.data["discussion_stance_id"] for post in posts]
    y_pred = [authors_labels[post.author] for post in posts]
    return y_true, y_pred

In [88]:
# Run the maxcut stance classifier on every conversation and accumulate
# author-level and post-level gold/predicted label pairs for evaluation.
reply_interactions_parser = get_reply_interactions_parser()
y_true, y_pred = [], []
posts_true, posts_pred = [], []
filtered_convs = []
# Conversations skipped at each filtering stage. All three lists are created
# here so the cell does not depend on leftover kernel state.
op_not_connected = []
empty_core = []
op_not_in_core = []
# Per evaluated conversation: (embeddings, supporters, opposers).
results: List[Tuple[np.ndarray, Set[int], Set[int]]] = []

for conv in convs:
    interaction_graph = reply_interactions_parser.parse(conv)
    op = conv.root.author
    # Attach two dummy nodes (-1, -2) that form a triangle with the OP.
    # Presumably this keeps the OP inside the extracted core -- TODO confirm.
    interaction_graph.graph.add_edge(op, -1, replies=1)
    interaction_graph.graph.add_edge(op, -2, replies=1)
    interaction_graph.graph.add_edge(-1, -2, replies=1)

    interaction_graph.set_interaction_weights(lambda x: x['replies'])
    # NOTE(review): the OP was just given two dummy neighbours, so this check
    # looks like it can never fire as written -- confirm the intended threshold.
    if len(list(interaction_graph.graph.neighbors(op))) <= 1:
        op_not_connected.append(conv)
        continue

    kcore = interaction_graph.get_core_interactions()
    if kcore.graph.order() == 0:
        empty_core.append(conv)
        continue

    if op not in kcore.graph.nodes:
        print(kcore.graph.nodes)
        op_not_in_core.append(conv)
        continue

    # Drop the dummy nodes again before classifying.
    kcore.graph.remove_nodes_from([-1, -2])
    kcore = kcore.get_op_connected_components()
    maxcut = get_maxcut_results(kcore, op)
    authors_labels = get_author_labels(conv)
    true, preds = align_gs_with_predictions(maxcut, authors_labels)
    y_true.extend(true)
    y_pred.extend(preds)

    # NOTE(review): the post-level "predictions" are the gold majority labels of
    # each author (authors_labels), not the maxcut output -- confirm this is intended.
    true, preds = align_posts_gs_with_predictions(conv, authors_labels)
    posts_true.extend(true)
    posts_pred.extend(preds)
    filtered_convs.append(conv)
    results.append((maxcut.embeddings, maxcut.get_supporters(), maxcut.get_complement()))


print(f"total number of conversations: {len(convs)}")
print(f"number of conversations with empty core: {len(empty_core)}")
print(f"number of conversations in eval: {len(filtered_convs)}")
print(f"number of authors in eval: {len(y_true)}")
print(f"number of posts in eval: {len(posts_true)}")

total number of conversations: 63
number of conversations with empty core: 59
number of conversations in eval: 4
number of authors in eval: 17
number of posts in eval: 77


In [89]:
# Author-level evaluation: y_true / y_pred were filled from align_gs_with_predictions.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.71      0.63         7
           1       0.75      0.60      0.67        10

    accuracy                           0.65        17
   macro avg       0.65      0.66      0.65        17
weighted avg       0.67      0.65      0.65        17



In [90]:
# Post-level evaluation: posts_true / posts_pred were filled from align_posts_gs_with_predictions.
print(classification_report(posts_true, posts_pred))

              precision    recall  f1-score   support

         0.0       0.96      0.90      0.93        29
         1.0       0.94      0.98      0.96        48

    accuracy                           0.95        77
   macro avg       0.95      0.94      0.94        77
weighted avg       0.95      0.95      0.95        77



In [91]:
# layout = nx.spring_layout(interaction_graph.graph)
# nx.draw(interaction_graph.graph, layout)

In [92]:
# For every evaluated conversation, measure how tightly each side clusters in
# embedding space and how far apart the two sides are. Despite the "angle"
# names, the stored values are scipy cosine *distances*.
supporters_avg_angles = []
opposers_avg_angles = []
mean_cross_angle = []

for r in results:
    embeddings, supporters, opposers = r

    # Pairwise cosine distances inside each group.
    supporters_distances = [cosine(embeddings[a], embeddings[b]) for a, b in combinations(supporters, 2)]
    opposers_distances = [cosine(embeddings[a], embeddings[b]) for a, b in combinations(opposers, 2)]

    # A group with fewer than two members has no pairs. Record NaN directly
    # instead of calling np.mean([]), which gives the same value but emits
    # RuntimeWarnings.
    supporters_avg_angle = np.mean(supporters_distances) if supporters_distances else np.nan
    opposers_avg_angle = np.mean(opposers_distances) if opposers_distances else np.nan

    supporters_avg_angles.append(supporters_avg_angle)
    opposers_avg_angles.append(opposers_avg_angle)

    if len(supporters) > 0 and len(opposers) > 0:
        # [0] takes the first row of the averaged embedding -- assumes each
        # embedding is 2-D with a single row; TODO confirm.
        supporters_mean_embedding = np.mean([np.array(embeddings[a]) for a in supporters], axis=0)[0]
        opposers_mean_embedding = np.mean([np.array(embeddings[a]) for a in opposers], axis=0)[0]
        cross_angle = cosine(supporters_mean_embedding, opposers_mean_embedding)
    else:
        # An empty side has no mean embedding; indexing the scalar NaN that
        # np.mean returns for it would raise IndexError.
        cross_angle = np.nan
    mean_cross_angle.append(cross_angle)

# NaN entries (conversations lacking a pair or a side) are skipped in all three averages.
len(results), np.nanmean(supporters_avg_angles), np.nanmean(opposers_avg_angles), np.nanmean(mean_cross_angle)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


(4, 0.27777654656912737, 0.572914746922623, 1.9532000665376241)

In [93]:
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA
# X = np.vstack([np.array(x) for x in r[0].values()])
# X_pca = PCA(n_components=2).fit_transform(X)
# X_tsne = TSNE(n_components=2).fit_transform(X)

In [94]:
# nodes = r[0].keys()
# colors = ['r' if i in r[1] else 'g' for i in nodes]
# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], color=colors)
#

In [95]:
# plt.scatter(X_pca[:, 0], X_pca[:, 1], color=colors)




In [96]:
# Sanity check: the cosine distance between two opposite vectors evaluates to 2.0.
cosine([1,1,1,1], [-1,-1,-1,-1])


2.0

In [97]:

# Summary (count / unique / top / freq) of the is_rebuttal column.
df["is_rebuttal"].describe()

count         2058
unique           3
top       disputed
freq          1703
Name: is_rebuttal, dtype: object