In [1]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split

In [2]:
# Load the original CMV dataset from CSV.
cmv_csv_path = "../data/cmv_original.csv"
df_cmv = pd.read_csv(cmv_csv_path)

In [3]:
# Class distribution of the original gold labels.
df_cmv["gold_label"].value_counts()

Assumption          2724
Continue            1661
None                1487
Anecdote             323
Statistics/Study      81
Definition            65
Other                 30
Testimony             28
Name: gold_label, dtype: int64

In [4]:
def update_items(df, row_name):
    """Resolve "Continue" markers in one column by carrying the last value forward.

    Walks ``df[row_name]`` from top to bottom and replaces every
    ``"Continue"`` entry with the most recent entry that was not
    ``"Continue"``. The result is stored in a new column named
    ``f"{row_name}_updated"``; the source column itself is not modified.

    Args:
        df: DataFrame containing ``row_name``. It is modified in place by
            adding (or overwriting) the ``*_updated`` column.
        row_name: Name of the column to process.

    Notes:
        * Leading ``"Continue"`` entries have nothing to inherit from and
          therefore stay ``"Continue"``.
        * Missing values (NaN) compare unequal to ``"Continue"``, so they are
          copied through and become the value that is carried forward.
        * An empty frame simply gets an empty ``*_updated`` column.
    """
    updated_items = []
    last_item = None
    # Iterate over the values directly: ``Series.iteritems()`` no longer
    # exists in pandas 2.x, and the index labels were never used anyway.
    for position, item in enumerate(df[row_name]):
        # The first row always seeds the carried value (even when it is
        # "Continue"); afterwards only non-"Continue" entries replace it.
        if position == 0 or item != "Continue":
            last_item = item
        updated_items.append(last_item)

    df[f"{row_name}_updated"] = updated_items

# Forward-fill "Continue" for each of the three annotation columns.
for annotation_col in ("a1", "a2", "a3"):
    update_items(df_cmv, annotation_col)

In [5]:
# update gold label
def _resolve_gold_label(a1, a2, a3, gold_label):
    """Return the recomputed gold label for one row.

    Args:
        a1, a2, a3: The three forward-filled annotations of the row
            (values of the ``a*_updated`` columns; NaN when missing).
        gold_label: The row's original ``gold_label`` value.

    Returns:
        * the original ``gold_label`` when all three annotations are missing
          (or NaN if that label is ``"Continue"``),
        * NaN when the three annotations all differ (no majority),
        * otherwise the value at least two annotations agree on; if that
          value is the missing marker, NaN is returned.
    """
    # Compare on the string form so that NaN values compare equal as "nan".
    votes = [str(a1), str(a2), str(a3)]

    if all(vote == "nan" for vote in votes):
        return gold_label if gold_label != "Continue" else float("nan")

    winner = max(votes, key=votes.count)
    if votes.count(winner) == 1:
        # Three different annotations: no majority.
        return float("nan")
    if winner == "nan":
        # Two of three annotations are missing. Store a real NaN rather than
        # the literal string "nan", which would otherwise pass later
        # NaN/"None" filtering as if it were a label.
        return float("nan")
    return winner


# Build the column in one go with object dtype. Pre-filling a float NaN column
# and then writing strings into it cell by cell relies on an implicit dtype
# upcast that newer pandas versions deprecate.
df_cmv["gold_label_updated"] = pd.Series(
    [
        _resolve_gold_label(a1, a2, a3, gold_label)
        for a1, a2, a3, gold_label in zip(
            df_cmv["a1_updated"],
            df_cmv["a2_updated"],
            df_cmv["a3_updated"],
            df_cmv["gold_label"],
        )
    ],
    index=df_cmv.index,
    dtype=object,
)

In [6]:
# Spot-check the last rows: forward-filled annotations next to the original
# and the recomputed gold label.
df_cmv[["a1_updated", "a2_updated", "a3_updated", "gold_label", "gold_label_updated"]].tail()

Unnamed: 0,a1_updated,a2_updated,a3_updated,gold_label,gold_label_updated
6624,,,,,
6625,,,,Assumption,Assumption
6626,,,,,
6627,,,,Assumption,Assumption
6628,,,,Assumption,Assumption


In [7]:
def is_nan_or_none(x):
    """Tell whether ``x`` is a usable value, i.e. NOT NaN and NOT ``"None"``.

    Despite its name, this predicate returns ``True`` for values that should
    be kept, which makes it directly usable as a keep-mask.

    Returns:
        * strings: ``True`` unless the string is exactly ``"None"``;
        * floats: ``True`` unless the value is NaN;
        * anything else: the value's truthiness.
    """
    if isinstance(x, str):
        return x != "None"
    if isinstance(x, float):
        return not math.isnan(x)
    return bool(x)

# Keep only rows whose recomputed label is neither NaN nor the string "None".
has_usable_label = df_cmv["gold_label_updated"].apply(is_nan_or_none)
df_cmv_filtered = df_cmv[has_usable_label]

In [8]:
# Label distribution after recomputing the gold label and filtering rows.
df_cmv_filtered["gold_label_updated"].value_counts()

Assumption          2861
Anecdote             370
Statistics/Study      82
Definition            66
Other                 38
Testimony             33
Common ground          1
Name: gold_label_updated, dtype: int64

In [9]:
# Original label distribution again (same expression as in cell 3),
# presumably for comparison with the recomputed counts.
df_cmv["gold_label"].value_counts()

Assumption          2724
Continue            1661
None                1487
Anecdote             323
Statistics/Study      81
Definition            65
Other                 30
Testimony             28
Name: gold_label, dtype: int64

In [10]:
# Build the final dataset in a single chain so the result is a fresh frame.
# Assigning columns on a slice of a slice can raise SettingWithCopyWarning.
#   1. drop rows labelled "Definition" or "Common ground",
#   2. keep the id / text / annotation / label columns,
#   3. rename the "*_updated" columns to their short names,
#   4. lower-case the label and shorten "statistics/study" to "statistics".
df_cmv_distilled = (
    df_cmv_filtered.loc[
        ~df_cmv_filtered["gold_label_updated"].isin(["Definition", "Common ground"]),
        ['thread_id', 'comment_id', 'sentence', 'a1_updated', 'a2_updated', 'a3_updated', 'gold_label_updated'],
    ]
    .rename(
        columns={
            "a1_updated": "a1",
            "a2_updated": "a2",
            "a3_updated": "a3",
            "gold_label_updated": "label",
        }
    )
    .assign(
        # regex=False: plain substring replacement, independent of the
        # pandas-version-specific default of ``str.replace``.
        label=lambda frame: frame["label"]
        .str.lower()
        .str.replace("statistics/study", "statistics", regex=False)
    )
)

In [11]:
# Label distribution of the distilled dataset.
df_cmv_distilled["label"].value_counts()

assumption    2861
anecdote       370
statistics      82
other           38
testimony       33
Name: label, dtype: int64

In [12]:
# Hold out 40% as the test set; stratify on the label column and fix the seed.
train, test = train_test_split(
    df_cmv_distilled,
    stratify=df_cmv_distilled["label"],
    test_size=0.4,
    random_state=42,
)

In [13]:
# Label distribution of the training split.
train["label"].value_counts()

assumption    1716
anecdote       222
statistics      49
other           23
testimony       20
Name: label, dtype: int64

In [14]:
# Label distribution of the test split.
test["label"].value_counts()

assumption    1145
anecdote       148
statistics      33
other           15
testimony       13
Name: label, dtype: int64

In [15]:
# Write both splits to CSV without the index column.
for split_frame, csv_path in (
    (train, "../data/cmv_train.csv"),
    (test, "../data/cmv_test.csv"),
):
    split_frame.to_csv(csv_path, index=False)