In [1]:
import json
import os
import re
import itertools
from os.path import join
from collections import Counter, defaultdict

import pandas as pd
import spacy
import en_core_web_sm
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

from utils import top_n_specific

%load_ext lab_black

### Define the variables

In [2]:
DATA_DIR = "../data"

### Setup the spaCy's tokenizer and sentence maker

In [3]:
nlp = en_core_web_sm.load()
tokenizer = English().Defaults.create_tokenizer(nlp)

sent = English()
sent.add_pipe(nlp.create_pipe("sentencizer"))

### Read data

#### Read JSON data

In [4]:
with open(join(DATA_DIR, "output.json")) as f:
    raw = f.read()
    unified = json.loads(raw)

#### Read the train-test split

In [5]:
split = pd.read_csv("provided_data/train-test-split.csv", sep=";")

### Get the train data only for statistical analysis

In [6]:
is_row_train = split.SET.str.lower() == "train"
train = split[is_row_train]
train_indices = (
    train.ID.str.extract("(\d+)").squeeze().astype(int) - 1
)  # -1 for 0-based indexing
train_data = [unified[idx] for idx in train_indices]

### Calculate the statistical measures

In [7]:
stats = {
    "num_essays": 0,
    "num_maj_claim": 0,
    "num_claims": 0,
    "num_premises": 0,
    "num_paras": 0,
    "num_true_conf_bias": 0,
    "num_false_conf_bias": 0,
    "num_suff_paras": 0,
    "num_insuff_paras": 0,
    "num_tokens": 0,
    "num_sentences": 0,
    "avg_num_tokens_in_major_claim": 0,
    "avg_num_tokens_in_claims": 0,
    "avg_num_tokens_in_premises": 0,
    "10_most_specific_words_major_claim": 0,
    "10_most_specific_words_claims": 0,
    "10_most_specific_words_premises": 0,
}

counters = defaultdict(list)
args = ["major_claim", "claims", "premises"]

for essay in train_data:

    # straight forward counts
    stats["num_essays"] += 1
    stats["num_maj_claim"] += len(essay["major_claim"])
    stats["num_claims"] += len(essay["claims"])
    stats["num_premises"] += len(essay["premises"])
    stats["num_paras"] += len(essay["paragraphs"])
    stats["num_true_conf_bias"] += essay["confirmation_bias"]
    stats["num_false_conf_bias"] += not essay["confirmation_bias"]

    # inner loop for each paragraph
    for i in range(len(essay["paragraphs"])):
        is_sufficient = essay["paragraphs"][i]["sufficient"]
        stats["num_suff_paras"] += is_sufficient
        stats["num_insuff_paras"] += not is_sufficient

    # using spaCy's tokenizer and sentence maker
    stats["num_tokens"] += len(tokenizer(essay["text"]))
    stats["num_sentences"] += sum([1 for _ in sent(essay["text"]).sents])

    # loop over arguments to count average tokens per essay (later averaged again)
    for arg in args:
        num_tokens = 0
        counter = defaultdict(int)
        for element in essay[arg]:
            tokens = tokenizer(element["text"])
            num_tokens += len(tokens)
            for token in tokens:
                tkn = token.text.lower()
                contains_alpha = bool(re.match("[a-z]", tkn))
                # skip words that are stop words
                if tkn not in STOP_WORDS and contains_alpha:
                    counter[token.text] = element["text"].count(token.text)
        counters[arg].append(counter)
        stats[f"avg_num_tokens_in_{arg}"] += num_tokens / len(essay[arg])

else:

    # average them over all essays
    stats["avg_num_tokens_in_major_claim"] /= stats["num_essays"]
    stats["avg_num_tokens_in_claims"] /= stats["num_essays"]
    stats["avg_num_tokens_in_premises"] /= stats["num_essays"]

    # separate argument counters (each is a list of dictionaries)
    major_claim_wc = counters["major_claim"]
    claims_wc = counters["claims"]
    premises_wc = counters["premises"]

    # merge the list of dictionaries into one per argument for overall count
    mj_cntr = Counter()
    for d in major_claim_wc:
        mj_cntr.update(d)

    claims_cntr = Counter()
    for d in claims_wc:
        claims_cntr.update(d)

    premises_cntr = Counter()
    for d in premises_wc:
        premises_cntr.update(d)

    # calculate the top-10 specific words
    results = []
    for arg in args:
        res = top_n_specific(
            arg,
            mj_cntr=mj_cntr,
            claims_cntr=claims_cntr,
            premises_cntr=premises_cntr,
            n=10,
        )
        results.append(res)

        stats[f"10_most_specific_words_{arg}"] = list(res)

    # re-check no words are common in any two lists
    for i, j in itertools.combinations(range(len(results)), 2):
        num_common_elements = len(results[i].intersection(results[j]))
        assert num_common_elements == 0

### Display the answers

In [8]:
for k, v in stats.items():
    if isinstance(v, list):
        print(k, "=", sorted(v))
    elif isinstance(v, float):
        print(k, "=", f"{v:.3f}")
    else:
        print(k, "=", v)
    print()

num_essays = 322

num_maj_claim = 598

num_claims = 1202

num_premises = 3023

num_paras = 820

num_true_conf_bias = 122

num_false_conf_bias = 200

num_suff_paras = 538

num_insuff_paras = 282

num_tokens = 119752

num_sentences = 5531

avg_num_tokens_in_major_claim = 14.817

avg_num_tokens_in_claims = 15.149

avg_num_tokens_in_premises = 18.398

10_most_specific_words_major_claim = ['advantages', 'benefits', 'best', 'essential', 'government', 'live', 'lives', 'negative', 'prefer', 'technology']

10_most_specific_words_claims = ['future', 'human', 'living', 'number', 'problems', 'provide', 'skills', 'social', 'things', 'university']

10_most_specific_words_premises = ['countries', 'country', 'different', 'friends', 'high', 'like', 'lot', 'study', 'use', 'want']

