# Setup

## Function Setup

In [1]:
from bs4 import BeautifulSoup, Tag, NavigableString
from collections import defaultdict
import re
from itertools import islice
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from googletrans import Translator
import ast

In [2]:
def unique_words(sentences):
    """Print every distinct whitespace-separated word found in ``sentences``.

    Args:
        sentences: An iterable of strings. A single string is also accepted
            and is treated as one sentence.

    Returns:
        None. Each distinct word is printed on its own line, in sorted order.
    """
    # A bare string is itself an iterable of characters; joining it directly
    # would put a space between every character and report single letters
    # instead of words.
    if isinstance(sentences, str):
        sentences = [sentences]

    # Joining with a space keeps the last word of one sentence apart from the
    # first word of the next; split() then breaks on any run of whitespace.
    distinct = set(' '.join(sentences).split())

    # Iteration order of a set of strings varies between interpreter runs
    # (hash randomisation), so sort to make the printed output reproducible.
    for word in sorted(distinct):
        print(word)

## Data Setup

In [3]:
article_list = ['protected', 'excellent', 'readworthy', 'random']
whole_dict = {}
# Build one entry per name in article_list, holding the two CSV files read for it.
for origin in article_list:
    # Register the (initially empty) entry first, then fill it in place.
    loaded = whole_dict[origin] = {}
    loaded['titles'] = pd.read_csv(f'data/data_files/pipeline_steps/{origin}_articles/0_{origin}_titles.csv')
    loaded['df'] = pd.read_csv(f'data/data_files/pipeline_steps/{origin}_articles/7.1_maskadded.csv')

# Results

In [4]:
# Column pairs to compare: (label column, filter-flag column).
# Loop-invariant, so defined once instead of once per origin.
column_pairs = [("quot_label", "pos_nonresolved_quot_filter"),
                ("link_label", "pos_nonresolved_link_filter"),
                ("namelink_label", "pos_nonresolved_namelink_filter")]

# Headers of the summary table, in the same order as the values appended below.
# NOTE(review): 'Postive' is misspelled; kept byte-identical because saved
# outputs (and possibly later cells) use this exact column name.
result_columns = ["Label", "Total sentences", "Filtered sentences", "Remaining sentences",
                  'Share filtered', 'Postive Rate before', 'Positive rate after',
                  'No. matching POS-seqs', 'Unique POS']

for key, value in whole_dict.items():
    df = value['df']
    total = df.shape[0]

    # one summary row per column pair
    results = []

    for col_a, col_b in column_pairs:
        # Compute each boolean mask once and reuse it; the original rebuilt
        # the same comparisons for every statistic.
        flag_set = df[col_b] == True
        # rows where the label is False while the filter flag is True
        filtered = (df[col_a] == False) & flag_set
        # rows where label and filter flag are both True
        both_true = (df[col_a] == True) & flag_set

        # number of rows hit by the filter, and everything else
        count_true = int(filtered.sum())
        count_false = total - count_true
        # Guard the division so an empty frame yields NaN rather than
        # raising ZeroDivisionError.
        share_filtered = count_true / total if total else float('nan')

        # mean of the label column over all rows
        share_true = df[col_a].mean()
        # Mean of the label column once the filtered rows are excluded.
        # Selecting by mask (instead of dropping by index label) stays
        # correct even if index labels are not unique.
        share_true_after_drop = df.loc[~filtered, col_a].mean()

        count_a_true_b_true = int(both_true.sum())
        # distinct 'pos_nonresolved_text' values among the both-True rows
        unique_pos_nonresolved_text = df.loc[both_true, 'pos_nonresolved_text'].nunique()

        results.append([
            col_a, total, count_true, count_false, share_filtered, share_true,
            share_true_after_drop, count_a_true_b_true, unique_pos_nonresolved_text
        ])

    # create a DataFrame from the results
    results_df = pd.DataFrame(results, columns=result_columns)

    # show which origin the table belongs to, then the table itself
    print(key)
    display(results_df)

protected


Unnamed: 0,Label,Total sentences,Filtered sentences,Remaining sentences,Share filtered,Postive Rate before,Positive rate after,No. matching POS-seqs,Unique POS
0,quot_label,289843,20971,268872,0.072353,0.251412,0.271021,7289,3924
1,link_label,289843,19235,270608,0.066364,0.62273,0.666994,16064,5829
2,namelink_label,289843,16463,273380,0.0568,0.741432,0.786082,18374,5679


excellent


Unnamed: 0,Label,Total sentences,Filtered sentences,Remaining sentences,Share filtered,Postive Rate before,Positive rate after,No. matching POS-seqs,Unique POS
0,quot_label,590756,58069,532687,0.098296,0.234428,0.259984,17404,9567
1,link_label,590756,54588,536168,0.092404,0.553379,0.609719,36996,14253
2,namelink_label,590756,45896,544860,0.07769,0.704541,0.763888,45699,14149


readworthy


Unnamed: 0,Label,Total sentences,Filtered sentences,Remaining sentences,Share filtered,Postive Rate before,Positive rate after,No. matching POS-seqs,Unique POS
0,quot_label,651668,66078,585590,0.101398,0.214348,0.238535,18439,10145
1,link_label,651668,62417,589251,0.09578,0.563186,0.622842,42822,15833
2,namelink_label,651668,52707,598961,0.08088,0.702017,0.763793,52078,15582


random


Unnamed: 0,Label,Total sentences,Filtered sentences,Remaining sentences,Share filtered,Postive Rate before,Positive rate after,No. matching POS-seqs,Unique POS
0,quot_label,707478,86016,621462,0.121581,0.185111,0.210732,22566,11510
1,link_label,707478,73950,633528,0.104526,0.601893,0.67215,61370,18596
2,namelink_label,707478,64617,642861,0.091334,0.693853,0.763596,68620,17909
