# PE Validation Fall 20

In [5]:
# import necessary modules 
# please run this cell every time you open the notebook
import pandas as pd 
import numpy as np 
import matplotlib as plt 
import seaborn as sns
import re
import os

Task1:

In [2]:
# figure out a way to read in schema txt files (inside the schema folder)
# for example, I have a source file (such as Source_relevance.csv) and 
# I know the question_label, but I want to build a function to know 
# whether this question (Q1/Q2..) is multiple-choice or checklist
# build a function to accomplish this
# func(schemafile(...txt), input(..csv), question_label(Q1/Q2...)) -> checklist/multiple choice 

Task2: 

In [3]:
# not only do I want to know whether it's multiple-choice or checklist 
# I also want to know how many different answer choices are there
# build a function to accomplish this 
# func(schemafile(...txt), input(..csv), question_label(Q1/Q2...)) -> # of different answer choices

Task3:

In [4]:
# figure out how to uniquely identify each submission, 
# since there might be one contributor who made 
# several submissions. 
# can't use contributor_uuid. What to use? 

Task4: 

In [5]:
# one-hot-encode checklist questions to [0, 1]
# for example, Q4 in source relevance task is a checklist question
# if someone selects both A1 and A2, then it's [1, 1]
# return a dataframe that has each question_label but list out the answers in [] format. 

## Experimentation ground below: 

## Charlie

## Katherine

## Yewen

### Questions: 

In [6]:
# # read json lines file 
# data = pd.read_json('2020 data/2020-09-25_pe_webhooks.jsonl', lines=True)
# data.head()

In [7]:
# source.columns

In [8]:
# source[source["contributor_uuid"] == "00f548b7-6b63-4b47-828e-8e416b6ca0e2"][["contributor_uuid", "created", "finish_time", "elapsed_seconds", "question_label", "answer_label", "answer_uuid"]]

## Something to keep in mind!

**In Specialist CSVs, one "quiz_task_uuid" can have multiple contributors. To uniquely identify each input, use "quiz_taskrun_uuid".**

## Helper functions

In [15]:
# check if a certain column contains any duplicate values 
# return true if all values in the column are unique
def check_duplicate(df, col):
    """Tell whether column ``col`` of ``df`` holds only distinct values.

    Note the polarity: the result is True when the column has NO duplicates
    and False as soon as any value is repeated.

    df  - any dataframe
    col - column label
    """
    column = df[col]
    return column.is_unique

In [39]:
# count how many Nan rows are there in a df 
def count_nan_rows(df):
    """Return the number of rows of ``df`` in which every cell is NaN."""
    total_rows = len(df)
    rows_with_data = df.dropna(axis=0, how='all')
    return total_rows - len(rows_with_data)

In [47]:
# only select that value, useful for constructing pivot_table
def select(x):
    """Identity function: return ``x`` unchanged.

    This notebook passes it as ``aggfunc`` to ``pd.pivot_table`` so that the
    pivoted cells keep the original value instead of an aggregate.
    """
    return x

## Specialist

In [58]:
# %load krippendorff.py
"""
This module provides a function to compute the Krippendorff's alpha statistical measure of the agreement achieved
when coding a set of units based on the values of a variable.

For more information, see: https://en.wikipedia.org/wiki/Krippendorff%27s_alpha

The module naming follows the one from the Wikipedia link.
"""
from typing import Any, Callable, Iterable, Optional, Sequence, Union

import numpy as np


def _nominal_metric(v1: np.ndarray, v2: np.ndarray, dtype: Any = np.float64, **kwargs) -> np.ndarray:  # noqa
    """Metric for nominal data."""
    return (v1 != v2).astype(dtype)


def _ordinal_metric(v1: np.ndarray, v2: np.ndarray, i1: np.ndarray, i2: np.ndarray,  # noqa
                    n_v: np.ndarray, dtype: Any = np.float64, **kwargs) -> np.ndarray:  # noqa
    """Metric for ordinal data."""
    i1, i2 = np.minimum(i1, i2), np.maximum(i1, i2)

    ranges = np.dstack((i1, i2 + 1))
    sums_between_indices = np.add.reduceat(np.append(n_v, 0), ranges.reshape(-1))[::2].reshape(*i1.shape)

    return (sums_between_indices - np.divide(n_v[i1] + n_v[i2], 2, dtype=dtype)) ** 2


def _interval_metric(v1: np.ndarray, v2: np.ndarray, dtype: Any = np.float64, **kwargs) -> np.ndarray:  # noqa
    """Metric for interval data."""
    return (v1 - v2).astype(dtype) ** 2


def _ratio_metric(v1: np.ndarray, v2: np.ndarray, dtype: Any = np.float64, **kwargs) -> np.ndarray:  # noqa
    """Metric for ratio data."""
    v1_plus_v2 = v1 + v2
    return np.divide(v1 - v2, v1_plus_v2, out=np.zeros(np.broadcast(v1, v2).shape), where=v1_plus_v2 != 0,
                     dtype=dtype) ** 2


def _coincidences(value_counts: np.ndarray, dtype: Any = np.float64) -> np.ndarray:
    """Coincidence matrix.

    Parameters
    ----------
    value_counts : ndarray, with shape (N, V)
        Number of coders that assigned a certain value to a determined unit, where N is the number of units
        and V is the value count.

    dtype : data-type
        Result and computation data-type.

    Returns
    -------
    o : ndarray, with shape (V, V)
        Coincidence matrix.
    """
    N, V = value_counts.shape
    pairable = np.maximum(value_counts.sum(axis=1), 2)
    diagonals = value_counts[:, np.newaxis, :] * np.eye(V)[np.newaxis, ...]
    unnormalized_coincidences = value_counts[..., np.newaxis] * value_counts[:, np.newaxis, :] - diagonals
    return np.divide(unnormalized_coincidences, (pairable - 1).reshape((-1, 1, 1)), dtype=dtype).sum(axis=0)


def _random_coincidences(n_v: np.ndarray, dtype: Any = np.float64) -> np.ndarray:
    """Random coincidence matrix.

    Parameters
    n_v : ndarray, with shape (V,)
        Number of pairable elements for each value.

    dtype : data-type
        Result and computation data-type.

    Returns
    -------
    e : ndarray, with shape (V, V)
        Random coincidence matrix.
    """
    return np.divide(np.outer(n_v, n_v) - np.diagflat(n_v), n_v.sum() - 1, dtype=dtype)


def _distances(value_domain: np.ndarray, distance_metric: Callable[..., np.ndarray], n_v: np.ndarray,
               dtype: Any = np.float64) -> np.ndarray:
    """Distances of the different possible values.

    Parameters
    ----------
    value_domain : ndarray, with shape (V,)
        Possible values V the units can take.
        If the level of measurement is not nominal, it must be ordered.

    distance_metric : callable
        Callable that return the distance of two given values.

    n_v : ndarray, with shape (V,)
        Number of pairable elements for each value.

    dtype : data-type
        Result and computation data-type.

    Returns
    -------
    d : ndarray, with shape (V, V)
        Distance matrix for each value pair.
    """
    indices = np.arange(len(value_domain))
    return distance_metric(value_domain[:, np.newaxis], value_domain[np.newaxis, :], i1=indices[:, np.newaxis],
                           i2=indices[np.newaxis, :], n_v=n_v, dtype=dtype)


def _distance_metric(level_of_measurement: Union[str, Callable[..., np.ndarray]]) -> Callable[..., np.ndarray]:
    """Distance metric callable of the level of measurement.

    Parameters
    ----------
    level_of_measurement : string or callable
        Steven's level of measurement of the variable.
        It must be one of 'nominal', 'ordinal', 'interval', 'ratio' or a callable.

    Returns
    -------
    metric : callable
        Distance callable.
    """
    built_in_metrics = {
        'nominal': _nominal_metric,
        'ordinal': _ordinal_metric,
        'interval': _interval_metric,
        'ratio': _ratio_metric,
    }
    try:
        return built_in_metrics[level_of_measurement]
    except KeyError:
        # anything that is not one of the four names is handed back unchanged
        return level_of_measurement


def _reliability_data_to_value_counts(reliability_data: np.ndarray, value_domain: np.ndarray) -> np.ndarray:
    """Return the value counts given the reliability data.

    Parameters
    ----------
    reliability_data : ndarray, with shape (M, N)
        Reliability data matrix which has the rate the i coder gave to the j unit, where M is the number of raters
        and N is the unit count.
        Missing rates are represented with `np.nan`.

    value_domain : ndarray, with shape (V,)
        Possible values the units can take.

    Returns
    -------
    value_counts : ndarray, with shape (N, V)
        Number of coders that assigned a certain value to a determined unit, where N is the number of units
        and V is the value count.
    """
    return (reliability_data.T[..., np.newaxis] == value_domain[np.newaxis, np.newaxis, :]).sum(axis=1)  # noqa


def alpha(reliability_data: Optional[Iterable[Any]] = None, value_counts: Optional[np.ndarray] = None,
          value_domain: Optional[Sequence[Any]] = None,
          level_of_measurement: Union[str, Callable[..., Any]] = 'interval', dtype: Any = np.float64) -> float:
    """Compute Krippendorff's alpha.

    See https://en.wikipedia.org/wiki/Krippendorff%27s_alpha for more information.

    Parameters
    ----------
    reliability_data : array_like, with shape (M, N)
        Reliability data matrix which has the rate the i coder gave to the j unit, where M is the number of raters
        and N is the unit count.
        Missing rates are represented with `np.nan`.
        If it's provided then `value_counts` must not be provided.

    value_counts : array_like, with shape (N, V)
        Number of coders that assigned a certain value to a determined unit, where N is the number of units
        and V is the value count.
        If it's provided then `reliability_data` must not be provided.

    value_domain : array_like, with shape (V,)
        Possible values the units can take.
        If the level of measurement is not nominal, it must be ordered.
        If `reliability_data` is provided, then the default value is the ordered list of unique rates that appear.
        Else, the default value is `list(range(V))`.

    level_of_measurement : string or callable
        Steven's level of measurement of the variable.
        It must be one of 'nominal', 'ordinal', 'interval', 'ratio' or a callable.

    dtype : data-type
        Result and computation data-type.

    Returns
    -------
    alpha : ndarray
        Scalar value of Krippendorff's alpha of type `dtype`.
        The result is `nan` (0 / 0) when no unit received at least two ratings, because both the observed and
        the expected coincidence matrices are then all zeros.

    Raises
    ------
    ValueError
        If both or neither of `reliability_data` and `value_counts` are provided.
    AssertionError
        If the data contains values outside `value_domain`, if `value_domain` does not match the number of
        columns of `value_counts`, or if the domain has fewer than two values.

    Examples
    --------
    >>> reliability_data = [[np.nan, np.nan, np.nan, np.nan, np.nan, 3, 4, 1, 2, 1, 1, 3, 3, np.nan, 3],
    ...                     [1, np.nan, 2, 1, 3, 3, 4, 3, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
    ...                     [np.nan, np.nan, 2, 1, 3, 4, 4, np.nan, 2, 1, 1, 3, 3, np.nan, 4]]
    >>> print(round(alpha(reliability_data=reliability_data, level_of_measurement='nominal'), 6))
    0.691358
    >>> print(round(alpha(reliability_data=reliability_data, level_of_measurement='interval'), 6))
    0.810845
    >>> value_counts = np.array([[1, 0, 0, 0],
    ...                          [0, 0, 0, 0],
    ...                          [0, 2, 0, 0],
    ...                          [2, 0, 0, 0],
    ...                          [0, 0, 2, 0],
    ...                          [0, 0, 2, 1],
    ...                          [0, 0, 0, 3],
    ...                          [1, 0, 1, 0],
    ...                          [0, 2, 0, 0],
    ...                          [2, 0, 0, 0],
    ...                          [2, 0, 0, 0],
    ...                          [0, 0, 2, 0],
    ...                          [0, 0, 2, 0],
    ...                          [0, 0, 0, 0],
    ...                          [0, 0, 1, 1]])
    >>> print(round(alpha(value_counts=value_counts, level_of_measurement='nominal'), 6))
    0.691358
    >>> # The following examples were extracted from
    >>> # https://www.statisticshowto.datasciencecentral.com/wp-content/uploads/2016/07/fulltext.pdf, page 8.
    >>> reliability_data = [[1, 2, 3, 3, 2, 1, 4, 1, 2, np.nan, np.nan, np.nan],
    ...                     [1, 2, 3, 3, 2, 2, 4, 1, 2, 5, np.nan, 3],
    ...                     [np.nan, 3, 3, 3, 2, 3, 4, 2, 2, 5, 1, np.nan],
    ...                     [1, 2, 3, 3, 2, 4, 4, 1, 2, 5, 1, np.nan]]
    >>> print(round(alpha(reliability_data, level_of_measurement='ordinal'), 3))
    0.815
    >>> print(round(alpha(reliability_data, level_of_measurement='ratio'), 3))
    0.797
    >>> reliability_data = [["very low", "low", "mid", "mid", "low", "very low", "high", "very low", "low", np.nan,
    ...                      np.nan, np.nan],
    ...                     ["very low", "low", "mid", "mid", "low", "low", "high", "very low", "low", "very high",
    ...                      np.nan, "mid"],
    ...                     [np.nan, "mid", "mid", "mid", "low", "mid", "high", "low", "low", "very high", "very low",
    ...                      np.nan],
    ...                     ["very low", "low", "mid", "mid", "low", "high", "high", "very low", "low", "very high",
    ...                      "very low", np.nan]]
    >>> print(round(alpha(reliability_data, level_of_measurement='ordinal',
    ...                   value_domain=["very low", "low", "mid", "high", "very high"]), 3))
    0.815
    """
    if (reliability_data is None) == (value_counts is None):
        raise ValueError("Either reliability_data or value_counts must be provided, but not both.")

    # Don't know if it's a list or numpy array. If it's the latter, the truth value is ambiguous. So, ask for None.
    if value_counts is None:
        reliability_data = np.asarray(reliability_data)

        # pd.isnull is used instead of np.isnan because it also accepts object-dtype arrays
        # (e.g. a pivot table mixing integers and NaN), on which np.isnan raises a TypeError.
        missing = pd.isnull(reliability_data)
        if reliability_data.dtype.kind == 'U':
            # When the ratings are strings, np.asarray has already turned np.nan into the text 'nan',
            # which pd.isnull does not flag; treat it as missing rather than as a rating.
            missing = missing | (reliability_data == 'nan')

        if value_domain is None:
            value_domain = np.unique(reliability_data[~missing])
        else:
            value_domain = np.asarray(value_domain)
            # Only the present ratings are checked: np.isin never matches NaN against NaN, so including
            # the missing cells would wrongly reject numeric data that has gaps.
            assert np.isin(reliability_data[~missing], value_domain).all(), \
                "The reliability data contains out-of-domain values."

        value_counts = _reliability_data_to_value_counts(reliability_data, value_domain)
    else:  # elif reliability_data is None
        value_counts = np.asarray(value_counts)

        if value_domain is None:
            value_domain = np.arange(value_counts.shape[1])
        else:
            value_domain = np.asarray(value_domain)
            assert value_counts.shape[1] == len(value_domain), \
                "The value domain should be equal to the number of columns of value_counts."

    assert len(value_domain) > 1, "There has to be more than one value in the domain."

    distance_metric = _distance_metric(level_of_measurement)

    # o: observed coincidences, e: coincidences expected by chance, d: distance between each pair of values
    o = _coincidences(value_counts, dtype=dtype)
    n_v = o.sum(axis=0)
    e = _random_coincidences(n_v, dtype=dtype)
    d = _distances(value_domain, distance_metric, n_v, dtype=dtype)
    # alpha = 1 - observed disagreement / expected disagreement
    return 1 - (o * d).sum() / (e * d).sum()


In [16]:
# load the example data (a Specialist "DataHunt" CSV export) into `relevance`
# NOTE(review): "escape-" does not look like a standard Python codec name — confirm the intended
# encoding (possibly "unicode_escape") and that this call still succeeds on a fresh kernel.
relevance = pd.read_csv("Specialists data/Covid_SourceRelevancev1-2020-10-04T1838-DataHunt.csv", encoding="escape-")

In [17]:
relevance.columns

Index(['namespace', 'schema_sha256', 'quiz_task_uuid', 'task_url', 'tua_uuid',
       'article_batch_name', 'article_number', 'article_filename',
       'article_sha256', 'article_text_length', 'destination',
       'task_redundancy', 'taskrun_count', 'quiz_taskrun_uuid',
       'contributor_uuid', 'created', 'finish_time', 'elapsed_seconds',
       'topic_name', 'question_label', 'question_text', 'answer_label',
       'answer_content', 'answer_uuid', 'submitted_tua_uuid', 'answer_text',
       'case_number', 'highlight_count', 'start_pos', 'end_pos',
       'target_text'],
      dtype='object')

**Question 2: "Does the experience/knowledge/position of the quoted source qualify them to speak or write such claims?" is a categorical question**

In [18]:
relevance["question_label"].unique()

array(['T1.Q1', 'T1.Q2', 'T1.Q3', 'T1.Q5', 'T1.Q6', 'T1.Q7', 'T1.Q8',
       'T1.Q9', 'T1.Q10', 'T1.Q4'], dtype=object)

In [23]:
# keep only the answer rows whose question_label is "T1.Q2"
relevance_q2 = relevance[relevance["question_label"] == "T1.Q2"]

In [27]:
# keep only the columns needed to build the reliability matrix:
# the submission id, the rater id, the question and the chosen answer
cols = ["quiz_taskrun_uuid", "contributor_uuid", "question_label", "answer_label"]
relevance_q2 = relevance_q2.reindex(columns=cols)
relevance_q2.head()

Unnamed: 0,quiz_taskrun_uuid,contributor_uuid,question_label,answer_label
1,aebae1f3-707a-4145-9fc3-7c5cf79bf0b8,00f548b7-6b63-4b47-828e-8e416b6ca0e2,T1.Q2,T1.Q2.A1
18,fee75365-83d0-4c47-a520-73e0954af82c,e44f5799-6915-45fd-874e-060f6afedcb9,T1.Q2,T1.Q2.A2
40,4ee3886b-6e92-44a5-861f-7ffe2f3ad1df,fed45769-aa61-4b8b-847e-a9c799acce15,T1.Q2,T1.Q2.A2
53,8b229589-f037-4464-85cd-609de66d0612,fb4ae4cc-234b-484a-b541-a8e3a9254a24,T1.Q2,T1.Q2.A1
73,fc945f0d-ecb8-49a2-8e87-90a83efe543e,3bb03f58-5f09-4414-b54e-43a814b9b09a,T1.Q2,T1.Q2.A1


**Construct the reliability matrix: each row is a contributor_uuid (rater) and each column (unit) is a quiz_taskrun_uuid.**

In [70]:
# rows = contributor_uuid, columns = quiz_taskrun_uuid, cells = the answer_label (kept as-is by `select`)
# NOTE(review): the "Something to keep in mind" note in this notebook says a quiz_taskrun_uuid identifies a
# single input; if each column therefore holds only one rating, no unit is pairable and alpha() evaluates
# to nan (as printed further down). Consider using quiz_task_uuid as the unit instead — confirm.
df = pd.pivot_table(relevance_q2, values='answer_label', index='contributor_uuid', columns='quiz_taskrun_uuid', aggfunc=select)
# drop columns (axis=1) in which every value is NaN
df = df.dropna(axis=1, how='all')
df

quiz_taskrun_uuid,00279610-b46d-4404-9037-3845373e9b62,0158e65d-c960-4565-96ab-574a5fcb750f,036dcfc5-d989-4d1a-a3ea-dcfe2beafd5c,0408ab23-08c4-44f7-84d0-346001d59e93,0734c61c-0448-461b-8dde-7b6abd8003d8,087e32bc-9701-42ef-835c-5b7ba62ce11d,0945a43f-7780-4628-87c4-67d3b0bd3981,0c0adb65-8241-4890-81eb-7e3bece8f0bc,0d209bb1-1795-44f4-8304-de766e94aade,0f0db5e0-540b-4ab7-9062-35170c03320b,...,fc5b9d46-43e7-4472-a112-23a72620a4e3,fc945f0d-ecb8-49a2-8e87-90a83efe543e,fcfd8244-3230-4ea7-ab49-3e49ab70ade9,fd487eb7-857e-4f14-b42f-ea45e67f7d1c,fd49c6a4-9033-47b1-974a-319ec93be21c,fdfb77fa-196e-4aa4-8d07-949d92a1b25c,fe3278b6-e072-46c2-ada3-3bbd148f13f6,fe552cc8-8090-4538-adf7-d963ff1de5a1,fee75365-83d0-4c47-a520-73e0954af82c,ffd3215a-8fc3-4ac7-a59a-0f9c43be2683
contributor_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00f548b7-6b63-4b47-828e-8e416b6ca0e2,,,,T1.Q2.A5,,,,,,,...,,,,,,,,,,
06e1eba7-c695-4495-85a7-2fb33ef9412c,,,,,,,,,,,...,,,,,,,,,,
0b2ee377-65f2-4ac1-98da-5bc093fee7aa,,,,,,,,,,,...,,,,,,,,,,
0cdf31d0-1966-4f69-a0c8-c483788b5256,,,,,,,,,,,...,,,,,,,,,,
0e4b8a3e-2714-4b5f-ba10-bb576aaa5cc6,,,,,,,,,,,...,,,,,,,,,,
0e81b339-ba89-48a6-b6eb-3160c5ac61db,,,,,,,,,,,...,,,,,,,,,,
0f369711-c76f-497a-bffe-4d10a3ab3619,,,,,,,,,,,...,,,,,,,,,,
172e9a38-d3e4-453e-8556-18bbd0fcb07e,,,,,,,,,,,...,,,,,,,,,,
23338303-8192-4d70-8e03-508a7120d1c3,,,,,,,,,,,...,,,,,,,,,,
28740234-10c8-4421-b56a-c7150e000d48,,,,,,,,,,,...,,,,,,,,,,


In [71]:
# convert the pivot table to a plain ndarray (one row per contributor) and peek at the first row
array = df.to_numpy()
array[0]

array([nan, nan, nan, 'T1.Q2.A5', nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, 'T1.Q2.A2', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       'T1.Q2.A2', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, 'T1.Q2.A5', nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'T1.Q2.A5',
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, 'T1.Q2.A5', nan, nan,
       'T1.Q2.A1', 'T1.Q2.A5', nan, nan, nan, nan, nan, n

In [72]:
array.shape

(47, 258)

In [73]:
pd.isnull('T1.Q2.A1')

False

In [74]:
pd.isnull(np.nan)

True

In [75]:
'T1.Q2.A1'[-1]

'1'

In [76]:
# iterate through all values in array and convert each answer label to its integer answer number,
# e.g. 'T1.Q2.A1' -> 1 and 'T1.Q2.A10' -> 10
for i in range(array.shape[0]):
    for j in range(array.shape[1]):
        cur = array[i][j]
        # only strings are converted: NaN cells stay NaN, and cells that are already integers
        # (because this cell was run before) are left alone instead of raising a TypeError
        if isinstance(cur, str):
            # take everything after the last 'A' rather than just the last character,
            # so that answer numbers with more than one digit are not truncated
            array[i][j] = int(cur.rsplit("A", 1)[-1])

In [77]:
array[0]

array([nan, nan, nan, 5, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 2, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, 2, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, 5, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, 5, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 5, nan, nan,
       1, 5, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan

In [78]:
print(round(alpha(array, level_of_measurement='nominal'), 3))

nan




## Triager to uAlpha

In [9]:
# create mapping for topic_name
# topic names in code order: the first name maps to 1, the second to 2, and so on
dic = {
    topic: code
    for code, topic in enumerate(
        [
            "Quoted Sources",
            "Arguments",
            "Assertions",
            "Needs Fact-Check",
            "Language",
            "Reasoning",
            "Probability",
            "Evidence",
        ],
        start=1,
    )
}

    
# mapping topic_name to integers 
def mapping(df, dic):
    """Replace the "topic_name" column of ``df`` with its mapped values, in place.

    df  - dataframe to be mapped (modified in place and also returned)
    dic - dictionary of mapping; names missing from it become NaN
    """
    mapped_topics = df["topic_name"].map(dic)
    df["topic_name"] = mapped_topics
    return df


# filter df and select only certain cols as input
def slice_input(df):
    """Return a new frame holding only the Triager input columns, in a fixed order.

    Columns that ``df`` does not have are created and filled with NaN.
    """
    wanted = [
        "contributor_uuid",
        "topic_name",
        "start_pos",
        "end_pos",
        "article_text_length",
        "article_number",
        "created",
    ]
    return df.reindex(columns=wanted)


# filter df and select only certain cols as output
def slice_output(df):
    """Return a new frame holding only the uAlpha output columns, in a fixed order.

    Columns that ``df`` does not have are created and filled with NaN.
    """
    wanted = [
        "contributor_uuid",
        "topic_name",
        "blank",
        "start_pos_adjusted",
        "end_pos_adjusted",
    ]
    return df.reindex(columns=wanted)
    


# read in a directory that contains Triager csvs 
# split each Triager csv to 4 other csvs based on topic_name (argument, reasoning, etc)
# and write it as csvs in the output directory
def triager_split(input_directory, output_directory):
    """Split every Triager CSV found in ``input_directory`` into per-topic CSVs.

    input_directory  - e.g. "Triager data"
    output_directory - e.g. "Triager output" (created if it does not exist yet)

    Only regular files whose name ends in ".csv" (case-insensitive) are processed; anything else in
    the directory, such as Jupyter's ".ipynb_checkpoints" folder, is skipped instead of being handed
    to the CSV reader. Files are handled in sorted name order so repeated runs behave the same.
    """
    os.makedirs(output_directory, exist_ok=True)
    for file in sorted(os.listdir(input_directory)):
        is_csv_file = (os.path.isfile(os.path.join(input_directory, file))
                       and file.lower().endswith(".csv"))
        if not is_csv_file:
            continue
        triager_split_help(input_directory, file, output_directory)
    print("Triager Tranformation Done! Ready to be imported to uAlpha!")

In [10]:
# resolve overlapping issues by merging overlapping rows!!!
def resolve_overlapping(df):
    """Merge overlapping highlight spans and return the (in-place modified) frame.

    df - a dataframe with at least the columns "created", "start_pos_adjusted" and
         "end_pos_adjusted". Rows sharing a "created" value are treated as one group, and within
         a group the rows must already be sorted by start position.

    Within each group, whenever a row starts at or before the end of the previous row, the two
    are merged: the earlier row's end is extended to the larger of the two ends and the later row
    is dropped. ``df`` is modified in place and its index is reset to 0..n-1.
    """
    # look the columns up by name so the function does not depend on a fixed column layout
    start_col = df.columns.get_loc("start_pos_adjusted")
    end_col = df.columns.get_loc("end_pos_adjusted")
    # the row arithmetic below treats index labels as positions, so force a 0..n-1 index
    df.reset_index(drop=True, inplace=True)

    # iterate through all unique "created" values
    for time in df["created"].unique():
        # first and last row of that group in the current (possibly already shortened) frame
        group_rows = df.index[df["created"] == time]
        i = group_rows[0]
        end_i = group_rows[-1]
        while i < end_i:
            if df.iat[i + 1, start_col] > df.iat[i, end_col]:
                # the next row starts after this one ends: no overlap, move on
                i += 1
                continue
            # overlap: extend this row to cover the next one if the next one reaches further
            if df.iat[i, end_col] < df.iat[i + 1, end_col]:
                df.iat[i, end_col] = df.iat[i + 1, end_col]
            # drop the next row and close the gap in the index
            df.drop(i + 1, inplace=True)
            df.reset_index(drop=True, inplace=True)
            # the group is one row shorter now; `i` stays put so the merged row is
            # compared against its new neighbour
            end_i -= 1
    return df

In [11]:
# add adjusted start_pos and adjusted end_pos for df 
def adjust(df):
    """Add "cumulative", "start_pos_adjusted" and "end_pos_adjusted" columns to ``df``.

    df - a dataframe with ["article_number", "article_text_length", "start_pos", "end_pos"] cols,
         with the rows of each article stored next to each other.

    Positions are shifted so that consecutive articles are laid end to end: every row's start/end
    is increased by the summed text length of all articles that appear before its own article.
    "cumulative" is that offset plus the length of the row's own article. A new article is
    detected whenever "article_number" differs from the previous row.

    ``df`` is modified in place and returned. Rows are processed by position, so any index works,
    and an empty frame simply gets three empty columns.
    """
    n_rows = len(df)
    if n_rows == 0:
        for column in ("cumulative", "start_pos_adjusted", "end_pos_adjusted"):
            df[column] = []
        return df

    article_number = df["article_number"].to_numpy()
    article_text_length = df["article_text_length"].to_numpy()

    # True on the first row of every run of equal article numbers
    starts_article = np.ones(n_rows, dtype=bool)
    starts_article[1:] = article_number[1:] != article_number[:-1]

    # running total of article lengths, counting each article once (at its first row)
    cumulative = np.cumsum(np.where(starts_article, article_text_length, 0))
    # position of the first row of the article each row belongs to
    first_row_of_article = np.maximum.accumulate(np.where(starts_article, np.arange(n_rows), 0))
    # total length of all articles before the current one
    offset = cumulative - article_text_length[first_row_of_article]

    df["cumulative"] = cumulative
    df["start_pos_adjusted"] = df["start_pos"].to_numpy() + offset
    df["end_pos_adjusted"] = df["end_pos"].to_numpy() + offset
    return df

In [12]:
# take in a Triager csv and split it to 4 different csvs 
# based on topic_name 
def triager_split_help(input_directory, fileName, output_directory):
    """Split one Triager CSV into one uAlpha-ready CSV per topic_name.

    input_directory  - e.g. "Triager data"
    fileName         - e.g. "Covid_Form1.0.adjudicated-2020-10-04T2314-Tags.csv"
    output_directory - e.g. "Triager output"

    For every distinct, non-missing topic_name a file "<batch>-Triager-<topic>.csv" is written to
    ``output_directory``, where <batch> is the part of ``fileName`` before the first underscore.
    Each file has no header row and holds, per highlight: a "u<row>" label, the first 6 characters
    of contributor_uuid, the topic as an integer (via the module-level ``dic``), an empty column,
    and the adjusted start and end positions. Returns None.
    """
    source_path = os.path.join(input_directory, fileName)
    df = pd.read_csv(source_path)
    # the batch_name, e.g. "Covid"
    batch_name = fileName.split('_')[0]
    # rows without a topic_name are skipped: NaN never compares equal to itself, so selecting
    # them would give an empty frame that the later steps cannot process
    topic_names = df["topic_name"].dropna().unique()
    for topic in topic_names:
        # select all rows for that specific topic name
        df_topic = df.loc[df["topic_name"] == topic]
        # select only these columns (reindex returns a new frame, so the edits below don't touch df)
        filtered = slice_input(df_topic)
        # slice contributor_uuid with the first 6 chars
        filtered["contributor_uuid"] = filtered["contributor_uuid"].str[:6]
        # map topic_name to integers (mapping works in place)
        mapping(filtered, dic)
        # add a blank column
        filtered.insert(loc=3, column="blank", value="")
        # sort so each contributor's highlights within an article are adjacent and ordered by start
        filtered = filtered.sort_values(by=["article_number", "contributor_uuid", "start_pos"])
        # re-index such that its index starts from 0 again
        filtered.reset_index(drop=True, inplace=True)
        # add modified columns
        filtered = adjust(filtered)
        # resolve overlapping issues
        filtered = resolve_overlapping(filtered)
        # filter and select only certain columns
        output = slice_output(filtered)
        # sort again by contributor
        output = output.sort_values(by=["contributor_uuid", "start_pos_adjusted"])
        # label the rows u0, u1, ... in their sorted order and use that as index
        index = 'u' + pd.Series(filtered.index).astype(str)
        output.set_index(keys=index, inplace=True)
        output_name = '{0}-Triager-{1}.csv'.format(batch_name, topic)
        path = os.path.join(output_directory, output_name)
        # write to csvs
        output.to_csv(path, header=False)
    return None

In [13]:
triager_split("Triager data", "Triager output")

Triager Tranformation Done! Ready to be imported to uAlpha!


**Example of resolving overlap**

In [14]:
a = pd.read_csv("example.csv")
a.drop(a.columns[0], axis=1, inplace=True)
a

Unnamed: 0,contributor_uuid,topic_name,start_pos,blank,end_pos,article_text_length,article_number,created,cumulative,start_pos_adjusted,end_pos_adjusted
0,aac18e,2,136,,287,1870,100054,2020-03-18 00:15:42.364545,5755,4021,4172
1,aac18e,2,563,,776,1870,100054,2020-03-18 00:15:42.364545,5755,4448,4661
2,aac18e,2,651,,667,1870,100054,2020-03-18 00:15:42.364545,5755,4536,4552


In [15]:
a = resolve_overlapping(a)
a

Unnamed: 0,contributor_uuid,topic_name,start_pos,blank,end_pos,article_text_length,article_number,created,cumulative,start_pos_adjusted,end_pos_adjusted
0,aac18e,2,136,,287,1870,100054,2020-03-18 00:15:42.364545,5755,4021,4172
1,aac18e,2,563,,776,1870,100054,2020-03-18 00:15:42.364545,5755,4448,4661


In [16]:
# debugging function - find potential overlapping issue 
def detect_overlapping(input_directory, fileName):
    """Debugging aid: print the first overlapping highlight of every user in an output CSV.

    input_directory - e.g. "Triager output"
    fileName        - e.g. "Covid-Triager-Arguments.csv"

    The file is read without a header; column 1 is taken as the user, column 4 as the start
    position and column 5 as the end position. For each user the rows are sorted by start
    position, and the first row that does not start after the previous row's end is reported.
    Nothing is printed for users without such a row. Always returns None.
    """
    path = os.path.join(input_directory, fileName)
    df = pd.read_csv(path, header=None)
    for user in df.iloc[:, 1].unique():
        # work on a sorted copy of just this user's rows
        rows = df.loc[df[1] == user].sort_values(by=4)
        starts = np.array(rows[4])
        ends = np.array(rows[5])
        # pair every row (from the second on) with the end of the row before it
        for previous_end, start, end in zip(ends[:-1], starts[1:], ends[1:]):
            if start > previous_end:
                continue
            print('user {0}, start_pos {1}, end_pos {2} begin overlapping!'.format(user, start, end))
            break
    return None

In [17]:
detect_overlapping("Triager output", "Covid-Triager-Arguments.csv")

**Example of removing a row**

In [18]:
a = pd.DataFrame(index=np.array([23, 24, 25, 26, 27]), data={"first": [1, 2, 3, 4, 5]})
a

Unnamed: 0,first
23,1
24,2
25,3
26,4
27,5


In [19]:
a.drop(25, inplace=True)
a

Unnamed: 0,first
23,1
24,2
26,4
27,5
