In [25]:
# import necessary modules 
# please run this cell every time you open the notebook
import pandas as pd 
import numpy as np 
import matplotlib as plt 
import seaborn as sns
import re
import os

## Something to keep in mind!

**In Specialist CSVs, one "quiz_task_uuid" can have multiple contributors. To uniquely identify each input, use "quiz_taskrun_uuid".**

## Helper functions

In [6]:
# mapping for Triager tasks
# Topic names listed in the order of their integer codes; the code of a
# topic is its 1-based position in this list.
triager_dic = {
    topic: code
    for code, topic in enumerate(
        [
            "Quoted Sources",
            "Arguments",
            "Assertions",
            "Needs Fact-Check",
            "Language",
            "Reasoning",
            "Probability",
            "Evidence",
        ],
        start=1,
    )
}

In [16]:
# dic for Specialists tasks
# Per-topic lists of question numbers, grouped by question kind.
# NOTE(review): "mc" / "cl" / "tx" presumably stand for multiple-choice,
# checklist and free-text questions -- confirm with the schema owner.
specialist_dic = {
    "Argument relevance": dict(
        mc=dict(categorical=[1], ordinal=[2, 3]),
        tx=[4],
    ),
    "Evidence Specialist": dict(
        mc=dict(categorical=[6, 9], ordinal=[4, 5, 8, 10, 11, 12, 14, 15]),
        cl=[1, 2],
        tx=[3, 7, 16],
    ),
    "Holistic Evaluation of Article": dict(
        mc=dict(categorical=[1, 5, 14], ordinal=[7, 8, 11, 12, 13, 17, 18]),
        cl=[3, 6, 9, 10, 15],
        tx=[2, 4, 16, 19],
    ),
    "Language Specialist V4": dict(
        mc=dict(categorical=[], ordinal=[2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15]),
        cl=[1, 6],
        tx=[7, 16],
    ),
    "Probability Specialist": dict(
        mc=dict(categorical=[6], ordinal=[1, 2, 4, 5, 7, 8, 9, 10, 11, 14, 15]),
        cl=[12],
        tx=[16],
    ),
    # NOTE(review): the two entries below do not follow the shape of the
    # others (an empty dict and a bare integer) -- looks unfinished; confirm.
    "Reasoning Specialist": dict(),
    "Quote Source Relevance": 7,
}

In [15]:
# check if a certain column contains any duplicate values 
# return true if all values in the column are unique
def check_duplicate(df, col):
    """Report whether every value in column ``col`` of ``df`` is distinct.

    Note the direction of the result: ``True`` means the column holds *no*
    repeated value, ``False`` means at least one value occurs more than once.

    df  - any dataframe
    col - column label
    """
    has_repeats = df[col].duplicated().any()
    return not has_repeats

In [23]:
# count how many Nan rows are there in a df 
# Nan rows are rows that have all missing values
def count_nan_rows(df):
    """Return how many rows of ``df`` consist entirely of missing values."""
    row_is_empty = df.isna().all(axis=1)
    return int(row_is_empty.sum())

In [22]:
# count how many nan cols are there in a df
# nan cols are cols where all values are missing 
def count_nan_cols(df):
    """Return how many columns of ``df`` consist entirely of missing values."""
    col_is_empty = df.isna().all(axis=0)
    return int(col_is_empty.sum())

In [19]:
# only select that value, useful for constructing pivot_table
def select(x):
    """Identity function: hand back ``x`` untouched.

    Passed as ``aggfunc`` to ``pivot_table`` so the grouped value is kept
    as-is instead of being aggregated.
    """
    unchanged = x
    return unchanged

In [12]:
# return the unique values sorted in a df's column
def unique(df, x):
    """Return the distinct values of column ``x`` of ``df`` as a sorted list.

    df - dataframe
    x  - the label of the column
    """
    distinct = list(df[x].unique())
    distinct.sort()
    return distinct

In [28]:
# convert all strings in the array to int 
# e.g. "T1.Q1.A5" -> 5 
def to_int(array):
    """Replace every answer label in a 2-d array by its trailing integer.

    e.g. "T1.Q1.A5" -> 5 and "T1.Q1.A12" -> 12 (the whole trailing run of
    digits is used, not just the last character).

    array - 2-d numpy array (object dtype) whose entries are answer labels
            such as "T1.Q1.A3", missing values, or already-converted numbers

    Missing entries are left untouched. Entries that are already numeric are
    passed through ``int`` so the function can safely be re-run on its own
    output. The array is modified in place and also returned. A preview of
    the converted data is shown with ``display`` (the IPython builtin).

    Raises ValueError if a string entry does not end in a digit.
    """
    for i in range(array.shape[0]):
        for j in range(array.shape[1]):
            cur = array[i, j]
            # leave missing cells as they are
            if pd.isnull(cur):
                continue
            if isinstance(cur, str):
                # take the full run of digits at the end of the label
                trailing = re.search(r"\d+$", cur.strip())
                if trailing is None:
                    raise ValueError(
                        "answer label {0!r} does not end in a number".format(cur))
                array[i, j] = int(trailing.group())
            else:
                # already numeric (e.g. the function was run twice)
                array[i, j] = int(cur)
    df = pd.DataFrame(array)
    display(df.head())
    return array

In [23]:
# construct a reliability matrix from Specialist csv 
def to_reliability(input_directory, fileName, question_label):
    """Build a reliability matrix for one question of a Specialist csv.

    input_directory - e.g. "Specialists data"
    fileName        - e.g. "Covid_SourceRelevancev1-2020-10-04T1838-DataHunt.csv"
    question_label  - e.g. "Q2"

    The csv is filtered to the rows of ``question_label`` and pivoted so that
    each row is a ``contributor_uuid`` and each column a ``quiz_taskrun_uuid``,
    holding the ``answer_label``. The counts of all-missing rows and columns
    are printed, then the labels are converted to integers with ``to_int``.

    Returns the resulting 2-d numpy array.
    """
    csv_path = os.path.join(input_directory, fileName)
    raw = pd.read_csv(csv_path)

    # keep only the rows of the requested question and the columns we need
    wanted = ["quiz_taskrun_uuid", "contributor_uuid", "question_label", "answer_label"]
    answers = raw.loc[raw["question_label"] == question_label].reindex(columns=wanted)

    # contributors down the rows, task runs across the columns;
    # `select` hands each grouped value through unchanged
    matrix = answers.pivot_table(
        values='answer_label',
        index='contributor_uuid',
        columns='quiz_taskrun_uuid',
        aggfunc=select,
    )

    # report fully-empty rows / columns
    empty_rows = count_nan_rows(matrix)
    print("There are {0} rows where all values are missing".format(empty_rows))
    empty_cols = count_nan_cols(matrix)
    print("There are {0} cols where all values are missing".format(empty_cols))

    # answer labels -> integers, on the raw numpy values
    return to_int(matrix.to_numpy())

In [10]:
# extract topic name (Argument/Holistic...) from df 
def get_topic(df):
    """Return the topic name (Argument/Holistic/...) of a Specialist dataframe.

    df - e.g. df after reading in "Covid_Evidencev1-2020-10-04T1836-DataHunt.csv"

    Only the first row is read, which is cheaper than calling unique().
    The lookup is positional, so it also works on frames that were filtered
    or re-indexed and no longer have a row labelled 0.

    Raises IndexError if ``df`` has no rows.
    """
    return df["topic_name"].iloc[0]

In [17]:
# compute alpha values 
def compute_alpha(input_directory, fileName, question_label, measurement, decimal_places):
    """Compute a rounded alpha value for one question of a Specialist csv.

    input_directory - e.g. "Specialist data"
    fileName        - e.g. "Covid_ArgumentRelevancev1.2-2020-10-04T1802-DataHunt.csv"
    question_label  - e.g. "T1.Q2"
    measurement     - "nominal", "ordinal", "interval"
    decimal_places  - number of decimal places to round to

    The reliability matrix comes from ``to_reliability``. ``alpha`` is not
    defined in this cell; presumably it is provided by krippendorff.py, which
    the notebook loads with %run -- that cell must be executed first.
    """
    reliability = to_reliability(input_directory, fileName, question_label)
    score = alpha(reliability_data=reliability, level_of_measurement=measurement)
    return np.round(score, decimal_places)

## Specialist Tasks

In [33]:
# import krippendorff.py
%run krippendorff.py

## Triager to uAlpha

In [1]:
# mapping topic_name to integers 
def mapping(df, dic):
    """Translate the "topic_name" column of ``df`` through ``dic``.

    df  - dataframe to be mapped (modified in place)
    dic - dictionary of mapping

    Values that have no entry in ``dic`` become missing values.
    Returns the same dataframe object.
    """
    translated = df["topic_name"].map(dic)
    df["topic_name"] = translated
    return df


# filter df and select only certain cols as input
def slice_input(df):
    """Return a new frame holding only the Triager input columns, in fixed order.

    Columns absent from ``df`` are created and filled with missing values.
    """
    input_columns = (
        "contributor_uuid",
        "topic_name",
        "start_pos",
        "end_pos",
        "article_text_length",
        "article_number",
        "created",
    )
    return df.reindex(columns=list(input_columns))


# filter df and select only certain cols as output
def slice_output(df):
    """Return a new frame holding only the Triager output columns, in fixed order.

    Columns absent from ``df`` are created and filled with missing values.
    """
    output_columns = (
        "contributor_uuid",
        "topic_name",
        "blank",
        "start_pos_adjusted",
        "end_pos_adjusted",
    )
    return df.reindex(columns=list(output_columns))
    


# read in a directory that contains Triager csvs 
# split each Triager csv to 4 other csvs based on topic_name (argument, reasoning, etc)
# and write it as csvs in the output directory
def triager_split(input_directory, output_directory):
    """Split every Triager csv found in ``input_directory`` by topic.

    input_directory  - e.g. "Triager data"
    output_directory - e.g. "Triager output" (created if it does not exist)

    Each csv is handed to ``triager_split_help``, which writes the per-topic
    csvs into ``output_directory``. Entries that are not regular ``.csv``
    files (sub-directories such as ".ipynb_checkpoints", ".DS_Store", notes,
    ...) are skipped instead of being fed to the csv reader. Files are
    processed in sorted name order so runs are reproducible.
    """
    # writing the outputs fails if the target directory is missing
    os.makedirs(output_directory, exist_ok=True)
    for file in sorted(os.listdir(input_directory)):
        candidate = os.path.join(input_directory, file)
        if not os.path.isfile(candidate) or not file.lower().endswith(".csv"):
            continue
        triager_split_help(input_directory, file, output_directory)
    print("Triager Tranformation Done! Ready to be imported to uAlpha!")

In [10]:
# resolve overlapping issues by merging overlapping rows!!!
def resolve_overlapping(df):
    """Merge overlapping highlight rows within each ``created`` group.

    df - dataframe with at least the columns "created",
         "start_pos_adjusted" and "end_pos_adjusted"

    Rows sharing the same ``created`` value are walked in order. Whenever the
    next row starts at or before the end of the current row, the two are
    merged: the current row's end is extended to the larger of the two ends
    and the next row is dropped.

    Requirements (not checked):
      * the index must be 0..n-1, because row labels and row positions are
        used interchangeably;
      * rows of one ``created`` value must be contiguous and sorted by
        start position.

    ``df`` is modified in place and also returned.
    """
    # look the two columns up by name rather than by hard-coded position,
    # so the function does not depend on the caller's column layout
    start_col = df.columns.get_loc("start_pos_adjusted")
    end_col = df.columns.get_loc("end_pos_adjusted")

    # one pass per distinct `created` value
    for time in df["created"].unique():
        # first and last row of this group; recomputed for every group
        # because earlier merges shift the row numbers
        group_rows = df.index[df["created"] == time]
        i = group_rows[0]
        end_i = group_rows[-1]
        while i < end_i:
            if df.iat[i + 1, start_col] > df.iat[i, end_col]:
                # next row starts after this one ends: no overlap, move on
                i += 1
            else:
                # overlap: keep the larger end on the current row ...
                if df.iat[i, end_col] < df.iat[i + 1, end_col]:
                    df.iat[i, end_col] = df.iat[i + 1, end_col]
                # ... drop the absorbed row and close the gap in the index
                df.drop(i + 1, inplace=True)
                df.reset_index(drop=True, inplace=True)
                # the group is now one row shorter; stay on row i so it is
                # compared against its new neighbour
                end_i -= 1
    return df

In [11]:
# add adjusted start_pos and adjusted end_pos for df 
def adjust(df):
    """Add cumulative-offset columns so positions are unique across articles.

    df - a dataframe with ["article_number", "article_text_length",
         "start_pos", "end_pos"] cols, with rows of the same article adjacent

    Three columns are added to ``df`` (in place; ``df`` is also returned):
      cumulative         - summed text length of all articles seen so far,
                           including the current one
      start_pos_adjusted - start_pos plus the summed length of all
                           preceding articles
      end_pos_adjusted   - end_pos plus the same offset

    A new article begins whenever ``article_number`` differs from the row
    before. Rows are read by position, so any index is accepted, and an
    empty frame simply receives three empty columns.
    """
    # positional views: no reliance on the index being 0..n-1
    numbers = df["article_number"].to_numpy()
    lengths = df["article_text_length"].to_numpy()
    starts = df["start_pos"].to_numpy()
    ends = df["end_pos"].to_numpy()

    cumulative = []
    start_pos_adjusted = []
    end_pos_adjusted = []
    if len(numbers) > 0:
        # offset contributed by the articles before the current one
        pre_total = 0
        # running total including the current article
        total = lengths[0]
        cur_num = numbers[0]
        for number, length, start, end in zip(numbers, lengths, starts, ends):
            if number != cur_num:
                # first row of a new article: everything so far becomes the offset
                pre_total = total
                cur_num = number
                total = total + length
            cumulative.append(total)
            start_pos_adjusted.append(start + pre_total)
            end_pos_adjusted.append(end + pre_total)
    df["cumulative"] = cumulative
    df["start_pos_adjusted"] = start_pos_adjusted
    df["end_pos_adjusted"] = end_pos_adjusted
    return df

In [12]:
# take in a Triager csv and split it to 4 different csvs 
# based on topic_name 
def triager_split_help(input_directory, fileName, output_directory):
    """Split one Triager csv into one csv per topic_name.

    input_directory  - e.g. "Triager data"
    fileName         - e.g. "Covid_Form1.0.adjudicated-2020-10-04T2314-Tags.csv"
    output_directory - e.g. "Triager output" (must already exist)

    For every topic in the file the rows are reduced to the input columns,
    the contributor id is shortened to 6 characters, the topic is mapped to
    its integer code, positions are made cumulative across articles,
    overlapping rows are merged, and the result is written without a header
    to "<batch>-Triager-<topic>.csv", where <batch> is the part of
    ``fileName`` before the first underscore.

    Rows whose topic_name is missing are ignored.
    """
    source_path = os.path.join(input_directory, fileName)
    df = pd.read_csv(source_path)
    # the batch_name, e.g. "Covid"
    batch_name = re.split(r'_', fileName)[0]
    # a missing topic name compares unequal to everything, itself included,
    # so it would select zero rows below; leave it out
    topic_names = df["topic_name"].dropna().unique()
    for topic in topic_names:
        # select all rows for that specific topic name
        df_topic = df.loc[df["topic_name"] == topic]
        # select only the input columns (this yields a new frame)
        filtered = slice_input(df_topic)
        # keep only the first 6 chars of contributor_uuid
        filtered["contributor_uuid"] = filtered["contributor_uuid"].str[:6]
        # map topic_name to integers
        mapping(filtered, triager_dic)
        # empty placeholder column at position 3 (the position matters to
        # any code that addresses columns by index)
        filtered.insert(loc=3, column="blank", value="")
        # sort so that rows of one article / contributor are adjacent and ordered
        filtered = filtered.sort_values(by=["article_number", "contributor_uuid", "start_pos"])
        # re-index such that the index starts from 0 again
        filtered.reset_index(drop=True, inplace=True)
        # add the cumulative / adjusted position columns
        filtered = adjust(filtered)
        # merge overlapping rows
        filtered = resolve_overlapping(filtered)
        # keep only the output columns
        output = slice_output(filtered)
        # sort again by contributor
        output = output.sort_values(by=["contributor_uuid", "start_pos_adjusted"])
        # label the rows u0, u1, ... in their sorted order
        index = 'u' + pd.Series(filtered.index).astype(str)
        output.set_index(keys=index, inplace=True)
        out_name = '{0}-Triager-{1}.csv'.format(batch_name, topic)
        path = os.path.join(output_directory, out_name)
        # write to csv
        output.to_csv(path, header=False)
    return None

In [16]:
# debugging function - find potential overlapping issue 
def detect_overlapping(input_directory, fileName):
    """Debugging aid: report, per user, the first overlapping pair of spans.

    input_directory - e.g. "Triager output"
    fileName        - e.g. "Covid-Triager-Arguments.csv"

    The csv is read without a header. Column 1 is taken as the user, column 4
    as the start position and column 5 as the end position. For each user the
    rows are sorted by start position; the first row that does not start
    strictly after the previous row's end is printed, and the search moves on
    to the next user. Nothing is returned.
    """
    table = pd.read_csv(os.path.join(input_directory, fileName), header=None)
    for user in table.iloc[:, 1].unique():
        # this user's rows, ordered by start position
        spans = table.loc[table[1] == user].sort_values(by=4)
        starts = spans[4].to_numpy()
        ends = spans[5].to_numpy()
        for k in range(1, len(starts)):
            # anything that is not "starts strictly after the previous end"
            # counts as an overlap
            if not (starts[k] > ends[k - 1]):
                print('user {0}, start_pos {1}, end_pos {2} begin overlapping!'.format(user, starts[k], ends[k]))
                break
    return None