# PE Validation Fall 20

In [1]:
# import necessary modules 
# please run this cell every time you open the notebook
import pandas as pd 
import numpy as np 
import matplotlib as plt 
import seaborn as sns
import re
import os

Task1:

In [2]:
# figure out a way to read in schema txt files (inside the schema folder)
# for example, I have a source file (such as Source_relevance.csv) and 
# I know the question_label, but I want to build a function to know 
# whether this question (Q1/Q2..) is multiple-choice or checklist
# build a function to accomplish this
# func(schemafile(...txt), input(..csv), question_label(Q1/Q2...)) -> checklist/multiple choice

Task2: 

In [3]:
# not only do I want to know whether it's multiple-choice or checklist 
# I also want to know how many different answer choices are there
# build a function to accomplish this 
# func(schemafile(...txt), input(..csv), question_label(Q1/Q2...)) -> # of different answer choices

Task3:

In [4]:
# figure out how to uniquely identify each submission, 
# since there might be one contributor who made 
# several submissions. 
# can't use contributor_uuid. What to use? 

Task4: 

In [5]:
# one-hot-encode checklist questions to [0, 1]
# for example, Q4 in source relevance task is a checklist question
# if someone selects both A1 and A2, then it's [1, 1]
# return a dataframe that has each question_label but list out the answers in [] format. 

## Experimentation ground below: 

## Charlie

## Katherine

## Yewen

### Questions: 

In [6]:
# # read json lines file 
# data = pd.read_json('2020 data/2020-09-25_pe_webhooks.jsonl', lines=True)
# data.head()

In [7]:
# source.columns

In [8]:
# source[source["contributor_uuid"] == "00f548b7-6b63-4b47-828e-8e416b6ca0e2"][["contributor_uuid", "created", "finish_time", "elapsed_seconds", "question_label", "answer_label", "answer_uuid"]]

## Triager to uAlpha

In [9]:
# Integer code assigned to each topic_name: topics are numbered from 1
# in the order they are listed here.
dic = {
    topic: code
    for code, topic in enumerate(
        [
            "Quoted Sources",
            "Arguments",
            "Assertions",
            "Needs Fact-Check",
            "Language",
            "Reasoning",
            "Probability",
            "Evidence",
        ],
        start=1,
    )
}

    
# translate the topic_name column through a lookup table
def mapping(df, dic):
    """Replace ``df["topic_name"]`` with its looked-up values, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``topic_name`` column. It is modified in place.
    dic : dict
        Lookup table from topic name to replacement value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in. Values of ``topic_name``
        that are not keys of ``dic`` become NaN (``Series.map`` semantics).
    """
    translated = df["topic_name"].map(dic)
    df["topic_name"] = translated
    return df


# keep only the columns needed as input, in a fixed order
def slice_input(df):
    """Return a new frame restricted to the input columns.

    The result always has exactly these columns, in this order:
    ``contributor_uuid``, ``topic_name``, ``start_pos``, ``end_pos``,
    ``article_text_length``, ``article_number``, ``created``.
    Because ``reindex`` is used, a column missing from ``df`` shows up
    filled with NaN instead of raising. ``df`` itself is not modified.
    """
    wanted = (
        "contributor_uuid",
        "topic_name",
        "start_pos",
        "end_pos",
        "article_text_length",
        "article_number",
        "created",
    )
    return df.reindex(columns=list(wanted))


# keep only the columns written to the output file, in a fixed order
def slice_output(df):
    """Return a new frame restricted to the output columns.

    The result always has exactly these columns, in this order:
    ``contributor_uuid``, ``topic_name``, ``blank``,
    ``start_pos_adjusted``, ``end_pos_adjusted``.
    Because ``reindex`` is used, a column missing from ``df`` shows up
    filled with NaN instead of raising. ``df`` itself is not modified.
    """
    wanted = (
        "contributor_uuid",
        "topic_name",
        "blank",
        "start_pos_adjusted",
        "end_pos_adjusted",
    )
    return df.reindex(columns=list(wanted))
    


# read in a directory that contains Triager csvs,
# split each Triager csv into one csv per topic_name (argument, reasoning, etc)
# and write those csvs to the output directory
def triager_split(input_directory, output_directory):
    """Run ``triager_split_help`` on every CSV file found in ``input_directory``.

    Parameters
    ----------
    input_directory : str
        Directory holding the Triager CSVs, e.g. "Triager data".
    output_directory : str
        Directory that receives the per-topic CSVs, e.g. "Triager output".
        It is created if it does not exist yet.

    Returns
    -------
    None
        A completion message is printed once all files are processed.
    """
    # Create the destination up front so the first write cannot fail on a
    # missing folder. An empty string means "current directory": nothing to create.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(input_directory)):
        source = os.path.join(input_directory, file)
        # Skip sub-directories (e.g. .ipynb_checkpoints) and non-CSV entries
        # (e.g. .DS_Store) instead of handing them to the CSV reader.
        if not os.path.isfile(source) or not file.lower().endswith(".csv"):
            continue
        triager_split_help(input_directory, file, output_directory)
    print("Triager Tranformation Done! Ready to be imported to uAlpha!")

In [10]:
# resolve overlapping issues by merging overlapping rows
def resolve_overlapping(df):
    """Merge rows whose adjusted spans overlap; modifies ``df`` in place.

    For every distinct value of ``created`` (in order of first appearance)
    the rows from the first to the last occurrence of that value are walked.
    When the next row's ``start_pos_adjusted`` is not greater than the current
    row's ``end_pos_adjusted`` (touching spans count as overlapping), the
    current row's end is extended to the larger of the two ends and the next
    row is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``created``, ``start_pos_adjusted`` and
        ``end_pos_adjusted``. Rows must already be sorted by start position
        within each group; the function does not sort.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with merged rows removed and a fresh
        0..n-1 index.

    Notes
    -----
    NOTE(review): every row lying between the first and last occurrence of a
    ``created`` value is walked, even if it carries a different ``created``
    value. Confirm that rows of one submission are always contiguous in the
    caller's sort order.
    """
    # Locate the span columns by name. Fall back to the historical fixed
    # positions (9 and 10) only if the names are absent.
    columns = list(df.columns)
    if "start_pos_adjusted" in columns:
        start_col = columns.index("start_pos_adjusted")
    else:
        start_col = 9
    if "end_pos_adjusted" in columns:
        end_col = columns.index("end_pos_adjusted")
    else:
        end_col = 10

    # The loop mixes positional access (.iat) with label-based drop(), which
    # is only consistent when labels equal positions. No-op for a 0..n-1 index.
    df.reset_index(drop=True, inplace=True)

    for time in df["created"].unique():
        # Rows currently carrying this timestamp (labels == positions here).
        rows = df.index[(df["created"] == time).to_numpy()]
        if len(rows) == 0:
            # Every row of this timestamp was already merged away while
            # walking an earlier timestamp's span; nothing left to do.
            continue
        i = rows[0]
        end_i = rows[-1]
        while i < end_i:
            if df.iat[i + 1, start_col] > df.iat[i, end_col]:
                # next span starts after this one ends: no overlap
                i += 1
                continue
            # Overlap: keep the larger end on the current row ...
            if df.iat[i, end_col] < df.iat[i + 1, end_col]:
                df.iat[i, end_col] = df.iat[i + 1, end_col]
            # ... drop the absorbed row and renumber so labels stay positional.
            df.drop(i + 1, inplace=True)
            df.reset_index(drop=True, inplace=True)
            # the span shrank by one row; i is NOT advanced so the merged row
            # is compared against its new neighbour
            end_i -= 1
    return df

In [11]:
# add adjusted start_pos and adjusted end_pos for df
def adjust(df):
    """Add ``cumulative``, ``start_pos_adjusted`` and ``end_pos_adjusted`` to ``df``.

    Rows are processed in their current order. A running total of
    ``article_text_length`` starts at the first row's length. Each time
    ``article_number`` differs from the previous row's, the total so far
    becomes the offset for the following rows and the new row's length is
    added to the total. For every row:

    * ``cumulative`` is the running total after processing that row,
    * ``start_pos_adjusted`` = ``start_pos`` + offset,
    * ``end_pos_adjusted`` = ``end_pos`` + offset.

    Rows of the same article are expected to be contiguous (the caller
    sorts by ``article_number`` first).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``article_number``, ``article_text_length``,
        ``start_pos`` and ``end_pos``. Modified in place. Any index is
        accepted, and an empty frame just gains three empty columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added.
    """
    # Work on positional arrays so the result does not depend on the frame's
    # index labels (label lookups like series[0] fail on a non 0..n-1 index).
    article_numbers = df["article_number"].to_numpy()
    text_lengths = df["article_text_length"].to_numpy()
    starts = df["start_pos"].to_numpy()
    ends = df["end_pos"].to_numpy()

    cumulative = []
    start_pos_adjusted = []
    end_pos_adjusted = []

    # offset contributed by all articles before the current one
    pre_total = 0
    # running total of article lengths, including the current article
    total = 0
    cur_num = None
    if len(article_numbers) > 0:
        total = text_lengths[0]
        cur_num = article_numbers[0]

    for number, length, start, end in zip(article_numbers, text_lengths, starts, ends):
        if number != cur_num:
            # a new article begins: everything seen so far becomes the offset
            pre_total = total
            cur_num = number
            total = total + length
        cumulative.append(total)
        start_pos_adjusted.append(start + pre_total)
        end_pos_adjusted.append(end + pre_total)

    df["cumulative"] = cumulative
    df["start_pos_adjusted"] = start_pos_adjusted
    df["end_pos_adjusted"] = end_pos_adjusted
    return df

In [12]:
# take in one Triager csv and write one csv per distinct topic_name
def triager_split_help(input_directory, fileName, output_directory):
    """Split one Triager CSV into per-topic CSVs.

    Parameters
    ----------
    input_directory : str
        Folder containing the file, e.g. "Triager data".
    fileName : str
        e.g. "Covid_Form1.0.adjudicated-2020-10-04T2314-Tags.csv". The text
        before the first underscore is used as the batch name.
    output_directory : str
        Folder receiving the results, e.g. "Triager output".

    For every distinct ``topic_name`` in the file, one CSV named
    ``<batch>-Triager-<topic>.csv`` is written without a header row. Its
    first column holds row labels ``u0``, ``u1``, ...; the remaining columns
    are those selected by ``slice_output``.

    Returns
    -------
    None
    """
    source_path = os.path.join(input_directory, fileName)
    tags = pd.read_csv(source_path)
    # batch name, e.g. "Covid" -- identical for every topic of this file
    batch_name = re.split(r'_', fileName)[0]

    for topic in tags["topic_name"].unique():
        # rows of this topic, restricted to the input columns
        rows = slice_input(tags.loc[tags["topic_name"] == topic])
        # shorten contributor ids to their first 6 characters
        rows["contributor_uuid"] = rows["contributor_uuid"].str[:6]
        # topic names -> integer codes (in place)
        mapping(rows, dic)
        # empty placeholder column at position 3
        rows.insert(loc=3, column="blank", value="")
        # order rows, then renumber from 0 so positional logic downstream works
        rows = rows.sort_values(by=["article_number", "contributor_uuid", "start_pos"])
        rows.reset_index(drop=True, inplace=True)
        # add the adjusted positions, then merge overlapping spans
        rows = resolve_overlapping(adjust(rows))

        # output columns only, ordered by contributor and adjusted start
        output = slice_output(rows).sort_values(by=["contributor_uuid", "start_pos_adjusted"])
        # label the rows u0, u1, ... in their final (sorted) order
        row_labels = 'u' + pd.Series(rows.index).astype(str)
        output.set_index(keys=row_labels, inplace=True)

        target_name = '{0}-Triager-{1}.csv'.format(batch_name, topic)
        output.to_csv(os.path.join(output_directory, target_name), header=False)
    return None

In [13]:
# Convert every Triager file found in "Triager data" and write the
# per-topic results into "Triager output" (paths relative to the notebook).
triager_split("Triager data", "Triager output")

Triager Tranformation Done! Ready to be imported to uAlpha!


**Example of resolving overlap**

In [14]:
# Load the worked example and discard its first column before displaying it
# (presumably an index column stored in the file -- TODO confirm).
a = pd.read_csv("example.csv")
a = a.drop(columns=[a.columns[0]])
a

Unnamed: 0,contributor_uuid,topic_name,start_pos,blank,end_pos,article_text_length,article_number,created,cumulative,start_pos_adjusted,end_pos_adjusted
0,aac18e,2,136,,287,1870,100054,2020-03-18 00:15:42.364545,5755,4021,4172
1,aac18e,2,563,,776,1870,100054,2020-03-18 00:15:42.364545,5755,4448,4661
2,aac18e,2,651,,667,1870,100054,2020-03-18 00:15:42.364545,5755,4536,4552


In [15]:
# Merge the overlapping rows of the example frame and show the result.
# resolve_overlapping modifies the frame in place and returns that same object.
a = resolve_overlapping(a)
a

Unnamed: 0,contributor_uuid,topic_name,start_pos,blank,end_pos,article_text_length,article_number,created,cumulative,start_pos_adjusted,end_pos_adjusted
0,aac18e,2,136,,287,1870,100054,2020-03-18 00:15:42.364545,5755,4021,4172
1,aac18e,2,563,,776,1870,100054,2020-03-18 00:15:42.364545,5755,4448,4661


In [16]:
# debugging helper - report users whose spans still overlap in an output csv
def detect_overlapping(input_directory, fileName):
    """Print the first overlapping span of every user in a header-less CSV.

    Parameters
    ----------
    input_directory : str
        Folder containing the file, e.g. "Triager output".
    fileName : str
        e.g. "Covid-Triager-Arguments.csv".

    The file is read without a header, so columns are addressed by position:
    column 1 identifies the user, column 4 is the span start and column 5
    the span end. For each user the rows are ordered by start; the first row
    whose start is not greater than the previous row's end is reported on
    stdout, then the next user is examined.

    Returns
    -------
    None
    """
    table = pd.read_csv(os.path.join(input_directory, fileName), header=None)
    for user in table.iloc[:, 1].unique():
        # this user's rows, ordered by span start
        rows = table.loc[table[1] == user].sort_values(by=4)
        starts = rows[4].to_numpy()
        ends = rows[5].to_numpy()
        for pos in range(1, len(starts)):
            # written as "not >" on purpose: keeps the original outcome for NaN
            if not starts[pos] > ends[pos - 1]:
                print('user {0}, start_pos {1}, end_pos {2} begin overlapping!'.format(user, starts[pos], ends[pos]))
                break
    return None

In [17]:
# Check one generated file for leftover overlaps; a line is printed only for
# users that still have overlapping spans, so no output means none were found.
detect_overlapping("Triager output", "Covid-Triager-Arguments.csv")

**Example of removing a row**

In [18]:
# Small demo frame whose index labels (23..27) deliberately do not start at 0.
a = pd.DataFrame({"first": [1, 2, 3, 4, 5]}, index=np.arange(23, 28))
a

Unnamed: 0,first
23,1
24,2
25,3
26,4
27,5


In [19]:
# drop() removes rows by index LABEL, not by position: this deletes the row
# labelled 25 and leaves the remaining labels (23, 24, 26, 27) unchanged.
# Note: re-running this cell fails because label 25 no longer exists.
a.drop(25, inplace=True)
a

Unnamed: 0,first
23,1
24,2
26,4
27,5
