In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from tqdm import tqdm

In [2]:
PUNCATUATION = '''!\"#$%&\'()*+, -./:;<=>?@[\]^_`{|}~'''

In [3]:
def preProcess(sent: str):
    sent = sent.strip('\"')
    sent = word_tokenize(sent)
    for i in PUNCATUATION:
        if i in sent:
            sent = list(filter(i.__ne__, sent))
    if len(sent) == 0:
        sent.append(' ')
    return ' '.join(sent)

def compare_token(sent_a_list, sent_b_list):
    result_list = []
    for sent_a, sent_b in zip(sent_a_list, sent_b_list):
        compare_result = []
        a_tokens = sent_a.split(sep=' ')
        b_tokens = sent_b.split(sep=' ')
        a_idx = 0
        b_idx = 0
        while a_idx < len(a_tokens):
            if b_idx == len(b_tokens):
                a_idx += 1
                compare_result.append(0)
                continue
            if a_tokens[a_idx] != b_tokens[b_idx]:
                a_idx += 1
                compare_result.append(0)
            elif a_tokens[a_idx] == b_tokens[b_idx]:
                a_idx += 1
                b_idx += 1
                compare_result.append(1)
        assert len(a_tokens) == len(compare_result), "ERROR, Len of tokens not equal to len of result!"
        result_list.append(compare_result)
    return result_list
    

In [4]:
dataFrame = pd.read_csv('./Batch_answers - train_data (no-blank).csv')
dataFrame.drop(labels=['Unnamed: 6', 'total no.: 7987'], axis=1 ,inplace=True) # 將不必要的drop掉
dataFrame.rename({"q\'":"Q", "r\'":"R"}, axis=1, inplace=True)

In [5]:
dataFrame.sample(6) # 看頭六個row 來確認資料屬性

Unnamed: 0,id,q,r,s,Q,R
16163,4306,"""Lurch/Ungerdunn/Selasphorus -- If you are out...","""It is my understanding that the rules are if ...",DISAGREE,"""Just try not to get banned this time""","""It is my understanding that the rules are if ..."
23220,6158,"""Nice try , but no cigar . It 's been widely r...","""`` According to the Federal Bureau of Investi...",DISAGREE,"""widely reported rabid hatred of 'gun ownershi...","""nation 's overall crime rate dropped 6.4 % , ..."
3646,946,"""It 's established in university administratio...","""Really ? So , university presidents and trust...",DISAGREE,"""It 's established in university administratio...","""I would like see your evidence of university ..."
3360,876,"""You said that the colour changes are caused b...","""And you do n't have any picture . You have no...",DISAGREE,"""I call Â variation within a kindÂ""","""hierarchy of dominence"""
20612,5411,"""A firearm in mearley an inanimate piece of st...","""Wrong . It is a precision machine designed to...",DISAGREE,"""firearm in mearley an inanimate piece of stee...","""Wrong . It is a precision machine designed to..."
14346,3779,"""You do realize that the rules are supposed to...","""yes and my post was proving that""",AGREE,"""You do realize that the rules are supposed to...","""yes and my post was proving that"""


In [6]:
print("Preprocessing q ...")
dataFrame['q'] = dataFrame['q'].map(preProcess)
print("Preprocessing r ...")
dataFrame['r'] = dataFrame['r'].map(preProcess)
print("Preprocessing Q ...")
dataFrame['Q'] = dataFrame['Q'].map(preProcess)
print("Preprocessing R ...")
dataFrame['R'] = dataFrame['R'].map(preProcess)

Preprocessing q ...
Preprocessing r ...
Preprocessing Q ...
Preprocessing R ...


In [7]:
uniqueDatafrme = []
for id in tqdm(dataFrame['id'].unique()):
    q = dataFrame.query("@id == id")
    uniqueDatafrme.append(q.iloc[0])
uniqueDatafrme = pd.DataFrame(uniqueDatafrme)

100%|██████████| 7987/7987 [00:04<00:00, 1696.90it/s]


In [8]:
uniqueDatafrme.iloc[0]

id                                                    8
q     It can go both ways We all doubt It is what yo...
r                                                  True
s                                                 AGREE
Q     It can go both ways We all doubt It is what yo...
R                                                  True
Name: 0, dtype: object

In [9]:
uniqueDatafrme['com_q'] = compare_token(uniqueDatafrme['q'], uniqueDatafrme['Q'])
uniqueDatafrme['com_r'] = compare_token(uniqueDatafrme['r'], uniqueDatafrme['R'])

In [11]:
uniqueDatafrme['s'].value_counts()

DISAGREE    6542
AGREE       1445
Name: s, dtype: int64

In [12]:
uniqueDatafrme.to_csv('LongDocumentPreprocessData.csv', index=False)