In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from tqdm import tqdm

In [2]:
# Single characters treated as punctuation by preProcess: the ASCII punctuation
# set plus a space. The backslash is written as an explicit `\\` so the literal
# contains no invalid escape sequence (the old `\]` relied on Python passing
# unknown escapes through, which emits a warning on recent versions).
PUNCATUATION = '!"#$%&\'()*+, -./:;<=>?@[\\]^_`{|}~'
# Maximum number of whitespace-separated tokens allowed in q or r; rows above
# this limit are dropped further down in the notebook.
MAX_LEN = 500

In [3]:
def preProcess(sent: str):
    """Tokenize a sentence and remove tokens that are a single punctuation mark.

    Leading and trailing double quotes are stripped, the text is tokenized
    with NLTK's ``word_tokenize``, and every token that is exactly one of the
    characters in ``PUNCATUATION`` is discarded. Multi-character tokens are
    kept even if they consist of punctuation.

    Args:
        sent: the raw sentence.

    Returns:
        The surviving tokens joined by single spaces, or a single space
        ``' '`` when no token survives.
    """
    # A set of the individual characters: membership means "token equals one mark".
    single_marks = set(PUNCATUATION)
    kept = [tok for tok in word_tokenize(sent.strip('"')) if tok not in single_marks]
    if not kept:
        # Nothing left after filtering: return the one-space placeholder.
        return ' '
    return ' '.join(kept)

def compare_token(sent_a_list, sent_b_list):
    """Build, for every sentence pair, a 0/1 mask over the tokens of sentence A.

    Both sentences are split on single spaces. The tokens of A are scanned
    left to right while a cursor walks through the tokens of B: a token of A
    that equals the token under the cursor is flagged 1 and the cursor moves
    on; any other token is flagged 0. Once B is exhausted, all remaining
    tokens of A are flagged 0. Matching is greedy (the first equal token
    wins); it is not a longest-common-subsequence alignment.

    Args:
        sent_a_list: iterable of space-joined sentences to be labelled.
        sent_b_list: iterable of space-joined sentences, paired with
            ``sent_a_list`` by position (extra items on either side are
            ignored, as with ``zip``).

    Returns:
        A list with one list of ints (0 or 1) per pair, each as long as the
        number of tokens in the corresponding sentence A.
    """
    def _match_mask(full_sentence, partial_sentence):
        # Mask for one pair; `cursor` is the next unmatched position in B.
        full_tokens = full_sentence.split(' ')
        partial_tokens = partial_sentence.split(' ')
        cursor = 0
        mask = []
        for token in full_tokens:
            hit = cursor < len(partial_tokens) and token == partial_tokens[cursor]
            if hit:
                cursor += 1
            mask.append(int(hit))
        assert len(full_tokens) == len(mask), "ERROR, Len of tokens not equal to len of result!"
        return mask

    return [_match_mask(a, b) for a, b in zip(sent_a_list, sent_b_list)]
    

In [4]:
# NOTE(review): presumably a pretrained-model identifier used by later code;
# it is not referenced in the cells visible here — confirm.
BERT_BASE_CASED = "bert-base-cased"

In [5]:
# Load the raw CSV, drop the two columns that are not needed, and rename the
# q' / r' columns to Q / R.
dataFrame = (
    pd.read_csv('./Batch_answers - train_data (no-blank).csv')
    .drop(columns=['Unnamed: 6', 'total no.: 7987'])
    .rename(columns={"q'": "Q", "r'": "R"})
)

In [6]:
dataFrame.head(6) # look at the first six rows to check what the data looks like

Unnamed: 0,id,q,r,s,Q,R
0,8,"""It can go both ways . We all doubt . It is wh...","""True .""",AGREE,"""It can go both ways . We all doubt . It is wh...","""True ."""
1,8,"""It can go both ways . We all doubt . It is wh...","""True .""",AGREE,"""can go both ways . We all doubt . It is what ...","""True"""
2,8,"""It can go both ways . We all doubt . It is wh...","""True .""",AGREE,"""It can go both ways . We all doubt . It is wh...","""True"""
3,9,"""once again , you seem to support the killing ...","""based on the idea that people are dispensible...",AGREE,"""seem to support the killing of certain people""","""based on the idea that people are dispensible..."
4,9,"""once again , you seem to support the killing ...","""based on the idea that people are dispensible...",AGREE,"""you seem to support the killing of certain pe...","""based on the idea that people are dispensible"""
5,9,"""once again , you seem to support the killing ...","""based on the idea that people are dispensible...",AGREE,"""you seem to support the killing of certain pe...","""based on the idea that people are dispensible"""


In [7]:
# Run preProcess over each of the four text columns, announcing each one first.
for column_name in ('q', 'r', 'Q', 'R'):
    print(f"Preprocessing {column_name} ...")
    dataFrame[column_name] = dataFrame[column_name].map(preProcess)

Preprocessing q ...
Preprocessing r ...
Preprocessing Q ...
Preprocessing R ...


In [9]:
# Spot-check five random rows after preprocessing. The fixed random_state makes
# the displayed rows identical on every Restart & Run All.
dataFrame.sample(5, random_state=0)

Unnamed: 0,id,q,r,s,Q,R
3577,930,5 Show me one instance of an evolutionist any ...,Apart from the 'horizontally instead of vertic...,DISAGREE,Show me one instance of an evolutionist any bo...,We may one day encounter evidence that complet...
3494,909,sinjin No comment,Good this is n't the time for either side to b...,AGREE,sinjin No comment,Good this is n't the time
1874,507,one other point and i brought this up lots of ...,I think the christians you describe do not dis...,AGREE,that is those that disregard the genesis story...,I think the christians you describe do not dis...
29895,7830,Gun sales are booming nationwide Â— and in Eri...,Unfortunately Democrats have a credibility pro...,AGREE,Gun sales are booming nationwide Â— and in Eri...,However Ms. Feinstein 's comments about waitin...
10020,2704,I still find it ironic that so many people act...,Well one has to use something to make the Cons...,AGREE,I still find it ironic that so many people act...,one has to use something to make the Constitut...


In [10]:
# Keep only the first row encountered for each id; rows keep their original
# index labels. drop_duplicates does this in one pass instead of running a
# full-frame query per unique id, and avoids shadowing the builtin `id`.
# .copy() gives an independent frame so the column assignments and in-place
# drops in later cells do not act on a slice of dataFrame.
uniqueDatafrme = dataFrame.drop_duplicates(subset='id', keep='first').copy()

In [13]:
# Inspect the first row of the one-row-per-id frame.
uniqueDatafrme.iloc[0]

id                                                    8
q     It can go both ways We all doubt It is what yo...
r                                                  True
s                                                 AGREE
Q     It can go both ways We all doubt It is what yo...
R                                                  True
Name: 0, dtype: object

In [14]:
# For each (full text, extracted text) column pair, store the 0/1 token mask
# produced by compare_token in a new column.
for full_col, extract_col, mask_col in (('q', 'Q', 'com_q'), ('r', 'R', 'com_r')):
    uniqueDatafrme[mask_col] = compare_token(uniqueDatafrme[full_col], uniqueDatafrme[extract_col])

In [15]:
# Count how many rows carry each distinct value of the 's' column.
uniqueDatafrme['s'].value_counts()

DISAGREE    6542
AGREE       1445
Name: s, dtype: int64

In [16]:
# Number of whitespace-separated tokens in q and in r, stored as helper columns.
for text_col, len_col in (('q', 'q_len'), ('r', 'r_len')):
    uniqueDatafrme[len_col] = uniqueDatafrme[text_col].map(str.split).map(len)

In [17]:
# Show the rows whose q has more than MAX_LEN tokens.
uniqueDatafrme.query('q_len > @MAX_LEN')

Unnamed: 0,id,q,r,s,Q,R,com_q,com_r,q_len,r_len
295,112,As for who plays 'word-games it seems that you...,No I for one dislike your posts because I know...,DISAGREE,Mach is happy to admit he does n't read my pos...,No I for one dislike your posts because I know...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",508,144
518,152,This post contains a perfectly simple explanat...,So what this guy is saying is that he believes...,DISAGREE,This premise says that the fundamental unit of...,guy is saying is that he believes in natural s...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",718,16
2035,546,O.k lets go through your scenario The governme...,So in your opinion are you saying that this in...,DISAGREE,But each of the 400 million people has a gun w...,citizens `` would be a major factor in decidin...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1591,140
2906,768,Gun Lobby Promoted `` Final War `` Against Fed...,Oh boy is this what the Anti-gun crowd has com...,DISAGREE,federal Bureau of Alcohol Tobacco and Firearms...,I think you have just lost about all credibili...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",627,29
4001,1058,Washington CNN -- Nearly 180 Department of Hom...,Well that 's a real holy XXXX moment right the...,DISAGREE,The officers with Customs and Border Patrol an...,need to shut the hell up and look at the polic...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",639,51
...,...,...,...,...,...,...,...,...,...,...
32784,8598,SCHUMER OK. Now let me ask you this When you w...,Schumer is a bonafide XXXXX as are many in the...,DISAGREE,Well what you have pointed out is exactly why ...,He failingly tried to draw Alito into exposing...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...",751,64
33439,8765,So you think there is a God Bad design does no...,You are assuming that the TOE of evolution has...,DISAGREE,you think there is a God Bad design does not e...,You are assuming that the TOE of evolution has...,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",936,55
33748,8846,President Obama poses a real and present dange...,Anyone who votes for Kagan might as well tende...,DISAGREE,`` The most recent pick Elena Kagan ran much o...,Anyone who votes for Kagan might as well tende...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",571,11
37958,9928,GEOLOGY The Calibration of Ediacaran Time Alan...,So any creationist want to tell us why these p...,DISAGREE,Calibration of Ediacaran evolutionary progress...,any creationist want to tell us why radiometri...,"[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",1229,97


In [18]:
# Show the rows whose r has more than MAX_LEN tokens.
uniqueDatafrme.query('r_len > @MAX_LEN')

Unnamed: 0,id,q,r,s,Q,R,com_q,com_r,q_len,r_len
1938,525,Not sure what you mean Natural selection is on...,I think that Believer may be referring to micr...,DISAGREE,Natural selection is one of several mechanisms...,natural selection does not add information to ...,"[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",66,632
2650,702,Hello everyone I am a high school student seek...,Well to keep it short IÂ ’ ll cite you to what...,AGREE,Can you all just tell me if you are for guns o...,And I can not see why arms should be denied to...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",51,1215
3097,812,I think Maximus owes us an apology http //www....,Well it is interesting to not that two of the ...,AGREE,I think Maximus owes us an apology http //www....,A French study in the April 2005 British Journ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,507
3810,999,If you personally choose to live your life in ...,It 's amazing how little you godless heathens ...,DISAGREE,bible Religious influence in government is hig...,godless heathens,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",36,1473
4168,1098,Let 's not deal with assumptions Monty althoug...,Oh WJA what a neat little trick The claim had ...,DISAGREE,Cite precisely where Dembski has published an ...,I said Dembski had come up with a filter The a...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",52,779
6202,1643,How likely is that Kansas will succeed and als...,YEC really si n't an issue here Here is a link...,DISAGREE,How likely is that Kansas will succeed and als...,A large part of th theory is conjecture Teach ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20,837
6286,1658,Thanks for your input,Start with the largest most powerful calibur y...,AGREE,Thanks for your input,cost increase with larger calibur,"[1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,638
6844,1826,BS Where is the raw data ... The link you show...,There called footnotes The GOA is peppered wit...,DISAGREE,Where is the raw data ...,Most people ca n't be trusted so we should hav...,"[0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",40,1256
6910,1841,One of the quotes in your signature is- `` The...,Lacking substance as usual .... Whether Jeffer...,DISAGREE,One of the quotes in your signature is- `` The...,Lacking substance as usual .... Whether Jeffer...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",54,1264
8956,2370,I 'm serious I want a legitimate debate once a...,You get one post Homosexual sin No harm A numb...,DISAGREE,Why is homosexuality wrong and how does it aff...,Homosexual sin No harm A number of reasons Fir...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",122,826


In [19]:
# Remove, in place, every row where q or r is longer than MAX_LEN tokens.
within_limit = uniqueDatafrme['q_len'].le(MAX_LEN) & uniqueDatafrme['r_len'].le(MAX_LEN)
rows_too_long = uniqueDatafrme.index[~within_limit]
uniqueDatafrme.drop(index=rows_too_long, inplace=True)

In [20]:
# The length helper columns are only needed for filtering; remove them in place.
uniqueDatafrme.drop(columns=['q_len', 'r_len'], inplace=True)

In [21]:
# Display the frame as it stands after the length filter and helper-column removal.
uniqueDatafrme

Unnamed: 0,id,q,r,s,Q,R,com_q,com_r
0,8,It can go both ways We all doubt It is what yo...,True,AGREE,It can go both ways We all doubt It is what yo...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",[1]
3,9,once again you seem to support the killing of ...,based on the idea that people are dispensible ...,AGREE,seem to support the killing of certain people,based on the idea that people are dispensible ...,"[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
7,10,I personly would not condone an abortion howev...,This is a pretty touchy issue and I agree with...,AGREE,not condone an abortion however i would n't co...,I agree with the different points of view .. m...,"[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
11,11,First there is no `` us `` on your part regard...,Ah I see Your reasons are secret reasons On a ...,DISAGREE,no one owes you anything,find that appalling,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15,12,alright how did evolution start,See above,DISAGREE,how evolution start,above,"[0, 1, 0, 1, 1]","[0, 1]"
...,...,...,...,...,...,...,...,...
38336,9999,The ID movements form of ID states that there ...,That of course is the logical fallacy known as...,DISAGREE,Behe is happy to think that it is the christia...,But our ignornace is not evidence of an intell...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
38338,10000,For me it would therefore have made no differe...,It logically follows from the moral foundation...,AGREE,it would therefore have made no difference if ...,It logically follows from the moral foundation...,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
38340,10001,good thing this argument has never been done ....,And teen sex does n't by the very nature of it...,DISAGREE,good thing this argument has never been done ....,And teen sex does n't by the very nature of it...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
38342,10002,I know one thing anything that happens politic...,Was n't sinjin crowing about his plans to take...,DISAGREE,I know one thing anything that happens politic...,Was n't sinjin crowing about his plans to take...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."


In [22]:
# Write the processed frame to PreprocessData2.csv without the index column.
# NOTE(review): com_q / com_r hold Python lists; CSV presumably stores them as
# their string form, so a reader would have to parse them back — confirm.
uniqueDatafrme.to_csv('PreprocessData2.csv', index=False)