# AI CUP 2022: Argument Detection (Preprocessing)
Meng-Chieh, Liu  
2022/11/28

## Setup

In [None]:
# Mount Google Drive (Colab only) so the /content/drive/Shareddrives paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import pickle
from tqdm import tqdm

In [None]:
# Data version tag; it is interpolated into the data_{version} folder of every Save/Load cell below.
version = 'v8'

## load and filter csv
* For every id, we keep the annotation whose q' + r' is longest for training, because our basic unit is the sentence.

(remember to revise data path)


In [None]:
# load csv
train_path = "/content/drive/Shareddrives/AI_CUP_NLP/Batch_answers - train_data (no-blank).csv"
# Keep the first five columns after `id` and strip leading/trailing double-quote
# characters from every value. Column-wise `.str.strip` is used instead of
# `DataFrame.applymap`, which is deprecated since pandas 2.1.
train_data = (
    pd.read_csv(train_path, encoding="utf-8", index_col='id')
    .iloc[:, :5]
    .apply(lambda column: column.str.strip('"'))
    .reset_index()
)

In [None]:
# Character length of the concatenated argument spans q' + r' of every row.
train_data["length"] = (train_data["q'"] + train_data["r'"]).str.len()

In [None]:
# target sample size: number of distinct ids in the raw annotations
train_data["id"].nunique()

7987

In [None]:
# For every id keep only the row(s) whose q'+r' length equals that id's maximum.
# The aggregation is named by the string 'max': passing the builtin `max` to
# transform is deprecated in recent pandas releases.
longest_per_id = train_data.groupby(by=train_data.id)['length'].transform('max')
idx = longest_per_id == train_data['length']
small_train_data = train_data[idx].set_index("id")
# Ties on length can leave several rows for one id; collapse them to one row per id.
small_train_data = small_train_data.groupby(by=small_train_data.index).first()

In [None]:
# check sample size: should equal the number of distinct ids counted above
len(small_train_data)

7987

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
# Single-character tokens found in this string are dropped by LCS().
# Raw string: the backslash before `]` is a literal character, not an (invalid)
# escape sequence, so the value is unchanged but no syntax warning is emitted.
punctuations = r'''!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~'''
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def word_count(x):
  """Count the NLTK word tokens of `x` after deleting every ASCII punctuation character."""
  without_punctuation = x.translate(str.maketrans('', '', string.punctuation))
  return len(word_tokenize(without_punctuation))

In [None]:
# Word counts of the full q and r texts, stored as q_count / r_count.
for text_column in ('q', 'r'):
  small_train_data[f'{text_column}_count'] = small_train_data[text_column].map(word_count)

In [None]:
# Save: checkpoint the one-row-per-id training frame (including q_count / r_count)
# into the folder of the current data version.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/small_train_data.pickle', 'wb') as f:
    pickle.dump(small_train_data, f)

In [None]:
# Load: restore the checkpoint so a fresh kernel can resume from this point.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/small_train_data.pickle', 'rb') as f:
    small_train_data = pickle.load(f)

## Regex and Normalized
* We found that there are some special tokens in the texts, such as HTML entities like "&amp;", so we remove them with regex.
* Normalized length features

(remember to revise path)

In [None]:
import re
from sklearn.preprocessing import StandardScaler

In [None]:
# Space-separated HTML entities such as "& amp ; " or "& # 8217 ; ".
# The entity may be followed by a space or sit at the very end of the text.
_HTML_ENTITY = re.compile(r"& #? ?[a-zA-Z\d]{2,8} ;(?: |$)")
# Doubled dash tokens followed by a space.
_DOUBLE_DASH = re.compile(r"-- -- ")


def regex_remove(text):
  """Remove tokenised HTML entities and doubled dashes from `text`.

  Args:
    text: the raw string to clean.

  Returns:
    `text` with every match of the two patterns deleted. An entity is matched
    as "& ", an optional "#", an optional space, 2-8 letters or digits and
    " ;", followed by a space or the end of the text.
  """
  text = _HTML_ENTITY.sub('', text)
  text = _DOUBLE_DASH.sub('', text)
  return text

In [None]:
def length_scaler(df, fitted_scaler=None):
  """Replace the `q_length` / `r_length` columns with their scaled values.

  Args:
    df: DataFrame containing `q_length` and `r_length` columns.
    fitted_scaler: object whose `transform` accepts those two columns and
      returns a 2-D array with one column per input column. When omitted, the
      notebook-level variable `scaler` is used, as before.

  Returns:
    A copy of `df` with both length columns overwritten; `df` itself is left
    untouched so the caller's frame is not modified behind its back.

  Raises:
    NameError: if `fitted_scaler` is omitted and no global `scaler` exists.
  """
  if fitted_scaler is None:
    try:
      fitted_scaler = scaler  # notebook-level scaler, defined in a later cell
    except NameError:
      raise NameError(
          "length_scaler: no `fitted_scaler` given and no global `scaler` is defined"
      ) from None

  length_feature = fitted_scaler.transform(df[['q_length', 'r_length']])
  scaled_df = df.copy()
  scaled_df['q_length'] = length_feature[:, 0]
  scaled_df['r_length'] = length_feature[:, 1]
  return scaled_df

In [None]:
# Clean q and r on a copy, so `small_train_data` keeps the raw texts.
regex_data = small_train_data.copy()
for text_column in ('q', 'r'):
  regex_data[text_column] = regex_data[text_column].map(regex_remove)

In [None]:
# Character lengths of the cleaned q / r texts.
regex_data['q_length'] = regex_data['q'].map(len)
regex_data['r_length'] = regex_data['r'].map(len)
# Encode the stance as 1 for "AGREE" and 0 for anything else. An already
# encoded 1 is accepted as well, so re-running this cell keeps the labels
# instead of turning every one of them into 0.
regex_data['s'] = regex_data['s'].map(lambda x: 1 if x in ("AGREE", 1) else 0)

In [None]:
# Fit the standardiser on the two length columns of the training data.
scaler = StandardScaler().fit(regex_data[['q_length', 'r_length']])

In [None]:
# Save: persist the fitted scaler next to the data of this version.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/scaler.pickle', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
# Load: restore the fitted scaler without re-fitting it.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/scaler.pickle', 'rb') as f:
    scaler = pickle.load(f)

In [None]:
# Overwrite q_length / r_length with the output of scaler.transform.
# NOTE: this cell is not idempotent - running it again transforms the already
# transformed columns a second time.
regex_data = length_scaler(regex_data)

In [None]:
# Save: checkpoint the cleaned frame (regex-cleaned q / r, transformed lengths, numeric stance).
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/regex_data.pickle', 'wb') as f:
    pickle.dump(regex_data, f)

In [None]:
# Load: restore the cleaned frame so the sections below can run on a fresh kernel.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/regex_data.pickle', 'rb') as f:
    regex_data = pickle.load(f)

## Sentencize
* We use spaCy to sentencize, and we also split sentences on ";" based on our observation.

(remember to revise path)

In [None]:
import spacy

In [None]:
# Load spaCy's small English pipeline; in this notebook it is only used to
# obtain sentence boundaries (`.sents`).
sentencizer = spacy.load('en_core_web_sm')

In [None]:
def sentencize(sentence):
  """Split `sentence` into spaCy sentences, then split each of those on "; ".

  Returns the resulting pieces as a flat list of strings, in document order.
  """
  return [
      piece
      for sent in sentencizer(sentence).sents
      for piece in re.split("; ", str(sent))
  ]

In [None]:
# Sentencize every q and r text; each cell becomes a list of sentence strings.
# Series.map per column is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1.
texts = regex_data[['q', 'r']].copy()
for text_column in ('q', 'r'):
  texts[text_column] = texts[text_column].map(sentencize)

In [None]:
# Save: checkpoint the sentencized q / r texts.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/texts.pickle', 'wb') as f:
    pickle.dump(texts, f)

In [None]:
# Load: restore the sentencized texts without re-running spaCy.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/texts.pickle', 'rb') as f:
    texts = pickle.load(f)

## Extractive Summarization
* For those q and r that are too long to feed into the BERT model, we use **extractive summarization** to shorten the texts to no more than 450 tokens. The purpose of this step is to preserve the overall information of the texts rather than to find arguments, so we simply use bert-extractive-summarizer to do it for us.

(remember to revise path)

In [None]:
!pip install -q bert-extractive-summarizer
!pip install -q transformers

[K     |████████████████████████████████| 5.8 MB 58.9 MB/s 
[K     |████████████████████████████████| 182 kB 77.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 65.9 MB/s 
[?25h

In [None]:
from summarizer import Summarizer
from transformers import BertTokenizer
# Extractive summariser with its default settings (the log printed below shows
# it loading the bert-large-uncased checkpoint).
bert_summarizer = Summarizer()
# Word-piece tokenizer; in this notebook it is only used to measure text length in tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def bert_token_length(text):
  """Return how many input ids the BERT tokenizer emits for `text`, special tokens included."""
  encoding = tokenizer.encode_plus(text, add_special_tokens=True)
  input_ids = encoding['input_ids']
  return len(input_ids)

def spacy_sentencize_count(sentence):
  """Return the number of sentences spaCy finds in `sentence`."""
  return sum(1 for _ in sentencizer(sentence).sents)

In [None]:
def bert_summarize(sentence, max_tokens=450):
  """Shorten `sentence` with the extractive summariser when it is too long for BERT.

  Args:
    sentence: the text to shorten.
    max_tokens: token budget measured with `bert_token_length`. The default of
      450 is the value that was previously hard-coded.

  Returns:
    `sentence` unchanged when it already fits the budget. Otherwise the
    summary obtained after lowering the requested number of sentences until it
    fits or only one sentence is requested; if that summary is empty, the
    first non-empty summary; if both are empty, the original `sentence`.
    The returned text can therefore still exceed `max_tokens`.
  """
  token_length = bert_token_length(sentence)
  if token_length <= max_tokens:
    return sentence

  sentence_count = spacy_sentencize_count(sentence)
  # Initial guess: keep the same share of sentences as the share of tokens that
  # fits the budget. round() can yield 0, which would ask for an empty summary,
  # so at least one sentence is always requested.
  num_sentences = max(1, round(sentence_count * max_tokens / token_length))

  first_summary = bert_summary = ''.join(bert_summarizer(sentence, num_sentences=num_sentences))

  # Drop one sentence at a time while the summary is still over budget.
  while num_sentences > 1 and bert_token_length(bert_summary) > max_tokens:
    num_sentences -= 1
    bert_summary = ''.join(bert_summarizer(sentence, num_sentences=num_sentences))

  if bert_summary != "":
    return bert_summary
  if first_summary != "":
    return first_summary
  return sentence

In [None]:
# Summarize every q and r text that is over the token budget.
# Series.map per column is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1.
summary = regex_data[['q', 'r']].copy()
for text_column in ('q', 'r'):
  summary[text_column] = summary[text_column].map(bert_summarize)

Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Save: checkpoint the summarized q / r texts.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/summary.pickle', 'wb') as f:
    pickle.dump(summary, f)

In [None]:
# Load: restore the summaries without re-running the summariser.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/summary.pickle', 'rb') as f:
    summary = pickle.load(f)

## Reformat
* We reformat the data so that each row is one sentence.

(remember to revise path)

In [None]:
def _content_tokens(text):
    """Tokenise `text` with NLTK and drop single-character tokens found in `punctuations`."""
    return [token for token in word_tokenize(text) if len(token) > 1 or token not in punctuations]


# revised LCS for labeling
def LCS(text1: str, text2: str) -> float:
    """Return the share of `text1`'s tokens covered by its longest common subsequence with `text2`.

    Both texts are tokenised with NLTK; single-character punctuation tokens are
    ignored. The score is len(LCS) / len(tokens of text1), so it is not
    symmetric in its arguments.

    Args:
        text1: candidate text whose coverage is measured.
        text2: reference text.

    Returns:
        A value between 0 and 1; 0.0 when `text1` has no tokens left.
    """
    tokens1 = _content_tokens(text1)
    tokens2 = _content_tokens(text2)
    len_text1 = len(tokens1)
    if len_text1 == 0:
        return 0.0

    # Put the shorter sequence on the column axis so the rolling rows stay small.
    # The LCS length itself does not depend on the order of the two sequences.
    if len(tokens2) > len(tokens1):
        longer, shorter = tokens2, tokens1
    else:
        longer, shorter = tokens1, tokens2

    # Classic LCS dynamic programme keeping only the previous and current row.
    previous_row = [0] * (len(shorter) + 1)
    for token in longer:
        current_row = [0]
        for j, other in enumerate(shorter, start=1):
            if token == other:
                current_row.append(previous_row[j - 1] + 1)
            else:
                current_row.append(max(previous_row[j], current_row[j - 1]))
        previous_row = current_row

    return previous_row[-1] / len_text1

In [None]:
# revised labeling method

# Only texts with more than this many words are split into sentence rows.
MIN_WORD_COUNT = 25
# A sentence whose LCS score against q' / r' reaches this value is labelled 1.
LCS_LABEL_THRESHOLD = 0.7


def _label_sentences(sentences, reference, threshold=LCS_LABEL_THRESHOLD):
  """Return one 0/1 label per sentence of a text.

  A single sentence is always labelled 1. Otherwise a sentence is labelled 1
  when LCS(sentence, reference) >= threshold, and the best-scoring sentence
  is labelled 1 in any case so that every text has at least one positive.
  """
  if not sentences:
    return []
  if len(sentences) == 1:
    return [1]

  scores = [LCS(sentence, reference) for sentence in sentences]
  labels = [1 if score >= threshold else 0 for score in scores]
  # max() returns the first index among ties, i.e. index 0 when every score is 0.
  best_index = max(range(len(scores)), key=scores.__getitem__)
  labels[best_index] = 1
  return labels


# Rows are collected in a plain list and turned into a DataFrame once, which
# avoids the quadratic cost of calling pd.concat inside the loop.
records = []

for i in tqdm(regex_data.index):
  q_sentences = texts.at[i, 'q']
  r_sentences = texts.at[i, 'r']
  if len(q_sentences) == 0 or len(r_sentences) == 0:
    continue

  # (sentences, reference span, word count, is_q flag): quote first, then response.
  sides = (
      (q_sentences, regex_data.at[i, "q'"], regex_data.at[i, 'q_count'], 1),
      (r_sentences, regex_data.at[i, "r'"], regex_data.at[i, 'r_count'], 0),
  )
  for sentences, reference, word_total, is_q in sides:
    if word_total <= MIN_WORD_COUNT:
      continue
    labels = _label_sentences(sentences, reference)
    for sentence, label in zip(sentences, labels):
      records.append({'id': i, 'sentence': sentence, 'is_q': is_q, 'label': label})

reformat_df = pd.DataFrame(records, columns=['id', 'sentence', 'is_q', 'label'])
reformat_df = reformat_df.set_index('id', drop=True)

100%|██████████| 7987/7987 [01:36<00:00, 82.91it/s]


In [None]:
# Save: checkpoint the sentence-level frame (one row per sentence, indexed by id).
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/reformat_df.pickle', 'wb') as f:
    pickle.dump(reformat_df, f)

In [None]:
# Load: restore the sentence-level frame without re-running the labeling loop.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/reformat_df.pickle', 'rb') as f:
    reformat_df = pickle.load(f)

In [None]:
# Preview: one row per sentence with its is_q flag and 0/1 label.
reformat_df.head()

Unnamed: 0_level_0,sentence,is_q,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,based on the idea that people are dispensible ...,0,1
9,a woman would abort her baby because being a m...,0,0
9,in the same way we send soldiers to kill the e...,0,0
10,"I personly would not condone an abortion , how...",1,1
10,you ca n't pass your ideas of when life began ...,1,1


## Combine
* Just combine above results

(remember to revise path)

In [None]:
# combine regex_data, summary and reformat_df:
# q / r are swapped for their summaries, the result is joined onto every
# sentence row by id, and rows with an empty sentence are dropped.
df_1 = regex_data.assign(q=summary['q'], r=summary['r'])
new_df = reformat_df.merge(df_1, how="left", left_index=True, right_index=True)
new_df = new_df[new_df['sentence'] != '']

In [None]:
# Share of sentence rows labelled 1.
sum(new_df['label'])/len(new_df)

0.3720446584938704

In [None]:
# Save: write the final combined frame for this data version.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/new_df.pickle', 'wb') as f:
    pickle.dump(new_df, f)

In [None]:
# Load: read the final combined frame back.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/new_df.pickle', 'rb') as f:
    new_df = pickle.load(f)

In [None]:
# Preview: sentence rows joined with the per-id columns of regex_data.
new_df.head()

Unnamed: 0_level_0,sentence,is_q,label,q,r,s,q',r',length,q_count,r_count,q_length,r_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9,based on the idea that people are dispensible ...,0,1,"once again , you seem to support the killing o...",based on the idea that people are dispensible ...,1,seem to support the killing of certain people,based on the idea that people are dispensible ...,138,14,58,-0.412499,-0.056123
9,a woman would abort her baby because being a m...,0,0,"once again , you seem to support the killing o...",based on the idea that people are dispensible ...,1,seem to support the killing of certain people,based on the idea that people are dispensible ...,138,14,58,-0.412499,-0.056123
9,in the same way we send soldiers to kill the e...,0,0,"once again , you seem to support the killing o...",based on the idea that people are dispensible ...,1,seem to support the killing of certain people,based on the idea that people are dispensible ...,138,14,58,-0.412499,-0.056123
10,"I personly would not condone an abortion , how...",1,1,"I personly would not condone an abortion , how...","This is a pretty touchy issue , and I agree wi...",1,"personly would not condone an abortion , howev...","This is a pretty touchy issue , and I agree wi...",615,244,293,1.767727,1.592185
10,you ca n't pass your ideas of when life began ...,1,1,"I personly would not condone an abortion , how...","This is a pretty touchy issue , and I agree wi...",1,"personly would not condone an abortion , howev...","This is a pretty touchy issue , and I agree wi...",615,244,293,1.767727,1.592185
