# AI CUP 2022: Argument Detection (Predict)
Meng-Chieh, Liu  
2022/11/28

## Import

In [None]:
# Mount Google Drive at /content/drive; the data, checkpoint and answer paths
# configured in the "main" section live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q torchtext torch pytorch-lightning
!pip install -q transformers
!pip install -q nltk==3.7
!pip install -q bert-extractive-summarizer

[K     |████████████████████████████████| 798 kB 9.1 MB/s 
[K     |████████████████████████████████| 512 kB 58.2 MB/s 
[K     |████████████████████████████████| 125 kB 79.2 MB/s 
[K     |████████████████████████████████| 87 kB 7.4 MB/s 
[?25h  Building wheel for fire (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 5.8 MB 8.1 MB/s 
[K     |████████████████████████████████| 182 kB 56.5 MB/s 
[K     |████████████████████████████████| 7.6 MB 40.7 MB/s 
[?25h

In [None]:
# general purposes
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

# Huggingface transformers
import transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

# pytorch
import torch
from torch import nn, cuda
from torch.utils.data import DataLoader,Dataset

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# sklearn
from sklearn.preprocessing import StandardScaler

# for text processing
import re
import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Punctuation characters plus a space. Raw string: "\]" in a plain literal is an
# invalid escape sequence (SyntaxWarning on recent Python); the value is unchanged.
punctuations = r'''!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~'''
from summarizer import Summarizer

# general setup
# Seed NumPy and PyTorch before any model is instantiated.
RANDOM_SEED = 666
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# setup models
# Module-level helpers shared by the preprocessing functions defined below:
#   bert_summarizer - extractive summarizer built with its default settings
#   tokenizer       - bert-base-uncased tokenizer (used to count tokens)
#   sentencizer     - spaCy en_core_web_sm pipeline (used to split sentences)
bert_summarizer = Summarizer()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sentencizer = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Functions

### Preprocessing

In [None]:
def read_df(df_path):
  """Load the competition CSV and return the cleaned ``q`` / ``r`` / ``s`` columns.

  Args:
    df_path: path of a UTF-8 CSV file with at least the columns ``id``, ``q``,
      ``r`` and ``s``.

  Returns:
    A DataFrame indexed by ``id`` with the columns ``q``, ``r``, ``s`` (leading
    and trailing double quotes stripped) plus ``q_length`` and ``r_length``,
    the character lengths of the stripped ``q`` and ``r`` texts.
  """
  text_columns = ['q', 'r', 's']
  # .copy() gives an independent frame so the assignments below never hit a view.
  df = pd.read_csv(df_path, encoding="utf-8", index_col='id')[text_columns].copy()
  for col in text_columns:
    # Column-wise .str.strip replaces the deprecated DataFrame.applymap; missing
    # or non-string cells become text first instead of raising AttributeError.
    df[col] = df[col].fillna('').astype(str).str.strip('"')
  df['q_length'] = df['q'].str.len()
  df['r_length'] = df['r'].str.len()
  return df

In [None]:
def sentencize(sentence):
  """Split ``sentence`` with the spaCy pipeline and return its sentences as strings."""
  pieces = []
  for span in sentencizer(sentence).sents:
    pieces.append(str(span))
  return pieces

In [None]:
def bert_token_length(text):
  """Return how many token ids the BERT tokenizer produces for ``text``, special tokens included."""
  encoded = tokenizer.encode_plus(text, add_special_tokens=True)
  input_ids = encoded['input_ids']
  return len(input_ids)

def spacy_sentencize_count(sentence):
  """Return the number of sentences the spaCy pipeline finds in ``sentence``."""
  total = 0
  for _ in sentencizer(sentence).sents:
    total += 1
  return total

In [None]:
def bert_summarize(sentence):
  """Shorten ``sentence`` with the extractive summarizer when it is long.

  Texts of at most 400 BERT tokens are returned unchanged. Longer texts are
  summarized to a sentence count proportional to 400 / token length; while the
  summary is still above 450 tokens and more than one sentence is requested,
  the requested count is lowered by one and the summary recomputed.

  Returns:
    The last non-empty summary, else the first summary if non-empty, else the
    original text.
  """
  token_length = bert_token_length(sentence)

  # Short enough already: nothing to summarize.
  if token_length <= 400:
    return sentence

  sentence_count = spacy_sentencize_count(sentence)
  num_sentences = round(sentence_count * 400 / token_length)

  first_summary = ''.join(bert_summarizer(sentence, num_sentences=num_sentences))
  latest_summary = first_summary

  # Tighten the summary one sentence at a time until it fits.
  while num_sentences > 1 and bert_token_length(latest_summary) > 450:
    num_sentences -= 1
    latest_summary = ''.join(bert_summarizer(sentence, num_sentences=num_sentences))

  # Prefer the tightened summary, then the first attempt, then the raw text.
  for candidate in (latest_summary, first_summary):
    if candidate != "":
      return candidate

  return sentence

In [None]:
import re
def regex_remove(text):
  """Strip tokenised HTML entities and ``-- -- `` separators from ``text``.

  Two substitutions are applied in order:
    1. space-separated entities such as ``& amp ; `` or ``& # 8217 ; ``
       (2-8 letters/digits between ``&`` and ``;``, trailing space included);
    2. every occurrence of ``-- -- `` (trailing space included).

  Args:
    text: the string to clean.

  Returns:
    The cleaned string.
  """
  # Raw strings: "\d" inside a plain literal is an invalid escape sequence
  # (SyntaxWarning on recent Python); the compiled patterns are unchanged.
  text = re.sub(r"& #? ?[a-zA-Z\d]{2,8} ; ", '', text)
  text = re.sub(r"-- -- ", '', text)
  return text

### model

In [None]:
class bertDataset(Dataset):
    """Dataset that pairs every candidate sentence with its ``q`` and ``r`` texts.

    Each item holds two BERT encodings, (sentence, q) and (sentence, r), both
    padded/truncated to 512 tokens, plus the float feature vector
    ``[q_length, r_length, is_q]`` of the row.
    """

    def __init__(self, df, tokenizer):
        """Cache the text columns and numeric features of ``df``.

        Args:
            df: DataFrame with the columns ``q``, ``r``, ``sentence``,
                ``q_length``, ``r_length`` and ``is_q``.
            tokenizer: object exposing ``encode_plus`` (a BERT tokenizer).
        """
        self.tokenizer = tokenizer
        self.q = list(df["q"])
        self.r = list(df["r"])
        self.sentence = list(df["sentence"])
        self.length = len(self.sentence)
        feature_matrix = np.array(df[['q_length', 'r_length', 'is_q']], dtype=np.float32)
        self.features = torch.FloatTensor(feature_matrix)
        self.max_len = 512

    def __len__(self):
        """Number of candidate sentences."""
        return self.length

    def _encode_with(self, item_idx, partner_text):
        """Encode (sentence, partner_text) as one padded pair; return (ids, mask, type ids)."""
        encoded = self.tokenizer.encode_plus(
            self.sentence[item_idx],
            partner_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch axis; flatten it away.
        return tuple(
            encoded[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )

    def __getitem__(self, item_idx):
        """Return the two encodings and the feature vector of row ``item_idx``."""
        with_q = self._encode_with(item_idx, self.q[item_idx])
        with_r = self._encode_with(item_idx, self.r[item_idx])
        return {
            'sentence_q': with_q,
            'sentence_r': with_r,
            'features': self.features[item_idx],
        }

In [None]:
class bertClassifier(pl.LightningModule):
    """Two BERT encoders feeding two independent 2-way classification heads.

    ``bert1`` encodes the (sentence, q) pair and ``bert2`` the (sentence, r)
    pair. Both pooled outputs, their element-wise product and the extra
    feature vector are concatenated and passed to each head.
    """

    def __init__(self, dropout_rate=0.1):
        """Build the encoders, the two heads and the loss.

        Args:
            dropout_rate: dropout probability used inside both heads.
        """
        super().__init__()

        def build_head():
            # Input width 768*3 + 3 matches the concatenation done in forward().
            return nn.Sequential(
                nn.Linear(768*3+3, 512),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
                nn.Linear(512, 2),
            )

        self.bert1 = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
        self.bert2 = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
        self.fc_task1 = build_head()
        self.fc_task2 = build_head()
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids1, attention_mask1, token_type_ids1, input_ids2, attention_mask2, token_type_ids2, features):
        """Return ``(logits1, logits2)``, the outputs of the two heads.

        Arguments suffixed ``1`` go to ``bert1`` and those suffixed ``2`` to
        ``bert2``; ``features`` is appended to the pooled representations.
        """
        pooled_q = self.bert1(
            input_ids=input_ids1,
            attention_mask=attention_mask1,
            token_type_ids=token_type_ids1,
        ).pooler_output
        pooled_r = self.bert2(
            input_ids=input_ids2,
            attention_mask=attention_mask2,
            token_type_ids=token_type_ids2,
        ).pooler_output
        combined = torch.cat([pooled_q, pooled_r, pooled_q*pooled_r, features], 1)
        return self.fc_task1(combined), self.fc_task2(combined)

### predict

In [None]:
def predict(df, model, dataloader):
  """Score every row of ``df`` with the first head of ``model``.

  Args:
    df: DataFrame whose rows are in the same order as the items served by
      ``dataloader``; it receives a new ``label_1`` column (modified in place).
    model: classifier called with the six encoding tensors and the feature
      tensor of each batch; only its first returned logits are used.
    dataloader: yields dicts with the keys ``sentence_q``, ``sentence_r`` and
      ``features``.

  Returns:
    ``df`` with ``label_1`` set to the softmax probability of class 1.
  """
  # dim=1 normalises over the class logits of each row; leaving dim implicit
  # is deprecated and triggers a warning.
  softmax = nn.Softmax(dim=1)
  batch_probs = []

  with torch.no_grad():
    for batch in tqdm(dataloader):
      input_ids1, attention_mask1, token_type_ids1 = batch['sentence_q']
      input_ids2, attention_mask2, token_type_ids2 = batch['sentence_r']
      features = batch['features']

      logits1, _ = model(input_ids1.to(device), attention_mask1.to(device), token_type_ids1.to(device),
                  input_ids2.to(device), attention_mask2.to(device), token_type_ids2.to(device), features.to(device))
      # Keep per-batch results on the CPU and concatenate once at the end
      # instead of re-allocating a growing tensor on every batch.
      batch_probs.append(softmax(logits1).cpu())

  if batch_probs:
    label_predict_np = torch.cat(batch_probs).numpy()
  else:
    # Empty loader: keep a (0, 2) shape so the column selection below still works.
    label_predict_np = np.empty((0, 2), dtype=np.float32)

  df['label_1'] = label_predict_np[:, 1]

  return df

In [None]:
def reformat(df_answer, df_predict, threshold=0.5):
  """Turn sentence-level scores into one ``q`` and one ``r`` answer per id.

  For each id and each side (``is_q == 1`` for q, ``is_q == 0`` for r), the
  sentences whose ``label_1`` is at least ``threshold`` are joined with a
  space; when no sentence reaches the threshold, the single best-scoring
  sentence is used instead.

  Args:
    df_answer: DataFrame indexed by id with the columns ``q`` and ``r``;
      updated in place.
    df_predict: DataFrame indexed by id (one row per sentence) with the columns
      ``sentence``, ``is_q`` and ``label_1``.
    threshold: minimum ``label_1`` for a sentence to be kept.

  Returns:
    ``df_answer``. Cells for which ``df_predict`` has no rows are left
    untouched, so answers filled in beforehand are preserved.
  """

  def _pick_sentences(rows):
    """Return the answer text for one side, or None when there is nothing to pick."""
    rows = rows.reset_index(drop=True)
    scores = rows['label_1']
    selected = rows.loc[scores >= threshold, 'sentence']
    if not selected.empty:
      return " ".join(selected)
    if scores.notna().any():
      # Nothing reaches the threshold: fall back to the first best-scoring sentence.
      return rows.at[scores.idxmax(), 'sentence']
    return None

  for id in df_answer.index:
    if id not in df_predict.index:
      # No sentence of this id was scored; keep whatever is already there.
      continue

    try:
      # List selection always yields a DataFrame, even for a single matching row.
      data = df_predict.loc[[id]]
      for column, is_q in (('q', 1), ('r', 0)):
        text = _pick_sentences(data[data['is_q'] == is_q])
        if text is not None:
          # .at writes into df_answer itself (no chained indexing).
          df_answer.at[id, column] = text
    except Exception as err:
      # Best effort: report the cause and carry on with the next id.
      print(f'Error in id {id}: {err!r}')

  return df_answer

### main func

In [None]:
def main(df_path, model_path, threshold=0.5):
  """Run the whole prediction pipeline and return one q / r answer per id.

  Steps: load the checkpoint, read and clean the CSV, split q and r into
  sentences, summarize long texts, score every sentence of the multi-sentence
  texts with the model and assemble the answers.

  Args:
    df_path: path of the input CSV (see ``read_df``).
    model_path: path of the Lightning checkpoint to load.
    threshold: minimum predicted probability for a sentence to be kept.

  Returns:
    A DataFrame indexed like the input with the columns ``q`` and ``r``.
  """
  # load pretrained models
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  # load_from_checkpoint is a classmethod: call it on the class rather than on
  # a throw-away instance (which loads both BERT encoders twice and is rejected
  # by recent Lightning releases).
  model = bertClassifier.load_from_checkpoint(model_path)
  model.eval()
  model.to(device)

  # preprocess
  df = read_df(df_path)
  df['q'] = df['q'].map(regex_remove)
  df['r'] = df['r'].map(regex_remove)
  print("except length: {}".format(len(df)))
  df["q_sentences"] = df['q'].map(sentencize)
  df["r_sentences"] = df['r'].map(sentencize)
  df["q_summary"] = df['q'].map(bert_summarize)
  df["r_summary"] = df['r'].map(bert_summarize)

  # reformat data
  loader_columns = ['id', 'sentence', 'q', 'r', 'is_q', 'q_length', 'r_length']
  # Answers that need no model: texts made of at most one sentence.
  df_direct = pd.DataFrame(index=df.index, columns=['q', 'r'])
  frames = []
  for id in tqdm(df.index):
    for side, is_q in (('q', 1), ('r', 0)):
      # Test the sentence list of THIS id; len(df[...]) would be the row count
      # of the whole frame.
      sentences = df.at[id, side + "_sentences"]
      if len(sentences) <= 1:
        df_direct.at[id, side] = df.at[id, side]
        continue

      # One row per candidate sentence; the scalar values are broadcast.
      frames.append(pd.DataFrame({
          'id': id,
          'sentence': sentences,
          'q': df.at[id, "q_summary"],
          'r': df.at[id, "r_summary"],
          'is_q': is_q,
          'q_length': df.at[id, "q_length"],
          'r_length': df.at[id, "r_length"],
      }, columns=loader_columns))

  if not frames:
    # Every text was a single sentence: nothing to score.
    print("output length: {}".format(len(df_direct)))
    return df_direct

  # Single concat instead of growing the frame inside the loop.
  df_loader = pd.concat(frames).set_index('id')

  # model predict
  dataset = bertDataset(df=df_loader, tokenizer=tokenizer)
  dataloader = DataLoader(dataset, batch_size=16, num_workers=2)
  df_predict = predict(df_loader, model, dataloader)

  # create answer
  # Only ids that went through the model are handed to reformat; the direct
  # answers collected above always take precedence over its output.
  scored_ids = df_direct.index[df_direct.index.isin(df_predict.index)]
  df_model = reformat(pd.DataFrame(index=scored_ids, columns=['q', 'r']), df_predict, threshold)
  df_answer = df_direct.where(df_direct.notna(), df_model.reindex(df_direct.index))
  print("output length: {}".format(len(df_answer)))

  return df_answer

### modifying

In [None]:
# Remove double quotes
def remove_double_quotes(df, col):
  """Remove one pair of enclosing double quotes from every value of ``df[col]``.

  The column is replaced in place. A value is changed only when it is a string
  that both starts and ends with ``"``; anything else (unquoted text, a lone
  quote, missing values) is left as it is instead of losing its first and last
  character.

  Args:
    df: DataFrame to modify.
    col: name of the text column.
  """
  def _unquote(value):
    is_quoted = (
        isinstance(value, str)
        and len(value) >= 2
        and value[0] == '"'
        and value[-1] == '"'
    )
    return value[1:-1] if is_quoted else value

  df[col] = df[col].map(_unquote)
  
# Count words in a comment
import string
def word_count(df, col):
  """Return, for every text in ``df[col]``, its NLTK word count after dropping punctuation characters."""
  def _count_words(text):
    kept_chars = (ch for ch in text if ch not in string.punctuation)
    return len(nltk.word_tokenize(''.join(kept_chars)))

  return df[col].apply(_count_words)

### formatting

In [None]:
def format_result(df):
    """Wrap ``q`` and ``r`` in double quotes and return the rows sorted by ``id``.

    Args:
        df: DataFrame with the columns ``id``, ``q`` and ``r``.

    Returns:
        A new DataFrame. ``df`` itself is left untouched, so re-running the
        calling cell does not add a second pair of quotes.
    """
    # Add double quotes on a copy (assign never modifies its source frame)
    quoted = df.assign(q='"' + df['q'] + '"', r='"' + df['r'] + '"')
    # Sort the dataframe by id
    return quoted.sort_values(by='id')

## main
Remember to update `df_path` and `model_path` below so they point to your own test data and model checkpoint.

In [None]:
# df_path: CSV with the unlabeled test data; model_path: Lightning checkpoint to load.
# Both are passed to main().
df_path="/content/drive/Shareddrives/AI_CUP_NLP/Batch_answers - test_data(no_label).csv"
model_path="/content/drive/Shareddrives/AI_CUP_NLP/lightning_logs/version_3/checkpoints/epoch=13-val_loss=1.761.ckpt"

In [None]:
# Run the full pipeline; sentences scoring at least 0.25 are kept.
df_answer = main(df_path=df_path, model_path=model_path, threshold=0.25)

In [None]:
# Save the model answers; the index is written too, so the file has an id column.
df_answer.to_csv('/content/drive/Shareddrives/AI_CUP_NLP/answer/model_v8_old.csv')

## Modifying

For any `q` or `r` text that has at most `threshold` words, we assume it does not need summarization and keep the raw text instead of the model's prediction. `q` and `r` are checked independently.

In [None]:
# Word-count cut-off: q / r texts with at most this many words keep their raw text.
threshold = 25

In [None]:
# Raw testing data
# Drive is mounted at /content/drive at the top of this notebook, so the path
# must start there; the /content/gdrive prefix does not exist in this session.
test_raw = pd.read_csv('/content/drive/MyDrive/AI CUP/dataset/Batch_answers - test_data(no_label).csv')
remove_double_quotes(test_raw, 'q')
remove_double_quotes(test_raw, 'r')
test_raw['q_count'] = word_count(test_raw, 'q')
test_raw['r_count'] = word_count(test_raw, 'r')
test_raw

Unnamed: 0,id,q,r,s,q_count,r_count
0,6199,-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -...,"If so , why do we still have apes , and why ar...",DISAGREE,1,37
1,5807,There 's a lot of discussion there on that iss...,Of course . The makers of Expelled were within...,DISAGREE,34,45
2,8487,`` It 's not helping . The guns these people h...,"Oh , I would wager about like Mexico , about 8...",DISAGREE,60,67
3,1760,Shooting : 3 seriously injured in Auburn shoot...,Pickup strikes group of four youths | Houston ...,AGREE,79,47
4,6228,This is the argument concerning 'choice ' that...,I believe there is a point at which we ( socie...,DISAGREE,173,24
...,...,...,...,...,...,...
2011,9499,You are betraying your belief system .,Yep . ( I 'm assuming that by `` belief system...,AGREE,6,42
2012,4611,"You are in a loud minority , railing against t...",Being in the minority or in the majority is ir...,DISAGREE,13,16
2013,9328,You bet your XXX that 'd make me happy .,"Well , first , I probably would n't bet my XXX...",DISAGREE,9,40
2014,5225,you say `` f * * * the Constitution. ``,and gun nuts say f * * * the children when we ...,DISAGREE,5,18


In [None]:
# Read back the model answers saved earlier in this notebook. That file was
# written under the /content/drive mount point, so it is read from there too
# (the /content/gdrive prefix does not exist in this session).
answer_final = pd.read_csv('/content/drive/Shareddrives/AI_CUP_NLP/answer/model_v8_old.csv')
answer_final_q = answer_final[['id', 'q']]
answer_final_r = answer_final[['id', 'r']]

In [None]:
# Rows whose q (resp. r) has at most `threshold` words keep their raw text.
q_is_short = test_raw['q_count'] <= threshold
r_is_short = test_raw['r_count'] <= threshold
test_raw_q = test_raw.loc[q_is_short, ['id', 'q']]
test_raw_r = test_raw.loc[r_is_short, ['id', 'r']]
test_raw_q

Unnamed: 0,id,q
0,6199,-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -...
10,2066,"In principle , there is no difference between ..."
19,4766,At least we 're on the right side of the issue...
27,2514,"So , Clive , everyone that believes that God c..."
30,8485,"Hey pandion , voting according to the rules of..."
...,...,...
2011,9499,You are betraying your belief system .
2012,4611,"You are in a loud minority , railing against t..."
2013,9328,You bet your XXX that 'd make me happy .
2014,5225,you say `` f * * * the Constitution. ``


In [None]:
# ids whose raw q / r text is kept as the answer
q_list = test_raw_q.loc[:, 'id'].values
r_list = test_raw_r.loc[:, 'id'].values

In [None]:
# Keep the model answer only for ids that are NOT covered by the raw short texts.
q_from_model = ~answer_final_q['id'].isin(q_list)
r_from_model = ~answer_final_r['id'].isin(r_list)
answer_final_q_1 = answer_final_q[q_from_model]
answer_final_r_1 = answer_final_r[r_from_model]
answer_final_q_1

Unnamed: 0,id,q
1,5807,But it really does n't matter -- a private scr...
2,8487,Have they attempted to trace handguns recovere...
3,1760,Shooting : 3 seriously injured in Auburn shoot...
4,6228,I think that it is imperative to allow a women...
5,3537,Why are some parts of the bible open to interp...
...,...,...
1985,3666,Who are you to tell me which `` tools `` I need ?
1995,3570,You need n't be far left to support reasonable...
2002,6454,While this baby 21 week baby is living in one ...
2004,4058,Why do some of you guys insist on being rabid ...


In [None]:
# Stack the raw short texts on top of the remaining model answers, per side.
q_parts = [test_raw_q, answer_final_q_1]
r_parts = [test_raw_r, answer_final_r_1]
answer_q = pd.concat(q_parts)
answer_r = pd.concat(r_parts)
for combined in (answer_q, answer_r):
    print(combined.shape)

(2016, 2)
(2016, 2)


In [None]:
# Join the q and r answers into one row per id.
answer = answer_q.merge(answer_r, on="id")
answer

Unnamed: 0,id,q,r
0,6199,-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -...,"If so , why do we still have apes , and why ar..."
1,2066,"In principle , there is no difference between ...",In the same sense that shooting an intruder wh...
2,4766,At least we 're on the right side of the issue...,"More like the wrong side of the issues , ever ..."
3,2514,"So , Clive , everyone that believes that God c...",Anyone who sticks to beliefs that a god create...
4,8485,"Hey pandion , voting according to the rules of...","But what if it is unconstitutional , like Prop..."
...,...,...,...
2011,3666,Who are you to tell me which `` tools `` I need ?,It absolutely astounds me that there are Ameri...
2012,3570,You need n't be far left to support reasonable...,Reasonable gun regulation is an oxymoron like ...
2013,6454,While this baby 21 week baby is living in one ...,"There is nobody living inside of the baby , it..."
2014,4058,Why do some of you guys insist on being rabid ...,oh because for the past decade or so they have...


## Format Answers as Required


In [None]:
# Wrap q / r in double quotes and sort by id; the last line displays the result.
format_answer = format_result(answer)
format_answer

Unnamed: 0,id,q,r
541,1,"""I got a good idea . however , they do tend to...","""By your own admission you havenÂ ’ t 'hung ou..."
255,2,"""Be sure to give your guns a big fat kiss toni...","""Actually , they did n't ."""
1720,3,"""One of the biggest arguments against gun cont...","""To be more correct regarding government and g..."
245,4,"""First of all , compare the ' B ' specimen in ...","""At your service : Comparison I could 've just..."
608,5,"""There are some incedents that are beyond your...","""Well yes ."""
...,...,...,...
301,9852,"""But what happens if a man marries his first c...","""Darwin did marry his first cousin and his off..."
1726,9853,"""Guidance by humans in the blind Watchmaker do...","""There are many who do n't deny evolution , th..."
484,9854,"""The funny thing is that you 'll know there 'l...","""I dunno , we have 365 days to find out ... th..."
220,9855,"""I think it is only logical as an atheist to s...","""But did you consider it relative when you pos..."


In [None]:
# Save the results
# Drive is mounted at /content/drive at the top of this notebook; the
# /content/gdrive prefix does not exist in this session.
format_answer.to_csv('/content/drive/Shareddrives/AI_CUP_NLP/answer/model_v8_old_threshold25.csv', index=False)