# AI CUP 2022: Argument Detection
Meng-Chieh, Liu  
2022/11/28

### Note
Only the `q`, `r` and `s` columns of the input CSV are used (the file is indexed by `id`).  

Extra numeric features fed to the model: `q_length`, `r_length`, `is_q`

Pipeline (for each record):
1. read the record and compute the length features
2. split `q` and `r` into sentences
3. return the sentence directly if there is only one
4. apply extractive summarization to long texts
5. model prediction (in batches)



// Browser-console snippet: logs "Working" and clicks the "#connect" element
// inside the Colab connect button's shadow root.
function ClickConnect(){
console.log("Working");
document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click();
}
// Repeat the click every 60000 ms (one minute).
setInterval(ClickConnect,60000)

In [None]:
# Mount Google Drive at /content/drive; the data and checkpoint paths used
# further down in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import

In [None]:
# Install the third-party packages imported below (quiet mode).
# nltk is pinned to 3.7; the other packages are left unpinned.
!pip install -q torchtext torch pytorch-lightning
!pip install -q transformers
!pip install -q nltk==3.7
!pip install -q bert-extractive-summarizer

[K     |████████████████████████████████| 798 kB 5.0 MB/s 
[K     |████████████████████████████████| 125 kB 71.6 MB/s 
[K     |████████████████████████████████| 512 kB 65.3 MB/s 
[K     |████████████████████████████████| 87 kB 6.8 MB/s 
[?25h  Building wheel for fire (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 5.5 MB 4.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 60.7 MB/s 
[K     |████████████████████████████████| 182 kB 69.7 MB/s 
[?25h

In [None]:
# Import all libraries
import pandas as pd
import numpy as np
import re
import pickle
from tqdm import tqdm

# Huggingface transformers
import transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

import torch
from torch import nn, cuda
from torchmetrics import Accuracy, F1Score
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# ASCII punctuation characters plus a space (after the comma).
# Raw string: the literal contains a backslash before "]", which is an invalid
# escape sequence in a normal string literal; the resulting value is unchanged.
punctuations = r'''!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~'''

from summarizer import Summarizer


# Seed NumPy's and PyTorch's global random number generators with one value.
RANDOM_SEED = 666
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Preprocessing

In [None]:
def read_df(df_path):
  """Load the q/r/s text columns of a CSV and add character-length features.

  Args:
    df_path: Path (or file-like object) of a UTF-8 CSV containing an ``id``
      column plus the ``q``, ``r`` and ``s`` columns.

  Returns:
    A DataFrame indexed by ``id`` with the columns ``q``, ``r``, ``s``
    (leading/trailing double quotes stripped) followed by ``q_length`` and
    ``r_length`` (character counts of the cleaned ``q`` and ``r``).
  """
  def _clean(value):
    # Missing cells arrive as NaN and numeric-looking cells as numbers; both
    # used to crash on ``.strip`` / ``len``. Normalise everything to text.
    if pd.isna(value):
      return ''
    return str(value).strip('"')

  df = pd.read_csv(df_path, encoding="utf-8", index_col='id')[['q', 'r', 's']].copy()
  for column in ('q', 'r', 's'):
    df[column] = df[column].map(_clean)

  df['q_length'] = df['q'].map(len)
  df['r_length'] = df['r'].map(len)
  return df

In [None]:
def sentencize(sentence, model):
  """Split ``sentence`` into a list of sentence strings.

  ``model`` is called on the text and must return an object whose ``sents``
  attribute is iterable; each item is converted with ``str``.
  """
  parsed = model(sentence)
  return list(map(str, parsed.sents))

In [None]:
def bert_summarize(sentence, model, length=2000):
  """Return a summary of ``sentence`` when it is longer than 1000 characters.

  Texts of at most 1000 characters are returned unchanged. Longer texts are
  passed to ``model(sentence, num_sentences=10)`` and the joined result is
  returned, unless it is empty, in which case the original text is returned.

  NOTE(review): ``length`` is accepted but never read; the cut-off is the
  literal 1000 below.
  """
  if len(sentence) <= 1000:
    return sentence

  bert_summary = ''.join(model(sentence, num_sentences=10))
  return sentence if bert_summary == "" else bert_summary

In [None]:
import re
def regex_remove(text):
  """Strip tokenised HTML-entity residue and doubled dashes from ``text``.

  Two patterns are deleted (each including its trailing space):
    * ``& `` + optional ``#`` + optional space + 2-8 letters/digits + `` ; ``
      (e.g. ``& amp ; `` or ``& # 39 ; ``)
    * ``-- -- ``

  Returns the cleaned string.
  """
  # Raw strings: "\d" inside a normal string literal is an invalid escape
  # sequence and triggers a warning on recent Python versions.
  text = re.sub(r"& #? ?[a-zA-Z\d]{2,8} ; ", '', text)
  text = re.sub(r"-- -- ", '', text)
  return text

### model

In [None]:
class bertDataset (Dataset):
    """Dataset that pairs every sentence with its ``q`` text and its ``r`` text.

    ``df`` must provide the columns ``q``, ``r``, ``sentence``, ``q_length``,
    ``r_length`` and ``is_q``. Each item is a dict with:
      * ``sentence_q``: (input_ids, attention_mask, token_type_ids) for the
        pair (sentence, q)
      * ``sentence_r``: the same triple for the pair (sentence, r)
      * ``features``: float tensor holding [q_length, r_length, is_q]
    Every encoding is padded/truncated to ``max_len`` (512) tokens.
    """

    # Keys pulled out of each tokenizer result, in output order.
    _ENCODING_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.q = list(df["q"])
        self.r = list(df["r"])
        self.sentence = list(df["sentence"])
        self.length = len(self.sentence)
        feature_matrix = np.array(df[['q_length', 'r_length', 'is_q']], dtype=np.float32)
        self.features = torch.FloatTensor(feature_matrix)
        self.max_len = 512

    def __len__(self):
        return self.length

    def _encode_with(self, contexts, item_idx):
        """Tokenize (sentence, contexts[item_idx]) as one padded text pair."""
        return self.tokenizer.encode_plus(
            self.sentence[item_idx],
            contexts[item_idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )

    def _flat_triple(self, encoded):
        """Flatten the tensors of one encoding into a 3-tuple."""
        return tuple(encoded[key].flatten() for key in self._ENCODING_KEYS)

    def __getitem__(self, item_idx):
        # The q pair is encoded before the r pair, as in the item layout above.
        with_q = self._encode_with(self.q, item_idx)
        with_r = self._encode_with(self.r, item_idx)

        return {
            'sentence_q': self._flat_triple(with_q),
            'sentence_r': self._flat_triple(with_r),
            'features': self.features[item_idx],
        }

In [None]:
class bertClassifier(pl.LightningModule):
    """Two-encoder BERT classifier with two 2-way output heads.

    ``bert1`` encodes the (sentence, q) pair and ``bert2`` the (sentence, r)
    pair. Their pooled outputs, the element-wise product of the two and the
    extra feature vector are concatenated and fed to both heads.
    """

    def __init__(self, dropout_rate=0.1):
        super().__init__()

        # Two separate copies of the pretrained encoder.
        self.bert1 = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
        self.bert2 = BertModel.from_pretrained("bert-base-uncased", return_dict=True)

        # Heads are built in the order task1, task2.
        self.fc_task1 = self._make_head(dropout_rate)
        self.fc_task2 = self._make_head(dropout_rate)
        self.criterion = nn.CrossEntropyLoss()

    @staticmethod
    def _make_head(dropout_rate):
        """Build one head: Linear(768*3+3 -> 512), ReLU, Dropout, Linear(512 -> 2)."""
        return nn.Sequential(
            nn.Linear(768*3+3, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 2),
        )

    def forward(self, input_ids1, attention_mask1, token_type_ids1, input_ids2, attention_mask2, token_type_ids2, features):
        """Return ``(logits1, logits2)``, the outputs of the two heads.

        The ``*1`` arguments go to ``bert1``, the ``*2`` arguments to ``bert2``;
        ``features`` is concatenated to the pooled encoder outputs along dim 1.
        """
        pooled_q = self.bert1(
            input_ids=input_ids1,
            attention_mask=attention_mask1,
            token_type_ids=token_type_ids1,
        ).pooler_output
        pooled_r = self.bert2(
            input_ids=input_ids2,
            attention_mask=attention_mask2,
            token_type_ids=token_type_ids2,
        ).pooler_output

        fused = torch.cat([pooled_q, pooled_r, pooled_q * pooled_r, features], 1)
        return self.fc_task1(fused), self.fc_task2(fused)

### predict

In [None]:
def predict(df, model, dataloader):
  """Run ``model`` over ``dataloader`` and store class-1 probabilities in ``df``.

  Args:
    df: DataFrame with one row per dataset item, in dataloader order. It is
      modified in place: a ``label_1`` column is added (or overwritten).
    model: Callable returning ``(logits1, logits2)``; only ``logits1`` is used.
    dataloader: Iterable of batches shaped like ``bertDataset`` items
      (keys ``sentence_q``, ``sentence_r`` and ``features``).

  Returns:
    The same ``df`` object, with ``label_1`` holding the softmax probability of
    column 1 of ``logits1`` for every row.
  """
  # Explicit dim: normalise across the columns of each 2-D logits batch.
  # Relying on the implicit dimension is deprecated in PyTorch.
  softmax = nn.Softmax(dim=1)
  batch_probabilities = []

  with torch.no_grad():
    for batch in tqdm(dataloader):
      input_ids1, attention_mask1, token_type_ids1 = batch['sentence_q']
      input_ids2, attention_mask2, token_type_ids2 = batch['sentence_r']
      features = batch['features']

      logits1, _ = model(input_ids1.to(device), attention_mask1.to(device), token_type_ids1.to(device),
                  input_ids2.to(device), attention_mask2.to(device), token_type_ids2.to(device), features.to(device))

      # Collect per-batch results and concatenate once after the loop instead
      # of re-allocating a growing tensor on every iteration.
      batch_probabilities.append(softmax(logits1).to('cpu'))

  if batch_probabilities:
    label_predict = torch.cat(batch_probabilities)
  else:
    # An empty dataloader yields an empty (0, 2) result instead of failing on
    # the column slice below.
    label_predict = torch.empty((0, 2))

  label_predict_np = label_predict.numpy()

  df['label_1'] = label_predict_np[:, 1]

  return df

In [None]:
def _select_sentences(rows, threshold):
  """Join the sentences of ``rows`` whose ``label_1`` reaches ``threshold``.

  If the best score is below ``threshold`` the single best-scoring sentence
  (first one on ties) is returned instead. No rows at all gives ``""``.
  """
  best_score = rows['label_1'].max()
  scored = list(zip(rows['sentence'].tolist(), rows['label_1'].tolist()))

  if best_score < threshold:
    return next(sentence for sentence, score in scored if score == best_score)
  return " ".join(sentence for sentence, score in scored if score >= threshold)


def reformat(df_answer, df_predict, threshold=0.5):
  """Fill ``df_answer`` with the selected q / r sentences for every id.

  Args:
    df_answer: DataFrame indexed by id with the columns ``q`` and ``r``;
      filled in place.
    df_predict: DataFrame indexed by id with one row per sentence and the
      columns ``sentence``, ``is_q`` (1 = sentence from q, 0 = from r) and
      ``label_1`` (score).
    threshold: Minimum ``label_1`` for a sentence to be kept.

  Returns:
    ``df_answer``. Ids missing from ``df_predict`` are reported with a printed
    message and left untouched.
  """
  for row_id in df_answer.index:
    try:
      # List indexer: always get a DataFrame back, also when the id owns a
      # single prediction row (a scalar label would return a Series).
      data = df_predict.loc[[row_id]]
    except KeyError:
      print(f'Error in id {row_id}.')
      continue

    # Single .loc assignment instead of chained indexing, which may write to a
    # temporary copy and leave df_answer unchanged.
    df_answer.loc[row_id, 'q'] = _select_sentences(data[data['is_q'] == 1], threshold)
    df_answer.loc[row_id, 'r'] = _select_sentences(data[data['is_q'] == 0], threshold)

  return df_answer

### main func

In [None]:
# Column layout shared by every per-record frame and by the final loader frame.
_LOADER_COLUMNS = ['id', 'sentence', 'q', 'r', 'is_q', 'q_length', 'r_length']


def _sentence_rows(sentences, row_id, is_q, q_length, r_length):
  """Build one loader frame with a row per sentence of a single record."""
  rows = pd.DataFrame(columns=_LOADER_COLUMNS)
  rows['sentence'] = sentences
  rows['id'] = row_id
  # The 'q' and 'r' columns stay empty: the summary columns that used to fill
  # them are disabled in main().
  rows['is_q'] = is_q
  rows['q_length'] = q_length
  rows['r_length'] = r_length
  return rows


def main(df_path, model_path, threshold=0.5):
  """Read the CSV, clean it, split q/r into sentences and build the loader frame.

  Args:
    df_path: Path of the input CSV (see ``read_df``).
    model_path: Checkpoint path. Only referenced by the disabled model-loading
      lines below.
    threshold: Score threshold. Only referenced by the unreachable prediction
      stage below.

  Returns:
    ``df_loader``: a DataFrame indexed by ``id`` with one row per sentence and
    the columns ``sentence``, ``q``, ``r``, ``is_q``, ``q_length`` and
    ``r_length``.

  NOTE(review): the function returns right after building ``df_loader``; the
  prediction stage after that return never runs. Confirm whether the early
  return is a leftover from debugging.
  """
  # load pretrained models
  spacy_sentencizer = spacy.load('en_core_web_sm')
  # bert_summarizer = Summarizer()
  # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  # model = bertClassifier()
  # model = model.load_from_checkpoint(model_path)
  # model.eval()
  # model.to(device)


  # preprocess
  df = read_df(df_path)
  df['q'] = df['q'].map(regex_remove)
  df['r'] = df['r'].map(regex_remove)
  print("except length: {}".format(len(df))) 
  df["q_sentences"] = df['q'].apply(sentencize, args=[spacy_sentencizer])
  df["r_sentences"] = df['r'].apply(sentencize, args=[spacy_sentencizer])
  # df["q_summary"] = df['q'].apply(bert_summarize, args=[bert_summarizer])
  # df["r_summary"] = df['r'].apply(bert_summarize, args=[bert_summarizer])


  # reformat data
  df_answer = pd.DataFrame(index=df.index, columns=['q', 'r'])

  # Start from an empty template so the column layout survives even when no
  # record contributes a sentence; concatenate once after the loop instead of
  # growing the frame on every iteration.
  frames = [pd.DataFrame(columns=_LOADER_COLUMNS)]
  for row_id in df.index:
    for sentence_column, is_q in (("q_sentences", 1), ("r_sentences", 0)):
      sentences = df[sentence_column][row_id]
      # Check this record's sentence list; the previous check measured the
      # length of the whole column and was true for every non-empty input.
      if len(sentences) >= 1:
        frames.append(_sentence_rows(
            sentences, row_id, is_q, df["q_length"][row_id], df["r_length"][row_id]))

  df_loader = pd.concat(frames)
  df_loader = df_loader.set_index('id')  
  return df_loader

  # NOTE(review): unreachable because of the return above. It also needs
  # `model` (and a tokenizer), whose set-up is commented out at the top of
  # this function.
  # model predict
  dataset = bertDataset(df=df_loader, tokenizer=tokenizer)
  dataloader = DataLoader(dataset, batch_size=16, num_workers=2)
  df_predict = predict(df_loader, model, dataloader)

  # create answer
  df_answer = reformat(df_answer, df_predict, threshold)
  print("output length: {}".format(len(df_answer))) 

  return df_answer

### main

In [None]:
import pickle

In [None]:
# Load
# Load a previously pickled object from the shared drive into `reformat_df`.
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted source.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_v3/reformat_df.pickle', 'rb') as f:
    reformat_df = pickle.load(f)

In [None]:
# Work on a copy and add a `tokens` column with the BERT token count of each
# sentence.
# NOTE(review): `bert_token_length` is defined in a later cell of this
# notebook, so that cell has to be executed before this one.
res_1 = reformat_df.copy()
res_1['tokens'] = res_1['sentence'].map(bert_token_length)

In [None]:
# Token-count percentiles 0.00, 0.01, ..., 0.99.
res_1['tokens'].quantile([i/100 for i in range(100)])

0.00     3.0
0.01     4.0
0.02     4.0
0.03     4.0
0.04     5.0
        ... 
0.95    47.0
0.96    50.0
0.97    53.0
0.98    58.0
0.99    68.0
Name: tokens, Length: 100, dtype: float64

In [None]:
# Keep only the rows whose sentence has more than 100 tokens.
x = res_1.loc[res_1['tokens'].gt(100)]

In [None]:
# Input CSV and model checkpoint locations on the mounted shared drive.
df_path="/content/drive/Shareddrives/AI_CUP_NLP/Batch_answers - test_data(no_label).csv"
model_path="/content/drive/Shareddrives/AI_CUP_NLP/lightning_logs/version_4/checkpoints/epoch=11-val_loss=1.45.ckpt"

In [None]:
# Run the pipeline with threshold 0.2. As main() is written, this returns the
# per-sentence loader frame (it has a `sentence` column, used below).
df_test = main(df_path, model_path, 0.2)

except length: 2016


In [None]:
# Module-level tokenizer; `bert_token_length` reads this global.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def bert_token_length(text):
  """Return the number of input ids the global ``tokenizer`` produces for ``text``.

  Special tokens are included in the count.
  """
  return len(tokenizer.encode_plus(text, add_special_tokens=True)['input_ids'])

In [None]:
# Work on a copy of the loader frame and add the BERT token count of each
# sentence as a `tokens` column.
res = df_test.copy()
res['tokens'] = res['sentence'].map(bert_token_length)

In [None]:
# Token-count percentiles 0.00, 0.01, ..., 0.99.
res['tokens'].quantile([i/100 for i in range(100)])

0.00     3.0
0.01     4.0
0.02     4.0
0.03     4.0
0.04     5.0
        ... 
0.95    46.0
0.96    49.0
0.97    52.0
0.98    57.0
0.99    66.0
Name: tokens, Length: 100, dtype: float64

In [None]:
# Iterator over the sentences that have more than 66 tokens.
x = iter(res.loc[res['tokens'].gt(66), 'sentence'].values)

In [None]:
# Print each remaining sentence of the iterator with its running number,
# followed by a separator line.
for i, j in enumerate(x):
  print(i, j, '------------------------------', sep='\n')

In [None]:
# NOTE(review): as main() is written, it returns the per-sentence loader frame
# rather than the final q/r answers — confirm before exporting the result.
df_answer = main(df_path, model_path, 0.2)

In [None]:
# Apply regex_remove to every cell of the answer frame.
# NOTE(review): regex_remove expects strings; non-string cells would raise.
df_answer = df_answer.applymap(regex_remove)

In [None]:
# Write the answer frame (including its index) to the shared drive.
df_answer.to_csv('/content/drive/Shareddrives/AI_CUP_NLP/answer/model_v3.5.csv')

### other

In [None]:
# Reload the raw input frame (q, r, s plus length features) for inspection.
df = read_df(df_path)