### Load libraries

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 23.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 58.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 75.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  

In [None]:
import os
import pickle 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from tqdm import tqdm
import math
from torch.utils.data import Dataset, DataLoader
import json
import random
# None means "never truncate cell text". The old -1 sentinel is deprecated
# (pandas >= 1.0 emits a FutureWarning for it).
pd.set_option('display.max_colwidth', None)

  # This is added back by InteractiveShellApp.init_path()


## CoLA Cleaning

#### Load data

In [None]:
# Read PARANMT
# NOTE: pickle.load can execute arbitrary code while deserializing; only load trusted files.

with open('/content/paranmt_dev.pickle', 'rb') as handle:
    paranmt_dev = pickle.load(handle)
with open('/content/paranmt_train.pickle', 'rb') as handle:
    paranmt_train = pickle.load(handle)

# Elements 3 and 4 of every training record form the (t1, t2) sentence pair;
# pairs with a missing side are discarded.
pair_rows = [record[3:5] for record in paranmt_train]
df = pd.DataFrame(pair_rows, columns=["t1", "t2"]).dropna()

In [None]:
# Read GYAFC
# Directories of the "test" and "tune" splits for both GYAFC domains.
paths_tt = [
    f"/content/GYAFC_Corpus/{domain}/{split}"
    for domain in ("Entertainment_Music", "Family_Relationships")
    for split in ("test", "tune")
]

def _read_lines(path):
  """Return the lines of the text file at `path`, without line terminators."""
  with open(path, encoding="utf-8") as f:
    return f.read().splitlines()


def read_tt_data(paths_tt):
  """Build one frame pairing each split's main files with its other files.

  For every directory in `paths_tt`, the files named exactly "formal" and
  "informal" are the main sides. Every other file in the directory is paired
  line-by-line with the opposite main file:
    * a file whose name contains "informal" supplies the "informal" column and
      is paired with the main "formal" file (col == "right");
    * any other file supplies the "formal" column and is paired with the main
      "informal" file (col == "left").

  Args:
    paths_tt: iterable of directory paths.

  Returns:
    A DataFrame with columns "formal", "informal", "file" and "col". The
    per-file indices are kept as-is (not reset). Empty DataFrame when no
    paired files are found.

  Raises:
    FileNotFoundError: a file needs a main "formal"/"informal" counterpart
      that does not exist in the same directory.
    ValueError: (from pandas) a file and its main counterpart differ in length.
  """
  frames = []
  for p in paths_tt:
    # Sorted so the row order does not depend on the OS directory listing order.
    files = sorted(os.listdir(p))

    # Resolved per directory so a missing main file can never silently fall
    # back to the one read from a previous directory.
    formal_main = _read_lines(os.path.join(p, "formal")) if "formal" in files else None
    informal_main = _read_lines(os.path.join(p, "informal")) if "informal" in files else None

    for file in files:
      if file == "formal" or file == "informal":
        continue
      contents = _read_lines(os.path.join(p, file))
      if "informal" in file:
        if formal_main is None:
          raise FileNotFoundError(f"{p} has no 'formal' file to pair with {file!r}")
        frames.append(pd.DataFrame({"formal": formal_main, "informal": contents, "file": file, "col": "right"}))
      else:
        if informal_main is None:
          raise FileNotFoundError(f"{p} has no 'informal' file to pair with {file!r}")
        frames.append(pd.DataFrame({"formal": contents, "informal": informal_main, "file": file, "col": "left"}))

  if not frames:
    return pd.DataFrame()
  # Single concat instead of growing the frame inside the loop (which is quadratic).
  return pd.concat(frames)


# Combined frame for every directory listed in paths_tt (columns: formal, informal, file, col).
df_gyafc = read_tt_data(paths_tt)

In [None]:
# Read GYAFC train
paths = [
    "/content/GYAFC_Corpus/Entertainment_Music/train/formal",
    "/content/GYAFC_Corpus/Entertainment_Music/train/informal",
    "/content/GYAFC_Corpus/Family_Relationships/train/formal",
    "/content/GYAFC_Corpus/Family_Relationships/train/informal",
]

# Route each file's lines into the formal or informal pool, based on the file name.
formal_sents, informal_sents = [], []
for p in paths:
  with open(p) as f:
    content = f.read().splitlines()
  bucket = formal_sents if p.rsplit("/", 1)[-1] == "formal" else informal_sents
  bucket.extend(content)

# Sentences are paired by position; zip stops at the shorter of the two pools.
train_gyafc = pd.DataFrame(list(zip(formal_sents, informal_sents)), columns=["formal", "informal"])

In [None]:
# Read ACL ARC -> original files without paraphrase
# Each file is kept as a list of raw JSONL lines.
with open('/content/acl_testing.jsonl', 'r') as json_file:
    acl_testing = list(json_file)

with open('/content/acl_training.jsonl', 'r') as json_file:
    acl_training = list(json_file)
# Backward-compatible alias for the original, misspelled name.
act_training = acl_training

# One JSON object per line; only its "cur_sent" field is kept.
# Whitespace-only lines (e.g. a trailing blank line) are skipped instead of crashing json.loads.
acl_testing_sents = [json.loads(json_str)["cur_sent"] for json_str in acl_testing if json_str.strip()]
acl_training_sents = [json.loads(json_str)["cur_sent"] for json_str in acl_training if json_str.strip()]

In [None]:
# Read ACL ARC pairs -> include paraphrases from GPT2
# NOTE: pickle.load can execute arbitrary code while deserializing; only load trusted files.
# Later cells read .input / .output columns from both objects (presumably pandas DataFrames -- confirm).
with open('/content/acl_testing_pair.pkl', 'rb') as handle:
    acl_test_pair = pickle.load(handle)

with open('/content/acl_train_pair.pkl', 'rb') as handle:
    acl_train_pair = pickle.load(handle)


In [None]:
# Generated by M1 reward
# NOTE: pickle.load can execute arbitrary code while deserializing; only load trusted files.
# A later cell reads its "output_no_punct" column (presumably a pandas DataFrame -- confirm).
with open('/content/acl_train_pair_m1_reward.pkl', 'rb') as handle:
    acl_train_pair_m1_reward = pickle.load(handle)


In [None]:
class CustomDataset(Dataset):
  """Index-aligned pairing of token-id rows with their attention-mask rows.

  Item `i` is the tuple `(token_ids[i], attention_masks[i])`.
  """

  def __init__(self, token_ids, attention_masks):
    """Store the two parallel sequences.

    Args:
      token_ids: indexable, sized collection of token-id rows.
      attention_masks: indexable, sized collection with one mask row per
        token-id row.

    Raises:
      ValueError: the two collections differ in length.
    """
    # Validate before storing so a failed construction leaves no half-built state.
    if len(token_ids) != len(attention_masks):
      raise ValueError(
          "token_ids and attention_masks must have the same length "
          f"(got {len(token_ids)} and {len(attention_masks)})"
      )
    self.token_ids = token_ids
    self.attention_masks = attention_masks

  def __len__(self):
    """Number of (token_ids, attention_mask) pairs."""
    return len(self.token_ids)

  def __getitem__(self, index):
    """Return the `(token_ids, attention_mask)` pair stored at `index`."""
    return self.token_ids[index], self.attention_masks[index]

#### Load model

In [None]:
# Checkpoint shared by the tokenizer and the sequence classifier used for fluency scoring.
cola_checkpoint = "cointegrated/roberta-large-cola-krishna2020"
tokenizer = AutoTokenizer.from_pretrained(cola_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(cola_checkpoint)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
_ = model.to(device)
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

### Testing

In [None]:
# Seed the global RNG so the draw is repeatable. choices() samples WITH
# replacement, so the same sentence can appear more than once.
random.seed(2)
sampled_sents = random.choices(acl_training_sents, k=200000)

In [None]:
# get_conf is defined in a cell further down this notebook; that cell must have been run first.
# Index 0 of each probability row is stored as the fluency confidence.
probs = get_conf(acl_train_pair_m1_reward["output_no_punct"].tolist())
fluent_conf = [row[0] for row in probs]
acl_train_pair_m1_reward["output_no_punct_fluency"] = fluent_conf

100%|██████████| 611/611 [10:49<00:00,  1.06s/it]


In [None]:
# Fluency confidence (index 0 of each probability row) for every test-pair paraphrase.
probs = get_conf(acl_test_pair["output"].tolist())
fluent_conf = [row[0] for row in probs]
acl_test_pair["output_fluency"] = fluent_conf

In [None]:
# Score the sampled sentences themselves. Reusing `probs` from the previous
# cell would attach the acl_test_pair scores to unrelated rows (or fail on a
# length mismatch).
df_acl = pd.DataFrame(sampled_sents, columns = ["text"])
probs = get_conf(sampled_sents)
fluent_conf = [p[0] for p in probs]
df_acl["fluent_conf"] = fluent_conf

In [None]:
# Keep only sentences whose fluency confidence exceeds 0.9.
cleaned_acl_train = df_acl.loc[df_acl["fluent_conf"] > 0.9]

In [None]:
# Persist the filtered frame to the current working directory, using the
# highest pickle protocol this Python supports.
with open('cleaned_acl_train.pkl', 'wb') as handle:
    pickle.dump(cleaned_acl_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Run inference on batched dataset

In [None]:
def get_conf(texts, batch_size=128, max_length=50):
  """Return the classifier's softmax probabilities for every text.

  Relies on the notebook-level `tokenizer`, `model`, `device` and
  `CustomDataset`. All texts are tokenized (padded, truncated) and moved to
  `device` up front, then fed through the model in batches.

  Args:
    texts: list of strings to score.
    batch_size: number of texts per forward pass.
    max_length: maximum token length; longer texts are truncated.

  Returns:
    A list with one entry per text, in input order. Each entry is the list of
    class probabilities obtained by a softmax over the logits. Callers in this
    notebook read index 0 as the fluency confidence. Returns [] for empty
    input.
  """
  # Nothing to score; also avoids handing an empty batch to the tokenizer.
  if len(texts) == 0:
    return []

  tokenizer_res = tokenizer(texts, padding = True, return_attention_mask=True, return_tensors="pt", max_length=max_length, truncation=True).to(device)
  loader = DataLoader(CustomDataset(tokenizer_res["input_ids"], tokenizer_res["attention_mask"]), batch_size=batch_size, shuffle=False)

  all_probs = []
  # Iterate the loader directly: `iterator.next()` is a Python 2 style alias that
  # newer PyTorch releases no longer provide, and tqdm gets its total from len(loader).
  for token_ids, attention_masks in tqdm(loader):
    torch.cuda.empty_cache()
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
      outputs = model(token_ids, attention_masks)
    all_probs.extend(outputs.logits.softmax(dim = -1).tolist())
  return all_probs

In [None]:
# Fluency confidence (index 0 of each probability row) for the formal side of every GYAFC row.
formal_probs = get_conf(df_gyafc["formal"].tolist())
probs_fluent_formal = [row[0] for row in formal_probs]
df_gyafc["formal_fluent"] = probs_fluent_formal

100%|██████████| 473/473 [04:49<00:00,  1.63it/s]


In [None]:
# Fluency confidence (index 0 of each probability row) for the informal side of every GYAFC row.
informal_probs = get_conf(df_gyafc["informal"].tolist())
probs_fluent_informal = [row[0] for row in informal_probs]
df_gyafc["informal_fluent"] = probs_fluent_informal

100%|██████████| 473/473 [04:49<00:00,  1.63it/s]


In [None]:
# Keep only pairs where both sides score above 0.9 on fluency.
both_fluent = (df_gyafc["formal_fluent"] > 0.9) & (df_gyafc["informal_fluent"] > 0.9)
cleaned_gyafc = df_gyafc[both_fluent]
# Drop the bookkeeping columns from df_gyafc itself; cleaned_gyafc above was taken before the drop.
df_gyafc = df_gyafc.drop(["file", "col"], axis="columns")

In [None]:
# NOTE(review): `num_batches` and `loader` are not defined at notebook scope in
# the visible cells (they only exist as locals inside get_conf), so this cell
# raises NameError on a fresh kernel. It looks like an inlined copy of
# get_conf's loop -- confirm which texts it was meant to score, or remove it.
# Unlike get_conf, the forward pass here is not wrapped in torch.no_grad().
all_probs = []
for i in tqdm(range(num_batches)):
  torch.cuda.empty_cache()
  token_ids, attention_masks = loader.next()
  outputs = model(token_ids, attention_masks)
  probs = outputs.logits.softmax(dim = -1).tolist()
  all_probs.extend(probs)

100%|██████████| 1142/1142 [08:53<00:00,  2.14it/s]


## Clean BLEU

### Load data & libraries

In [None]:
import nltk
# Download the "punkt" tokenizer data, used by nltk.word_tokenize (which get_bleu calls).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the previously saved, fluency-filtered frames.
# This rebinds cleaned_gyafc, replacing the in-memory frame built in the CoLA section.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load trusted files.
with open('/content/cleaned_gyafc.pkl', 'rb') as handle:
    cleaned_gyafc = pickle.load(handle)

with open('/content/cleaned_paranmt.pkl', 'rb') as handle:
    cleaned_paranmt = pickle.load(handle)

Downloading...
From: https://drive.google.com/uc?id=1PcJLgQU-OLpU3DDKM1GIkeoFO23-8v1b
To: /content/cleaned_paranmt.pkl
100% 5.15M/5.15M [00:00<00:00, 154MB/s]
Downloading...
From: https://drive.google.com/uc?id=1TUniyyAElB3CM4aGC94HmYTgQK66cQ4T
To: /content/cleaned_gyafc.pkl
100% 3.36M/3.36M [00:00<00:00, 94.7MB/s]


### BLEU filtering

In [None]:
def get_bleu(sents1, sents2, smoothing_function=None):
  """Compute a sentence-level BLEU score for each aligned sentence pair.

  Both sentences are lower-cased and tokenized with nltk.word_tokenize. The
  sentence from `sents1` is the single reference and the one from `sents2` is
  the hypothesis.

  Args:
    sents1: reference sentences (strings).
    sents2: hypothesis sentences (strings), aligned with `sents1` by position.
    smoothing_function: optional callable forwarded to NLTK's sentence_bleu,
      e.g. a method of nltk.translate.bleu_score.SmoothingFunction(). The
      default None keeps NLTK's unsmoothed behavior, which warns when a pair
      shares no higher-order n-grams.

  Returns:
    List of float scores, one per pair. Pairing stops at the shorter input.
  """
  # Give tqdm a total so it can render a real progress bar when the inputs are sized.
  try:
    total = min(len(sents1), len(sents2))
  except TypeError:
    total = None

  all_scores = []
  for s1, s2 in tqdm(zip(sents1, sents2), total=total):
    s1_token = nltk.word_tokenize(s1.lower())
    s2_token = nltk.word_tokenize(s2.lower())
    score = nltk.translate.bleu_score.sentence_bleu([s1_token], s2_token, smoothing_function=smoothing_function)
    all_scores.append(score)
  return all_scores


In [None]:
# Score each frame's sentence pairs and store the result in a "bleu_score" column.
# Tuples are (frame, reference column, hypothesis column), processed in this order.
for frame, ref_col, hyp_col in (
    (cleaned_gyafc, "formal", "informal"),
    (cleaned_paranmt, "t1", "t2"),
    (train_gyafc, "formal", "informal"),
):
  sents1 = frame[ref_col].to_list()
  sents2 = frame[hyp_col].to_list()
  scores = get_bleu(sents1, sents2)
  frame["bleu_score"] = scores

In [None]:
# NOTE(review): this cell repeats the train_gyafc BLEU scoring already done at
# the end of the previous cell; running it recomputes the same column.
sents1 = train_gyafc.formal.to_list()
sents2 = train_gyafc.informal.to_list()
scores = get_bleu(sents1,sents2)
train_gyafc["bleu_score"] = scores

In [None]:
# BLEU of each training paraphrase ("output") against its source sentence ("input").
sents1 = acl_train_pair["input"].tolist()
sents2 = acl_train_pair["output"].tolist()
scores = get_bleu(sents1, sents2)
acl_train_pair["bleu_score"] = scores

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
78148it [00:37, 2083.46it/s]


In [None]:
# Keep pairs whose BLEU is below 0.9, i.e. drop paraphrases nearly identical to their source.
acl_train_pair_bleu = acl_train_pair.loc[acl_train_pair["bleu_score"] < 0.9]

In [None]:
# BLEU of each test paraphrase ("output") against its source sentence ("input").
sents1 = acl_test_pair["input"].tolist()
sents2 = acl_test_pair["output"].tolist()
scores = get_bleu(sents1, sents2)
acl_test_pair["bleu_score"] = scores

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
48093it [00:23, 2061.32it/s]


In [None]:
# Keep pairs whose BLEU is below 0.9, i.e. drop paraphrases nearly identical to their source.
acl_test_pair_bleu = acl_test_pair.loc[acl_test_pair["bleu_score"] < 0.9]

In [None]:
# Stack the BLEU-filtered test pairs on top of the BLEU-filtered train pairs (row indices are kept).
all_acl_cleaned = pd.concat([acl_test_pair_bleu, acl_train_pair_bleu], axis=0)

In [None]:
# Keep rows whose paraphrase fluency exceeds 0.6.
# NOTE(review): in the visible cells "output_fluency" is only assigned on acl_test_pair;
# rows without it (NaN) fail this comparison -- confirm the training pairs carry the column.
all_acl_cleaned = all_acl_cleaned.loc[all_acl_cleaned["output_fluency"] > 0.6]

In [None]:
# Display (without storing) the subset whose paraphrase fluency exceeds 0.9.
all_acl_cleaned.loc[all_acl_cleaned["output_fluency"] > 0.9]

In [None]:
# train_gyafc is built straight from the corpus files, and no visible cell adds
# fluency columns to it, so compute any that are missing. Same convention as
# df_gyafc: index 0 of each get_conf probability row.
for side in ("formal", "informal"):
  fluent_col = f"{side}_fluent"
  if fluent_col not in train_gyafc.columns:
    train_gyafc[fluent_col] = [p[0] for p in get_conf(train_gyafc[side].to_list())]

# Keep pairs that are fluent on both sides and not near-duplicates of each other.
bleu_cleaned_train_gyafc = train_gyafc[(train_gyafc.informal_fluent > 0.9) & (train_gyafc.formal_fluent > 0.9)& (train_gyafc.bleu_score < 0.9)]

In [None]:
# Keep PARANMT pairs whose BLEU is below 0.9 (drops near-identical pairs).
bleu_cleaned_paranmt = cleaned_paranmt.loc[cleaned_paranmt["bleu_score"] < 0.9]

In [None]:
# Keep GYAFC pairs whose BLEU is below 0.9 (drops near-identical pairs).
bleu_cleaned_gyafc = cleaned_gyafc.loc[cleaned_gyafc["bleu_score"] < 0.9]