In [None]:
! pip install gdown

In [None]:
# Fetch the dataset archive from Google Drive and store it locally as GBV.zip.
import gdown 
# Direct-download link of the shared Drive file.
url = 'https://drive.google.com/uc?export=download&id=1zFIVT5wKEmwiasBafjDlqdZF92Gi1blA' 
# Local file name the archive is written to.
output = 'GBV.zip'
gdown.download(url, output)

In [None]:
# Extract the downloaded archive into the working directory
# (presumably provides Train.csv and Test.csv, which are read below — confirm).
! unzip  GBV.zip

In [None]:
import pandas as pd
import numpy as np

import random
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch 
import warnings
from tqdm import tqdm_notebook

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

# Seed the Python, NumPy and PyTorch random number generators with the same value
# so that the random choices made later in the notebook start from a fixed state.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)


In [None]:
# Training data; later cells read its "Tweet_ID", "tweet" and "type" columns.
df = pd.read_csv("Train.csv")

In [None]:
# Test data; later cells read and rewrite its "tweet" column.
test_df = pd.read_csv("Test.csv")

In [None]:
# Distinct label values of the training set; the position of a label in this
# array is used as its integer class id.
idtoclass = df["type"].unique()
# Reverse lookup: label value -> integer class id.
classtoid = {label: idx for idx, label in enumerate(idtoclass)}
print(idtoclass)
print(classtoid)

In [None]:
# Replace every label in the "type" column by its integer id from `classtoid`.
# NOTE(review): the column is overwritten in place, so running this cell a second
# time looks up the integer ids in `classtoid` and will likely raise KeyError.
df["type"] = df["type"].apply(lambda x:classtoid[x])

In [None]:
# Number of training rows per class id (class id -> row count).
classcount = {
    idx: int((df["type"] == idx).sum())
    for idx in range(len(idtoclass))
}
print(classcount)

In [None]:
# Preview the first rows after the "type" column has been converted to integer ids.
df.head()

In [None]:
import nltk
from nltk import word_tokenize
import string
nltk.download("punkt")
def remove_punct(s):
  """Return `s` with punctuation tokens removed.

  The text is tokenised with NLTK's `word_tokenize`; every token that consists
  only of characters from `string.punctuation` is dropped and the remaining
  tokens are joined with single spaces.

  Args:
    s: the text to clean (a `str`).

  Returns:
    The cleaned text as a single space-separated string.
  """
  punct_chars = set(string.punctuation)
  # A token counts as punctuation when *all* of its characters are punctuation.
  # The previous `tok not in string.punctuation` was a substring test against the
  # punctuation string, so multi-character tokens such as "..." or the "``" / "''"
  # quote tokens emitted by the tokenizer were never removed.
  kept = [
    tok for tok in word_tokenize(s)
    if not all(ch in punct_chars for ch in tok)
  ]
  return " ".join(kept)
  
  

## Removing punctuation

In [None]:
# Strip punctuation from every tweet, first in the training frame, then in the test frame.
df["tweet"] = df["tweet"].map(remove_punct)
test_df["tweet"] = test_df["tweet"].map(remove_punct)

## Lowercasing the data

In [None]:
# Lower-case every tweet (test frame first, then training frame).
test_df["tweet"] = test_df["tweet"].map(str.lower)
df["tweet"] = df["tweet"].map(str.lower)

In [None]:
# Preview the training rows after punctuation removal and lower-casing.
df.head()

In [None]:
# Persist the cleaned test set.
# NOTE(review): unlike the other exports in this notebook, no `index=False` is passed
# here, so the row index is written as an extra column — confirm this is intended.
test_df.to_csv("test_data.csv")

## Augmentation with synonym replacement 

In [None]:
import nltk
nltk.download("wordnet")
from nltk.corpus import wordnet

# Probability that a given word is considered for synonym replacement.
AUGMENTATION_PROB = 0.3
def getsynonym(word):
  """Return `word` itself or, with probability AUGMENTATION_PROB, a random WordNet synonym.

  When the word is selected for replacement, the candidates are the word itself
  plus every lemma name of every WordNet synset of the word, so the original
  word may still be returned.

  Args:
    word: a single token.

  Returns:
    The original word or one of its candidate synonyms.
  """
  if random.random() > AUGMENTATION_PROB:
    return word

  candidates = {word}
  for syn in wordnet.synsets(word):
    for name in syn.lemma_names():
      # Multi-word lemma names are joined with underscores (e.g. "ice_cream");
      # turn them back into plain words so no "_" leaks into the text.
      candidates.add(name.replace("_", " "))

  # Sort before sampling: the iteration order of a set of strings can change
  # between interpreter sessions (string hash randomisation), which would make
  # the result differ from run to run even though `random` is seeded.
  return random.choice(sorted(candidates))

def augment(s):
  """Build an augmented copy of `s` by passing each word through `getsynonym`.

  The text is split on whitespace, every word is offered to `getsynonym`
  (which may or may not swap it), and the results are joined with single spaces.
  """
  return " ".join(getsynonym(token) for token in s.split())


In [None]:
# Target number of rows per class after augmentation.
Min_SAMPLES  = 30000
# Start from the original training rows, without the id column.
df_augmented = df.copy().drop("Tweet_ID", axis=1)

# Collect the synthetic rows in a plain list and add them to the frame once.
# Calling DataFrame.append for every row copies the whole frame each time
# (quadratic cost) and the method no longer exists in pandas 2.x.
augmented_rows = []
for i in range(len(idtoclass)):
    texts = list(df[df.type==i].tweet)
    # Number of extra rows needed for this class; zero or negative means none.
    missing = Min_SAMPLES - classcount[i]
    for _ in range(missing):
        txt = random.choice(texts)
        augmented_rows.append({"tweet": augment(txt), "type": i})

if augmented_rows:
    df_augmented = pd.concat(
        [df_augmented, pd.DataFrame(augmented_rows)],
        ignore_index=True,
    )

In [None]:
# Save the synonym-augmented training set without the index column.
# NOTE(review): the file name is spelled "synoym"; it is left as is because other
# code may read the file under this exact name.
df_augmented.to_csv("synoym_30000.csv", index=False)

## Augmentation using transformers (based on this blog post: https://towardsdatascience.com/nlp-data-augmentation-using-transformers-89a44a993bab)

In [None]:
class Augment:
    """Apply one randomly chosen augmentation to a text.

    Args:
        augmentations: sequence of objects exposing an `augment(text)` method.
        probs: optional selection probability for each entry of `augmentations`
            (same order, must sum to 1 as required by `np.random.choice`).
            Defaults to a uniform distribution.
    """
    def __init__(self, augmentations, probs=None):
        self.augmentations = augmentations
        if probs is None:
            self.probs = np.ones((len(augmentations)))/len(augmentations)
        else:
            self.probs = probs

    def augment(self, text):
        """Pick one augmentation according to `self.probs` and return its output for `text`."""
        # Use the probabilities stored on the instance; the previous code read a
        # module-level `probs` and ignored (or failed without) the constructor argument.
        # An index is drawn instead of the object itself so NumPy never has to
        # coerce the augmentation objects into an array.
        idx = np.random.choice(len(self.augmentations), p=self.probs)
        return self.augmentations[idx].augment(text)

In [None]:
class Translation:
    """Back-translation augmenter: English -> German -> English."""

    def __init__(self):
        # Forward direction (English -> German): T5 through the translation pipeline, on device 0.
        self.translator_en_to_de = pipeline("translation_en_to_de", model='t5-base',device=0)

        # Reverse direction (German -> English): Bert2Bert checkpoint and its tokenizer.
        self.tokenizer = AutoTokenizer.from_pretrained(
            "google/bert2bert_L-24_wmt_de_en",
            pad_token="<pad>",
            eos_token="</s>",
            bos_token="<s>",
        )
        self.model_de_to_en = AutoModelForSeq2SeqLM.from_pretrained("google/bert2bert_L-24_wmt_de_en")

    def augment(self, text):
        """Translate `text` to German and back, returning the resulting English text."""
        german = self.translator_en_to_de(text)[0]['translation_text']

        # Encode the German text without special tokens and let the model generate English ids.
        encoded = self.tokenizer(german, return_tensors="pt", add_special_tokens=False)
        generated = self.model_de_to_en.generate(encoded.input_ids)

        # Only the first generated sequence is decoded.
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
class Insertion:
    """Augmenter that inserts one model-predicted word into the text.

    A `[MASK]` token is placed at a random interior position and a fill-mask
    pipeline (bert-base-cased, device 0) proposes the word to put there.
    """
    def __init__(self):
        self.unmasker = pipeline('fill-mask', model='bert-base-cased',device=0)

    def augment(self, text):
        """Return `text` with one extra word inserted.

        Example: "I went to see a movie in the theater" may become
        "I went to see a new movie in the theater".

        Texts with fewer than two words have no interior position and are
        returned unchanged.
        """
        orig_text_list = text.split()
        len_input = len(orig_text_list)
        if len_input < 2:
            # Nothing to insert between; previously this raised ValueError in randint.
            return text

        # Random insertion index, never at the very start or end. The upper bound is
        # clamped to 1 so that two-word texts (the only slot is between the words)
        # no longer make randint fail on an empty range.
        rand_idx = random.randint(1, max(len_input - 2, 1))

        new_text_list = orig_text_list[:rand_idx] + ['[MASK]'] + orig_text_list[rand_idx:]
        new_mask_sent = ' '.join(new_text_list)

        # Keep the first candidate returned by the pipeline.
        augmented_text_list = self.unmasker(new_mask_sent)
        return augmented_text_list[0]['sequence']

In [None]:
class Replacement:
    """Augmenter that replaces one word of the text with a model-predicted word.

    A random word (never the first one) is swapped for `[MASK]` and a fill-mask
    pipeline (bert-base-cased, device 0) proposes the substitute.
    """
    def __init__(self):
        self.unmasker = pipeline('fill-mask', model='bert-base-cased',device=0)

    def augment(self, text):
        """Return `text` with one word replaced by a different predicted word.

        Example: "I went to see a movie in the theater" may become
        "I went to watch a movie in the theater".

        The text is returned unchanged when it has fewer than two words, or when
        every candidate proposed by the model equals the masked word.
        """
        orig_text_list = text.split()
        len_input = len(orig_text_list)
        if len_input < 2:
            # No word at index >= 1 to mask; previously this raised ValueError in randint.
            return text

        # Random index of the word to replace (index 0 is never chosen).
        rand_idx = random.randint(1, len_input - 1)
        orig_word = orig_text_list[rand_idx]
        new_text_list = orig_text_list.copy()
        new_text_list[rand_idx] = '[MASK]'
        new_mask_sent = ' '.join(new_text_list)

        # Fall back to the input so the result is always defined, even if no
        # candidate differs from the original word.
        augmented_text = text
        # Take the first candidate whose token is not the word that was masked.
        for res in self.unmasker(new_mask_sent):
            if res['token_str'] != orig_word:
                augmented_text = res['sequence']
                break
        return augmented_text

In [None]:
class Generation:
    """Augmenter that lets GPT-2 continue the input text."""

    def __init__(self):
        # NOTE(review): device=1 differs from the device=0 used by the other
        # augmenters in this notebook — confirm a second device is intended.
        self.generator =  pipeline('text-generation', model='gpt2', device=1)

    def augment(self, text):
        """Return the first continuation generated for `text`.

        Example: "I went to see a movie in the theater" may become
        "I went to see a movie in the theater, and the director was".
        """
        extra_words = 5
        # NOTE(review): the length budget is derived from the whitespace word count;
        # if `max_length` is measured in model tokens the budget may be tighter
        # than intended — confirm against the pipeline documentation.
        length_budget = len(text.split()) + extra_words
        candidates = self.generator(text, max_length=length_budget, num_return_sequences=5)
        # Five sequences are requested, only the first one is used.
        return candidates[0]['generated_text']

In [None]:
# Augmentation strategies to sample from; the trailing comment holds an alternative, fuller list.
augmentations = [Replacement()] #[Translation(), Insertion(), Replacement()]
# Selection probability for each entry of `augmentations`, in the same order
# (the trailing comment holds the probabilities matching the fuller list).
probs = [1.0]#[0.40,0.30,0.30]

In [None]:
# Build the augmenter that picks one of `augmentations` according to `probs`.
augmenter = Augment(augmentations, probs)

In [None]:
# Smoke test: print 20 augmented variants of one sample sentence.
text = "I'am here and i want to go the cinema"

for i in range(20):
    print(augmenter.augment(text))

In [None]:
# Target number of rows per class after augmentation.
Min_SAMPLES  = 30000
# Start from the original training rows, without the id column.
df_augmented = df.copy().drop("Tweet_ID", axis=1)

# Collect the synthetic rows in a plain list and add them to the frame once.
# Calling DataFrame.append for every row copies the whole frame each time
# (quadratic cost) and the method no longer exists in pandas 2.x.
augmented_rows = []
for i in range(len(idtoclass)):
    texts = list(df[df.type==i].tweet)
    # Number of extra rows needed for this class; zero or negative means none.
    missing = Min_SAMPLES - classcount[i]
    print(f"[Class {i}]")
    for _ in tqdm_notebook(range(missing)):
        txt = random.choice(texts)
        augmented_rows.append({"tweet": augmenter.augment(txt), "type": i})

if augmented_rows:
    df_augmented = pd.concat(
        [df_augmented, pd.DataFrame(augmented_rows)],
        ignore_index=True,
    )

In [None]:
# Preview the first rows of the augmented training frame.
df_augmented.head()

In [None]:
# Save the transformer-augmented training set without the index column.
# NOTE(review): the file name says "1000" while Min_SAMPLES is 30000 in this
# notebook — confirm the name still describes the content.
df_augmented.to_csv("Train_augmented_1000.csv", index=False)

In [None]:
# Report how many distinct tweets each class holds after augmentation.
# The count is taken on the "tweet" column only: calling nunique() on the whole
# frame returns one value per column (a Series), not a single number.
# The loop covers every known class instead of a hard-coded five.
for i in range(len(idtoclass)):
    n_distinct = df_augmented.loc[df_augmented['type'] == i, "tweet"].nunique()
    print(f"We have {n_distinct} distinc samples for class {i}")