In [1]:
!pip -q install transformers==4.10.0
!pip -q install sentencepiece==0.1.94
!pip -q install datasets
!pip install happytransformer

[K     |████████████████████████████████| 2.8 MB 28.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 66.9 MB/s 
[K     |████████████████████████████████| 596 kB 53.8 MB/s 
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
[K     |████████████████████████████████| 895 kB 62.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 16.9 MB/s 
[K     |████████████████████████████████| 325 kB 15.8 MB/s 
[K     |████████████████████████████████| 136 kB 53.3 MB/s 
[K     |████████████████████████████████| 212 kB 55.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 43.7 MB/s 
[K     |████████████████████████████████| 127 kB 60.1 MB/s 
[K     |████████████████████████████████| 271 kB 49.6 MB/s 
[K     |████████████████████████████████| 144 kB 51.3 MB/s 
[K     |████████████████████████████████| 94 kB 3.1 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [2]:
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration
from transformers import AdamW, WarmUp, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import os
import time
import copy
import numpy
import matplotlib.pyplot as plt
from happytransformer import HappyTextToText, TTSettings

In [3]:
from google.colab import drive
drive.mount('/content/drive')

df=pd.read_csv("/content/drive/MyDrive/IT350_dataset/Preprocessed_Train_Dataset.csv")


Mounted at /content/drive


In [4]:
print(df['table'])

0         <page_title> List of 8/9 PM telenovelas of Red...
1         <page_title> List of Chicago Bears first-round...
2         <page_title> Brian Ebersole </page_title> <sec...
3         <page_title> 78th United States Congress </pag...
4         <page_title> Elagabalus </page_title> <section...
                                ...                        
120756    <page_title> 2010–11 Logan Cup </page_title> <...
120757    <page_title> List of compositions by Franz Lis...
120758    <page_title> Tobias Harris </page_title> <sect...
120759    <page_title> WMRQ-FM </page_title> <section_ti...
120760    <page_title> Swimming at the 2012 Summer Olymp...
Name: table, Length: 120761, dtype: object


In [5]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model=torch.load("/content/drive/MyDrive/IT350_dataset/T5model_epoch_7", map_location=device)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

In [7]:
special_tokens_dict = {'pad_token': '<pad>', 'bos_token': '<bos>', 'eos_token': '<eos>', 
                       'additional_special_tokens': ['<PAGESTART>', '<PAGEEND>', '<SECTIONSTART>', '<SECTIONEND>',
                                                     '<TABLESTART>','<TABLEEND>','<CELLSTART>','<CELLEND>','<COLHEADERSTART>',
                                                     '<COLHEADEREND>','<ROWHEADERSTART>','<ROWHEADEREND>']}

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print('We have added', num_added_toks, 'tokens')
model.encoder.resize_token_embeddings(len(tokenizer))
model.decoder.resize_token_embeddings(len(tokenizer))

We have added 14 tokens


Embedding(32114, 768)

In [31]:
MAXLENI=400
MAXLENO=200

class tottodataset(Dataset):
  def __init__(self,df,tokenizer):
    self.sentence=df['sentence']
    self.table=df['table']
    self.tokenizer=tokenizer

  def __len__(self):
    return len(self.sentence)
  
  def __getitem__(self,idx):
    inp=(self.table[idx]+'</s>').replace("<page_title>", "<PAGESTART>").replace("</page_title>", "<PAGEEND>") \
                                    .replace("<section_title>", "<SECTIONSTART>").replace("</section_title>", "<SECTIONEND>") \
                                    .replace("<table>", "<TABLESTART>").replace("</table>", "<TABLEEND>") \
                                    .replace("<cell>", "<CELLSTART>").replace("</cell>", "<CELLEND>") \
                                    .replace("<col_header>", "<COLHEADERSTART>").replace("</col_header>", "<COLHEADEREND>") \
                                    .replace("<row_header>", "<ROWHEADERSTART>").replace("</row_header>", "<ROWHEADEREND>")
    out=self.sentence[idx]+'</s>'
    inp_tokens=self.tokenizer.encode_plus(inp, padding="max_length", max_length=MAXLENI, truncation=True)
    out_tokens=self.tokenizer.encode_plus(out, padding="max_length", max_length=MAXLENO, truncation=True)
    inp_id=inp_tokens.input_ids
    out_id=out_tokens.input_ids
    inp_mask=inp_tokens.attention_mask
    out_mask=out_tokens.attention_mask
    labels=out_tokens.input_ids.copy()
    labels=[-100  if x==self.tokenizer.pad_token_id else x for x in labels]

    return {
        "table_text":inp,
        "sentence":out,
        "input_ids":torch.tensor(inp_id, dtype=torch.long),
        "input_attention_mask":torch.tensor(inp_mask, dtype=torch.long),
        "decoder_input_ids":torch.tensor(out_id, dtype=torch.long),
        "decoder_attention_mask":torch.tensor(out_mask, dtype=torch.long),
        "labels":torch.tensor(labels, dtype=torch.long)
    }

In [32]:
test_dataset=tottodataset(df,tokenizer)
test_dataloader=DataLoader(test_dataset,batch_size=64,num_workers=2,shuffle=False)

In [33]:
print(len(test_dataset))
print(len(test_dataloader))

120761
1887


In [34]:
print(test_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7f02324ccb50>


In [11]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32114, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [12]:
!nvidia-smi

Mon Apr 25 15:42:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    27W /  70W |   2610MiB / 15109MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
test_iterator=iter(test_dataloader)
test_batch=next(test_iterator)
generation_output = model.generate(test_batch['input_ids'].to(device), return_dict_in_generate=True, output_scores=True)

In [14]:
generation_output["sequences"]

tensor([[    0,    71, 16717,  ...,  3246,     6,     3],
        [    0,  2158,  4960,  ...,    18,  7775,    57],
        [    0,  7798,   262,  ...,  5390,     3, 22367],
        ...,
        [    0,   461,   668,  ...,  4483,   425,   412],
        [    0, 13197,  1138,  ...,  4837, 14604, 16010],
        [    0, 18973, 17155,  ..., 15465,  1150,  3802]], device='cuda:0')

In [15]:
print(len(generation_output["sequences"]))
n = len(generation_output["sequences"])

64


Grammar Correction

In [16]:
def grammer_corrector(input_text):
    happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
    args = TTSettings(num_beams=5, min_length=1)
    input1 = "grammer: " + input_text
    result = happy_tt.generate_text(input1, args=args)
    return result.text

In [17]:
expected_sentences = []
predicted_sentences = []
for i in range(0,n):
  expected_sentences.append(test_batch["sentence"][i])
  predicted_sentences.append(grammer_corrector(tokenizer.decode(generation_output["sequences"][i],skip_special_tokens=True)))

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

04/25/2022 15:43:41 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:43:53 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:44:05 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:44:16 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:44:28 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:44:39 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:44:51 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:45:02 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:45:13 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:45:25 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:45:36 - INFO - happytransformer.happy_transformer -   Using model: cuda
04/25/2022 15:45:47 - INFO - happytransformer.happy_tr

In [18]:
#import IPython
#IPython.display.HTML(filename=f'/content/drive/MyDrive/IT350_dataset/html_output/example-2.html')

In [19]:
i = 5

print("Expected Sentence :",expected_sentences[i])
print("Predicted Sentence : ",predicted_sentences[i])

Expected Sentence : The Flower Drum Song received six Tony Award nominations, but won only one for the Best Conductor and Musical Director, for Salvatore Dell'Isola.</s>
Predicted Sentence :  Flower Drum Song received seven Tony Award nominations, including Best Musical Director, Best Conductor and Best Musical Director.


Finding similarity between expected and predicted sentence

In [20]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt
import numpy as np
import nltk

In [21]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [22]:
def cleaning_file(text):    
    tokens = word_tokenize(text)                                # split into words    
    tokens = [w.lower() for w in tokens]                        # convert to lower case    
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]             # remove punctuation from each word    
    words = [word for word in stripped if word.isalpha()]       # remove remaining tokens that are not alphabetic
    text = ""
    for i in words:
        text = text + i + " "
    return text

In [23]:
cleaned_expected_sentences = []
cleaned_predicted_sentences = []

for i in range(0,n):
  cleaned_expected_sentences.append(cleaning_file(expected_sentences[i]))
  cleaned_predicted_sentences.append(cleaning_file(predicted_sentences[i]))

Construct k-shingles

In [24]:
def construct_shingles_word_based(k,text):
    shingle = []
    splitted_text = text.split(" ")
    n = len(splitted_text)
    for i in range(0,n-k):
        x = " ".join(splitted_text[i:i+k])
        if(x not in shingle):
            shingle.append(x)   
            
    return shingle

Jaccard word based similarity

In [25]:
def jaccard_similarity_word_based(fx,fy):
    c = 0
    k = 1
    s1 = construct_shingles_word_based(k,fx)
    s2 = construct_shingles_word_based(k,fy)
    for j in s1:
        if(j in s2):
            c = c + 1
    return c/(len(s1)+len(s2)-c)

jaccard_similarity = []
for i in range(0,n):
  jaccard_similarity.append(jaccard_similarity_word_based(cleaned_expected_sentences[i],cleaned_predicted_sentences[i]))

print("Average of jaccard similarity =",sum(jaccard_similarity)/len(jaccard_similarity))
print("Maximum jaccard similarity =",max(jaccard_similarity))
print("Minimum jaccard similarity =",min(jaccard_similarity))

Average of jaccard similarity = 0.45808476090767375
Maximum jaccard similarity = 0.9230769230769231
Minimum jaccard similarity = 0.05555555555555555


Cosine word based similarity

In [26]:
def cosine_similarity_word_based(fx,fy):
    k = 1
    s1 = construct_shingles_word_based(k,fx)
    s2 = construct_shingles_word_based(k,fy)
    union_list = s1.copy()
    union_list.extend(s2)
    union_list = list(set(union_list))
    l1 = []
    l2 = []
    for i in union_list:
        if i in s1:
            l1.append(1)
        else:
            l1.append(0)
        if i in s2:
            l2.append(1)
        else:
            l2.append(0)
    c = 0
    for i in range(0,len(union_list)):
        c = c + l1[i]*l2[i]
    
    return c/float((sum(l1)*sum(l2))**0.5)

cosine_similarity = []
for i in range(0,n):
  cosine_similarity.append(cosine_similarity_word_based(cleaned_expected_sentences[i],cleaned_predicted_sentences[i]))

print("Average of cosine similarity =",sum(cosine_similarity)/len(cosine_similarity))
print("Maximum cosine similarity =",max(cosine_similarity))
print("Minimum cosine similarity =",min(cosine_similarity))

Average of cosine similarity = 0.6125742292789897
Maximum cosine similarity = 0.9607689228305227
Minimum cosine similarity = 0.10540925533894598
