In [2]:
import sys
!{sys.executable} -m pip install -q transformers langdetect

In [57]:
import pandas as pd

# Fetch the data

In [51]:
data = pd.read_csv("train.csv", nrows=25000, lineterminator='\n', on_bad_lines='warn') # Fetching subset of data for faster training

In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           25000 non-null  object
 1   Description  25000 non-null  object
 2   Summary     25000 non-null  object
dtypes: object(3)
memory usage: 586.1+ KB


In [53]:
data.head(10)

Unnamed: 0,ID,Description,Summary\r
0,HONEUSHRD71328EXTERNALCZCZ,"Join a team recognized for leadership, innovat...","Achieves successful on-site installation, serv..."
1,HILTGLOBALHOT08QSYEXTERNALZHAPAC,为客人提供优质的服务，介绍水疗服务及水疗产品的知识。确保预订系统运作正常。 . 作为水疗中心...,希尔顿在全球 100 多个国家和地区拥有数以千计的酒店，提供无数令人愉悦的机会。 从敞开的大...
2,HIINGLOBALHOT08MXLEXTERNALENGLOBAL,In addition to performance of the essential fu...,The individual must possess the following know...
3,ACCOGLOBAL22022956ENGLOBAL,<html>.Overnight 5th Class Power Engineer. As ...,"As a certified 5th Class Power Engineer, you w..."
4,FIGLUSJR0225226EXTERNAL,<html>.Performs tasks to ensure compliance wit...,Performs tasks to ensure compliance with work ...
5,COGRAU48055EXTERNALENAU,You’ll find us working across all business uni...,"As a Space Manager for Meat, Bakery, Deli and ..."
6,ONINGLOBALP-130297en_GLOBAL,<h1>Must be a respectful communicator</h1>. Mu...,Must have excellent attendance. Must demonstra...
7,KSNEUS49144,Charitable Giving database management: trackin...,"Monthly sponsorship report, export data and fo..."
8,GENEUS202208131555EXTERNALENUS,<h2>Der Learning & Growth Hub der Roche Pharma...,Juli 2022 einen Praktikanten (m/w/d) im Bereic...
9,HIINGLOBALHOT08N63EXTERNALENGLOBAL,Shift:Various - must be available to work days...,"As a Cook III, you would be responsible for pr..."


In [63]:
data.rename(columns={"Summary\r":"Summary"}, inplace=True)

In [58]:
## Clean special characters as they have very less role in summarization of data

def remove_specialchar(df):
  df["Description"] = df['Description'].str.replace(r'[^a-zA-Z0-9 :\.]','')
  df["Summary"] = df['Summary'].str.replace(r'[^a-zA-Z0-9 :\.]','')
  return df

In [59]:
def remove_htmltags(df):
  df['Description'] = df['Description'].str.replace(r'<[^<>]*>', '', regex=True)
  df['Summary'] = df['Summary'].str.replace(r'<[^<>]*>', '', regex=True)
  return df

In [60]:
def clean_data(df):
  df = remove_htmltags(df)
  df = remove_specialchar(df)
  return df

In [61]:
def language(text):
  try:
    lang = detect(text)
    return lang
  except:
    return "non"

In [74]:
from langdetect import detect
def detect_language(df):

  df["lang"] = df["Description"].apply(language)
  return df

In [75]:
data = clean_data(data)

In [76]:
data = data.dropna()

In [77]:
data = detect_language(data)

In [78]:
data = data[data["lang"]=="en"]

In [79]:
data.head(5)

Unnamed: 0,ID,Description,Summary,lang
0,HONEUSHRD71328EXTERNALCZCZ,"Join a team recognized for leadership, innovat...","Achieves successful on-site installation, serv...",en
2,HIINGLOBALHOT08MXLEXTERNALENGLOBAL,In addition to performance of the essential fu...,The individual must possess the following know...,en
3,ACCOGLOBAL22022956ENGLOBAL,.Overnight 5th Class Power Engineer. As a cert...,"As a certified 5th Class Power Engineer, you w...",en
4,FIGLUSJR0225226EXTERNAL,.Performs tasks to ensure compliance with work...,Performs tasks to ensure compliance with work ...,en
5,COGRAU48055EXTERNALENAU,You’ll find us working across all business uni...,"As a Space Manager for Meat, Bakery, Deli and ...",en


In [80]:
# data.to_csv("cleaned_data1.csv", index=False)

In [42]:
data=pd.read_csv("cleaned_data1.csv", nrows=5000)

In [43]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np

In [44]:
base_model="google-t5/t5-small"

In [45]:
tokenizer = T5Tokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration.from_pretrained(base_model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [46]:
from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [47]:
class CustomData(Dataset):
  def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text="Description", target_text="Summary"
    ):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

  def __len__(self):
        """returns the length of dataframe"""
        return len(self.target_text)

  def __getitem__(self, idx):
        """return the input ids, attention masks and target ids"""

        source_text = str(self.source_text[idx])
        target_text = str(self.target_text[idx])

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.summ_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()
        target_mask = target["attention_mask"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

In [48]:
data = data[["Description", "Summary"]]

train_size = 0.8
train_dataset = data.sample(frac=train_size)
val_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomData(
    train_dataset,
    tokenizer,
    250,
    100,
    "Description",
    "Summary",
)
val_set = CustomData(
    val_dataset,
    tokenizer,
    250,
    100,
    "Description",
    "Summary",
)

In [49]:
training_loader = DataLoader(training_set, batch_size= 32, shuffle= True)
val_loader = DataLoader(val_set, batch_size = 16, shuffle= False)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e3)

In [None]:
from tqdm import tqdm

model = model.to(device)
for epoch in range(10):
    # Train using pytorch framework
    model.train()
    train_loss = 0
    for idx, input in tqdm(enumerate(training_loader, 0)):
        y = input["target_ids"].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous() # removed last token from decoder input
        lm_labels = y[:, 1:].clone().detach() # start the target label from 2nd term as 1st term we take as input and predict the 2nd term
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100 # To avoid padding causing issues with loss cal
        ids = input["source_ids"].to(device, dtype=torch.long)
        mask = input["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"epoch {epoch} trainign loss = {train_loss/len(training_loader)}")
    """
    Function to evaluate model for predictions
    """
    model.eval()
    val_loss = 0
    for idx, input in enumerate(val_loader, 0):
        y = input["target_ids"].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous() # removed last token from decoder input
        lm_labels = y[:, 1:].clone().detach() # start the target label from 2nd term as 1st term we take as input and predict the 2nd term
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100 # To avoid padding causing issues with loss cal
        ids = input["source_ids"].to(device, dtype=torch.long)
        mask = input["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]
        val_loss+=loss.item()
    print(f"epoch {epoch} trainign loss = {train_loss/len(val_loader)}")


In [54]:
test_df = pd.read_csv("test.csv", nrows=50)

In [66]:
test_df.head()

Unnamed: 0,ID,Description
0,KUNAGLOBALREQ91464EXTERNALHUHU,"With a keen interest in the development field,..."
1,BOMOGLOBALR220007643EXTERNALFRCA,.Delivers exceptional service to BMO customers...
2,KSNEUS85616,The preferred candidate will be able to provid...
3,PRHEUSR1050648ENUSEXTERNAL,The Care Management Representative functions a...
4,KSNEUS34436,As a Warehouse Associate you will be operate d...


In [68]:
source = tokenizer.batch_encode_plus(
            test_df[["Description"]],
            max_length=250,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

In [86]:
def validate(tokenizer, model, device, loader):
  """
  Function to evaluate model for predictions
  """
  model.eval()
  predictions = []
  actuals = []
  with torch.no_grad():
      for _, input in enumerate(loader, 0):
        print("hello")
        ids = input['input_ids'].to(device, dtype = torch.long)
        mask = input['attention_mask'].to(device, dtype = torch.long)

        generated_ids = model.generate(
            input_ids = ids,
            attention_mask = mask,
            max_length=150,
            num_beams=1,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True
            )
        print("hello")
        preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

        predictions.extend(preds)
  return predictions

In [85]:
training_loader = DataLoader(source, batch_size= 16, shuffle= True)

In [87]:
predictions, actuals = validate(tokenizer, model, device, training_loader)
final_df = pd.DataFrame({"ID": test_df["ID"],"Generated Summary": predictions, "actual Summary": actuals})


KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [None]:
## Evaluation

In [None]:
def lcs(predict, target, m, n):

    if m == 0 or n == 0:
        return 0
    elif predict[m-1] == target[n-1]:
        return 1 + lcs(predict, target, m-1, n-1)
    else:
        return max(lcs(predict, target, m, n-1), lcs(predict, target, m-1, n))


In [None]:
score = []
for idx, row in final_df.iterrows():
  score.append(lcs(row["Generated Summary"], row["actual Summary"], len(row["Generated Summary"]), len(row["actual Summary"])))

In [None]:
final_df["score"] = score

In [None]:
final_df.to_csv("predictions.csv", index=False)

In [67]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop