In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
!pip install transformers 
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch.nn as nn 
import torch.nn.functional as F
from torch.optim import AdamW
!pip install contractions
import contractions
import os
import pyarrow.parquet as pq
import re
import time
import gc
from tqdm.notebook import tqdm
from itertools import filterfalse
from tqdm import trange
from transformers import AutoTokenizer, BertForPreTraining,BertForMaskedLM 
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import random

In [None]:
def sliding_window(row, chunk_size=509, overlap=50):
    """Split the ``post`` text of one row into overlapping word chunks.

    Words are extracted with the regex ``\\b\\w+\\b``, so punctuation (including
    apostrophes) is dropped at this stage.
    NOTE(review): because apostrophes are removed here, a later contraction
    expansion step will not see forms such as "don't" - confirm this is intended.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'post'`` entry.
            A non-string value (``None``/``NaN``) is treated as empty text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than ``chunk_size`` so the window always advances.

    Returns:
        pd.DataFrame with a single ``'post'`` column, one chunk per row
        (no rows when the text contains no words).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    text = row['post']
    words = re.findall(r'\b\w+\b', text) if isinstance(text, str) else []

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = min(start + chunk_size, len(words))
        chunks.append(' '.join(words[start:end]))
        if end == len(words):
            # The text is exhausted; any further window would only repeat
            # words already contained in this chunk.
            break
    return pd.DataFrame({'post': chunks})
def expand_contractions(sentence):
  """Apply ``contractions.fix`` to every whitespace-separated word.

  The processed words are re-joined with single spaces, so runs of
  whitespace in ``sentence`` are collapsed.
  """
  fixed_words = map(contractions.fix, sentence.split())
  return ' '.join(fixed_words)
def lower_case(sentence):
  """Lower-case every word of ``sentence`` and join the words with single spaces."""
  pieces = sentence.split()
  return ' '.join(map(str.lower, pieces))
def remove_punctuation(sentence):
  """Strip every character that is neither a word character nor whitespace.

  Each whitespace-separated word is cleaned with the regex ``[^\\w\\s]``
  (underscores count as word characters and are kept). Words that consist
  only of punctuation become empty and are dropped, so the result never
  contains doubled, leading or trailing spaces.

  Args:
    sentence: Input text.

  Returns:
    The cleaned words joined by single spaces.
  """
  stripped_words = (re.sub(r'[^\w\s]', '', word) for word in sentence.split())
  # Skip words that were pure punctuation; joining them would leave stray spaces.
  return ' '.join(word for word in stripped_words if word)
def preprocess(lst, process=True, min_words=20):
  """Filter out short sentences and optionally normalise the rest, in place.

  Args:
    lst: List of sentence strings. It is modified in place.
    process: When truthy, each remaining sentence is passed through
      ``expand_contractions``, ``remove_punctuation`` and ``lower_case``
      (in that order).
    min_words: Sentences with this many whitespace-separated words or
      fewer are removed. The count is taken before any normalisation.

  Returns:
    The same list object that was passed in.
  """
  # Slice assignment keeps the caller's list object while replacing its contents.
  lst[:] = [sent for sent in lst if len(sent.split()) > min_words]
  if process:
    lst[:] = [
      lower_case(remove_punctuation(expand_contractions(sent)))
      for sent in lst
    ]
  return lst
def set_seed(seed):
  """Seed Python's ``random``, NumPy and PyTorch (plus all CUDA devices if present)."""
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Seed immediately so everything executed after this cell starts from a known state.
set_seed(42)

In [None]:
# Load every parquet shard of the textbook corpus and cut each description
# into overlapping word chunks with sliding_window().
folder_path = '/kaggle/input/textbooks'

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded shuffling later on) reproducible between runs.
parquet_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))
if not parquet_files:
    raise FileNotFoundError(f"No .parquet files found in {folder_path}")

dataframes = [
    pq.read_table(os.path.join(folder_path, file)).to_pandas()
    for file in parquet_files
]

# Merge the shards and expose the text under the column name the helpers expect.
df_self_sup = pd.concat(dataframes, ignore_index=True).rename(columns={"description": "post"})

# One small frame of chunks per source row, stacked into a single frame.
df_self_sup = pd.concat([sliding_window(row) for _, row in df_self_sup.iterrows()], ignore_index=True)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
set_seed(42)
start_time = time.time()

# Keep only chunks with more than 50 words and normalise them.
train_sentences = preprocess(list(df_self_sup['post']), min_words = 50)
train_df = pd.DataFrame({'post': train_sentences})

# Free the large intermediates; note this cell needs the data-loading cell
# to be re-run before it can be executed a second time.
del train_sentences, df_self_sup
gc.collect()

# SECURITY: never hard-code credentials in a notebook. The token is read from
# the environment instead (None when unset); any token that was previously
# committed in this file should be revoked. It is not needed for the public
# "t5-small" checkpoint loaded below.
access_token = os.environ.get("HF_TOKEN")

# Load the pre-trained T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
# model = nn.DataParallel(model).to(device)

In [None]:
# TEST BLOCK FOR MASKING AND RECOVERY (uncomment and run only after the cell that creates `dataloader`)
# for batch in dataloader:
#     inputs = batch["input_ids"].squeeze(dim=1)
#     labels = batch["labels"].squeeze(dim=1)

#     # Extract values and convert tensors to lists for batch decoding
#     input_texts = tokenizer.batch_decode(inputs, skip_special_tokens=True)
#     label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     print("Input:")
#     print(input_texts)

#     print("Label:")
#     print(label_texts)
#     break


In [None]:
# extra_id_tokens = tokenizer.additional_special_tokens
# print(extra_id_tokens)

In [None]:
class MaskedLanguageModelingDataset(Dataset):
    """Dataset that turns each ``post`` into a span-corruption training pair.

    For every item a random subset of words is hidden. The model input is the
    text with each run of hidden words replaced by one sentinel token; the
    labels are the complement: each run of *kept* words is replaced by one
    sentinel and the hidden words are spelled out. Sentinels are numbered per
    run, starting from the first entry of ``special_tokens``, and the first
    word is never hidden, so the k-th sentinel of the input is immediately
    followed in the labels by the words it replaced.

    Args:
        dataframe: DataFrame with a ``'post'`` column of strings.
        tokenizer: Tokenizer exposing ``additional_special_tokens`` and
            callable with the keyword arguments used in ``_encode``.
        mask_probability: Probability of hiding each word (except the first).
        max_length: Length both tokenised sequences are padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, mask_probability=0.15, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.mask_probability = mask_probability
        self.max_length = max_length
        # NOTE(review): assumed to be the sentinels in ascending order
        # (<extra_id_0>, <extra_id_1>, ...) - confirm for the tokenizer version in use.
        self.special_tokens = self.tokenizer.additional_special_tokens
        if not self.special_tokens:
            raise ValueError("tokenizer provides no additional_special_tokens to use as sentinels")

    def _sample_masked_indices(self, num_words):
        """Return the set of word positions to hide.

        Position 0 is never chosen so that the labels always start with the
        sentinel that introduces the first hidden span.
        """
        return {
            pos for pos in range(1, num_words)
            if random.random() < self.mask_probability
        }

    def _replace_runs_with_sentinels(self, words, indices):
        """Join ``words``, collapsing each run of positions in ``indices`` into one sentinel."""
        pieces = []
        run_count = 0
        inside_run = False
        for pos, word in enumerate(words):
            if pos in indices:
                if not inside_run:
                    # Number sentinels per run (not per output token) so the
                    # same run index maps to the same sentinel everywhere.
                    pieces.append(self.special_tokens[run_count % len(self.special_tokens)])
                    run_count += 1
                inside_run = True
            else:
                pieces.append(word)
                inside_run = False
        return " ".join(pieces)

    def _encode(self, text):
        """Tokenise ``text`` to ``max_length`` ids (padded and truncated)."""
        # NOTE(review): truncation is applied to input and labels independently,
        # so for long texts the labels may describe spans cut from the input.
        return self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        ).input_ids

    def mask_tokens(self, text):
        """Return ``text`` with randomly chosen word runs replaced by sentinels."""
        words = text.split()
        return self._replace_runs_with_sentinels(words, self._sample_masked_indices(len(words)))

    def complement_tokens(self, text, masked_indices):
        """Return ``text`` with the words at ``masked_indices`` replaced by sentinels.

        ``masked_indices`` are positions in ``text.split()``; consecutive
        positions collapse into a single sentinel.
        """
        # A set makes the per-word membership test O(1).
        return self._replace_runs_with_sentinels(text.split(), set(masked_indices))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['post']
        words = text.split()

        # Decide once, in original word coordinates, which words are hidden so
        # that input and labels are built from the same partition of the text.
        hidden = self._sample_masked_indices(len(words))
        kept = set(range(len(words))) - hidden

        masked_text = self._replace_runs_with_sentinels(words, hidden)
        complement_text = self._replace_runs_with_sentinels(words, kept)

        return {
            "input_ids": self._encode(masked_text),
            "labels": self._encode(complement_text),
        }

In [None]:
# Wrap the cleaned chunks in the masking dataset and serve them as shuffled mini-batches of 8.
dataset = MaskedLanguageModelingDataset(dataframe=train_df, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

In [None]:
epochs = 5
learning_rate = 5e-5
# Gradients of this many mini-batches are summed before each optimizer step.
num_accumulation_steps = 32

# Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# from_pretrained() hands the model back in evaluation mode (dropout off);
# switch to training mode explicitly before fine-tuning.
model.train()

pad_token_id = tokenizer.pad_token_id
num_batches = len(dataloader)

# Training loop
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True, desc=f'Epoch {epoch}')
    # Start every epoch from clean gradients.
    optimizer.zero_grad()
    for step, batch in enumerate(loop, start=1):
        # Each dataset item has a leading singleton dimension; drop it after batching.
        inputs = batch["input_ids"].squeeze(dim=1).to(device)
        labels = batch["labels"].squeeze(dim=1).to(device)

        # Keep the encoder from attending to padding, and exclude padded label
        # positions from the loss (-100 is the ignore index of the loss).
        attention_mask = (inputs != pad_token_id).long()
        labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        # Scale so the accumulated gradient is the mean over the window.
        loss = outputs.loss / num_accumulation_steps
        loss.backward()

        # Step at the end of each accumulation window and also on the last
        # batch, so a trailing partial window is applied instead of leaking
        # into the next epoch or being discarded after the final one.
        if step % num_accumulation_steps == 0 or step == num_batches:
            optimizer.step()
            optimizer.zero_grad()
            # Report the unscaled loss of the most recent mini-batch.
            loop.set_postfix(loss=outputs.loss.item())

# Save the trained model
model.save_pretrained("unsupervised_t5_model")
tokenizer.save_pretrained("unsupervised_t5_model_tokenizer")