In [5]:
import pandas as pd
import random
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [None]:
# Load the test split. Each line holds two fields separated by the literal
# token '<::::>'.
file_path = "/kaggle/input/wikisplit-new-version/test.tsv"
data_test = pd.read_csv(
    file_path,
    sep='<::::>',
    header=None,
    names=['unsplit_sentence', 'split_sentence'],
    # Separators longer than one character are treated as regular expressions
    # and are only supported by the Python parser; selecting it explicitly
    # avoids the ParserWarning fallback from the default C engine.
    engine='python',
)

In [None]:
# Preview the first rows of the freshly loaded test split.
data_test.head()


In [None]:
# Load the tune (training) split. Each line holds two fields separated by the
# literal token '<::::>'.
file_path = "/kaggle/input/wikisplit-new-version/tune.tsv"
data_tune = pd.read_csv(
    file_path,
    sep='<::::>',
    header=None,
    names=['unsplit_sentence', 'split_sentence'],
    # Separators longer than one character are treated as regular expressions
    # and are only supported by the Python parser; selecting it explicitly
    # avoids the ParserWarning fallback from the default C engine.
    engine='python',
)

In [None]:
# Display the loaded tune split.
data_tune


In [None]:
# Load the validation split. Each line holds two fields separated by the
# literal token '<::::>'.
file_path = "/kaggle/input/wikisplit-new-version/validation.tsv"
data_valid = pd.read_csv(
    file_path,
    sep='<::::>',
    header=None,
    names=['unsplit_sentence', 'split_sentence'],
    # Separators longer than one character are treated as regular expressions
    # and are only supported by the Python parser; selecting it explicitly
    # avoids the ParserWarning fallback from the default C engine.
    engine='python',
)

In [None]:
# Display the loaded validation split.
data_valid

In [None]:
!pip install pyspellchecker

In [None]:
import random
from spellchecker import SpellChecker

In [None]:
# Lazily-filled cache of English stop words; populated on the first call to
# preprocess_text so the corpus is read once instead of once per sentence.
_ENGLISH_STOP_WORDS = None


def preprocess_text(text):
    """Normalise a sentence for the spelling-correction task.

    Steps: lowercase the text, replace every character that is neither
    alphabetic nor whitespace with a space, tokenize with NLTK's
    ``word_tokenize`` and drop English stop words.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The remaining tokens joined by single spaces. May be an empty string
        when every token is filtered out.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loading the stop-word list on every call is wasteful when this
        # function is applied row by row to a whole DataFrame column.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    text = text.lower()
    text = ''.join(char if char.isalpha() or char.isspace() else ' ' for char in text)
    words = word_tokenize(text)
    return ' '.join(word for word in words if word not in _ENGLISH_STOP_WORDS)

In [None]:
def remove_random_characters(word):
    """Simulate a deletion typo by dropping random characters from ``word``.

    Words longer than six characters lose two characters, shorter words lose
    one. Space characters are never selected for deletion.

    Words of length 0 or 1 are returned unchanged: deleting from them would
    either raise (nothing to sample) or yield an empty token. This matches
    the short-word guard used by the replacement and swap helpers.

    Args:
        word: The token to corrupt.

    Returns:
        The token with the selected characters removed.
    """
    if len(word) <= 1:
        return word

    num_deletions = 2 if len(word) > 6 else 1

    candidates = [i for i, char in enumerate(word) if char != ' ']
    # Never ask random.sample for more positions than are available.
    num_deletions = min(num_deletions, len(candidates))
    doomed = set(random.sample(candidates, num_deletions))

    return ''.join(char for i, char in enumerate(word) if i not in doomed)

In [None]:
def add_random_characters(word):
    """Simulate an insertion typo.

    Picks a random position in ``word`` (anywhere from before the first
    character to after the last one) and inserts one random lowercase ASCII
    letter there.

    Args:
        word: The token to corrupt.

    Returns:
        A copy of ``word`` that is exactly one character longer.
    """
    position = random.randint(0, len(word))
    letter = random.choice(string.ascii_lowercase)
    chars = list(word)
    chars.insert(position, letter)
    return ''.join(chars)

In [None]:
def replace_random_characters(word):
    """Simulate a substitution typo by replacing one or two characters.

    One or two non-space positions are chosen at random and each is replaced
    by a random lowercase ASCII letter that is guaranteed to differ from the
    character it replaces, so the returned word always contains a real error.

    Words of length 0 or 1 are returned unchanged.

    Args:
        word: The token to corrupt.

    Returns:
        A token of the same length with one or two characters substituted.
    """
    if len(word) <= 1:
        return word

    candidates = [i for i, char in enumerate(word) if char != ' ']
    # Never ask random.sample for more positions than are available.
    num_replacements = min(random.randint(1, 2), len(candidates))

    chars = list(word)
    for i in random.sample(candidates, num_replacements):
        # Exclude the current character: drawing it again would silently
        # leave the word uncorrupted.
        alternatives = [c for c in string.ascii_lowercase if c != chars[i]]
        chars[i] = random.choice(alternatives)
    return ''.join(chars)

In [None]:
def swap_adjacent_characters(word):
    """Simulate a transposition typo.

    Picks a random position and exchanges the character there with its right
    neighbour. Words with fewer than two characters are returned unchanged.

    Args:
        word: The token to corrupt.

    Returns:
        A token of the same length with one adjacent pair exchanged.
    """
    if len(word) <= 1:
        return word

    left = random.randint(0, len(word) - 2)
    chars = list(word)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return ''.join(chars)

In [None]:
def introduce_spelling_errors(text, corruption_rate=0.3):
    """Corrupt a fraction of the words in ``text`` with synthetic typos.

    The text is split on whitespace, ``int(len(words) * corruption_rate)``
    word positions (at least one) are drawn at random, and each selected word
    is passed through one randomly chosen corruption: deletion, insertion,
    adjacent swap or replacement.

    Positions, not word values, are sampled. A word that occurs several
    times is therefore only corrupted at the sampled occurrences, and the
    number of corrupted words matches the requested rate.

    Args:
        text: Whitespace-separated sentence.
        corruption_rate: Fraction of words to corrupt (default ``0.3``).

    Returns:
        The sentence with the selected words corrupted, joined by single
        spaces. Empty or whitespace-only input yields ``''``.
    """
    words = text.split()
    if not words:
        # Nothing to corrupt; sampling from an empty list would raise.
        return ''

    num_words_to_corrupt = max(1, int(len(words) * corruption_rate))
    num_words_to_corrupt = min(num_words_to_corrupt, len(words))
    positions_to_corrupt = set(random.sample(range(len(words)), num_words_to_corrupt))

    corruptions = {
        'deletion': remove_random_characters,
        'insertion': add_random_characters,
        'swapping': swap_adjacent_characters,
        'replacement': replace_random_characters,
    }
    error_types = list(corruptions)

    new_text = []
    for position, word in enumerate(words):
        if position in positions_to_corrupt:
            error_type = random.choice(error_types)
            new_text.append(corruptions[error_type](word))
        else:
            new_text.append(word)

    return ' '.join(new_text)

In [None]:
# Seed Python's RNG so the synthetic misspellings are reproducible when the
# notebook is re-run from a fresh kernel.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)


def _build_spelling_pairs(frame):
    """Return a new frame of (clean, misspelled) sentence pairs.

    ``unsplit_sentence`` is replaced by its preprocessed form,
    ``misspelled_unsplit_sentence`` is derived from it with
    ``introduce_spelling_errors``, and ``split_sentence`` is dropped.
    The input frame is not modified, and a missing ``split_sentence`` column
    is tolerated, so the cell can be re-run without raising.
    """
    cleaned = frame['unsplit_sentence'].apply(preprocess_text)
    return frame.assign(
        unsplit_sentence=cleaned,
        misspelled_unsplit_sentence=cleaned.apply(introduce_spelling_errors),
    ).drop(columns='split_sentence', errors='ignore')


# Disabled in the original notebook: replicate every frame three times before
# corruption (presumably to get several corrupted variants per sentence).
# data_tune = pd.concat([data_tune]*3, ignore_index=True)
# data_valid = pd.concat([data_valid]*3, ignore_index=True)
# data_test = pd.concat([data_test]*3, ignore_index=True)

data_tune = _build_spelling_pairs(data_tune)
data_valid = _build_spelling_pairs(data_valid)
data_test = _build_spelling_pairs(data_test)

In [None]:
# Inspect the tune frame after preprocessing and misspelling generation.
data_tune

In [None]:
# Rename the columns to the seq2seq convention used by the training cells:
# the corrupted sentence is the model input, the clean sentence is the target.
data_tune_tokenized, data_valid_tokenized, data_test_tokenized = (
    frame.rename(
        columns={
            "misspelled_unsplit_sentence": "input_text",
            "unsplit_sentence": "target_text",
        }
    )
    for frame in (data_tune, data_valid, data_test)
)

In [None]:
import torch
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq
from datasets import Dataset
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Upper bound (in tokens) for both the source and target sequences.
MAX_SEQ_LENGTH = 128


# Define tokenization function
def tokenize_function(example):
    """Tokenize a batch of (misspelled, clean) sentence pairs.

    The clean sentence is passed as ``text_target`` so the tokenizer returns
    it under ``labels``. Passing it as the second positional argument would
    instead encode a *text pair* into ``input_ids`` and produce no labels,
    leaving the model with nothing to compute a loss against.

    No padding is applied here; batches are padded dynamically by the data
    collator, which is far cheaper than padding every row to the model
    maximum.
    """
    return tokenizer(
        example["input_text"],
        text_target=example["target_text"],
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
    )


# Convert pandas DataFrames to datasets
train_dataset = Dataset.from_pandas(data_tune_tokenized)
valid_dataset = Dataset.from_pandas(data_valid_tokenized)
test_dataset = Dataset.from_pandas(data_test_tokenized)

# Tokenize datasets. The results get their own names so the pandas frames
# data_tune / data_valid / data_test stay available to later cells.
train_tokenized = train_dataset.map(tokenize_function, batched=True)
valid_tokenized = valid_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Pads inputs per batch and pads labels with -100 so padding positions are
# ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir="/kaggle/working/logs",
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    data_collator=data_collator,
)

trainer.train()

trainer.evaluate(test_tokenized)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu

# Number of rows drawn from each split for this (smaller) experiment.
SAMPLE_SIZE = 500
# Maximum number of tokens generated per sentence during evaluation.
GENERATION_MAX_LENGTH = 128


def _sample_pairs(frame):
    """Draw up to SAMPLE_SIZE rows from ``frame`` with a fixed seed."""
    return frame.sample(n=min(SAMPLE_SIZE, len(frame)), random_state=42)


# Step 1: Sub-sample the pandas frames of (input_text, target_text) pairs.
# The *_tokenized DataFrames are used because they are still pandas objects
# at this point in the notebook.
data_tune_sample = _sample_pairs(data_tune_tokenized)
data_valid_sample = _sample_pairs(data_valid_tokenized)
data_test_sample = _sample_pairs(data_test_tokenized)

# Step 2: Tokenize data and prepare for training
tokenizer = T5Tokenizer.from_pretrained("t5-small")

train_encodings = tokenizer(list(data_tune_sample['input_text']), return_tensors="pt", padding=True, truncation=True)
train_labels_encodings = tokenizer(list(data_tune_sample['target_text']), return_tensors="pt", padding=True, truncation=True)

val_encodings = tokenizer(list(data_valid_sample['input_text']), return_tensors="pt", padding=True, truncation=True)
val_labels_encodings = tokenizer(list(data_valid_sample['target_text']), return_tensors="pt", padding=True, truncation=True)


class SpellingCorrectionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded inputs with encoded target sentences.

    Args:
        encodings: Tokenizer output for the misspelled sentences; every value
            is indexed per example.
        labels_encodings: Tokenizer output for the clean sentences; only its
            ``"input_ids"`` entry is used.
    """

    def __init__(self, encodings, labels_encodings):
        self.encodings = encodings
        self.labels_encodings = labels_encodings

    def __getitem__(self, idx):
        # The encodings already hold tensors (return_tensors="pt"), so index
        # them directly instead of re-wrapping with torch.tensor(), which
        # copies and emits a warning.
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels_encodings["input_ids"][idx]
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


train_dataset = SpellingCorrectionDataset(train_encodings, train_labels_encodings)
val_dataset = SpellingCorrectionDataset(val_encodings, val_labels_encodings)


def collate_batch(features):
    """Stack examples into a batch for the model.

    The attention mask is forwarded so padded input positions are not
    attended to, and padded label positions are set to -100 so they are
    ignored by the loss.
    """
    labels = torch.stack([f["labels"] for f in features])
    labels[labels == tokenizer.pad_token_id] = -100
    return {
        "input_ids": torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels": labels,
    }


# Step 3: Fine-tune T5 model
training_args = Seq2SeqTrainingArguments(
    output_dir='/kaggle/working/results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir='/kaggle/working/logs',
    # Evaluate on generated token ids rather than raw logits so the outputs
    # can be decoded into text inside compute_metrics.
    predict_with_generate=True,
    generation_max_length=GENERATION_MAX_LENGTH,
)

model = T5ForConditionalGeneration.from_pretrained("t5-small")


def compute_metrics(pred):
    """Compute corpus-level BLEU between generated and reference sentences.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token
            ids) and ``label_ids``.

    Returns:
        ``{"bleu": float}`` computed on whitespace-tokenized text.
    """
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # Replace -100 in predictions and labels as we can't decode them.
    pred_ids[pred_ids == -100] = tokenizer.pad_token_id
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # corpus_bleu expects a list of reference lists per hypothesis.
    references = [[label.split()] for label in label_str]
    hypotheses = [prediction.split() for prediction in pred_str]

    return {"bleu": corpus_bleu(references, hypotheses)}


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=collate_batch,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Now, let's handle the test data

test_encodings = tokenizer(list(data_test_sample['input_text']), return_tensors="pt", padding=True, truncation=True)
test_labels_encodings = tokenizer(list(data_test_sample['target_text']), return_tensors="pt", padding=True, truncation=True)

test_dataset = SpellingCorrectionDataset(test_encodings, test_labels_encodings)

# Evaluate the fine-tuned model on the test dataset
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)