In [None]:
import os

repo_dir = "nlp-Text-De-toxification"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

# Clone the repository from GitHub
!git clone https://github.com/Goshmar/nlp-Text-De-toxification

nlp-Text-De-toxification already exists. Removing it...

Cloning into 'nlp-Text-De-toxification'...
remote: Enumerating objects: 118, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 118 (delta 7), reused 0 (delta 0), pack-reused 90[K
Receiving objects: 100% (118/118), 2.02 MiB | 11.24 MiB/s, done.
Resolving deltas: 100% (38/38), done.


In [None]:

! pip install -r nlp-Text-De-toxification/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m677.4/677.4 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.8/400.8 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m18.8 MB/s[0m et

In [None]:
import pandas as pd
import requests
import zipfile

# Define the paths
dataset_url = "https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip"
zip_file_path = "dataset.zip"
csv_file_path, tsv_file_path = "dataset.csv", "filtered.tsv"

# Download the ZIP file. The timeout keeps the cell from hanging forever on a
# stalled connection.
response = requests.get(dataset_url, timeout=60)
if response.status_code != 200:
    # Fail the cell with a real exception instead of calling exit(): inside a
    # notebook exit() tears down the kernel session and hides the HTTP status.
    raise RuntimeError(
        f"Attempt failed: HTTP {response.status_code} while downloading {dataset_url}"
    )

with open(zip_file_path, 'wb') as file:
    file.write(response.content)

try:
    # Unpack the archive into the working directory; it is expected to contain
    # the TSV named by `tsv_file_path`.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(".")

    # Re-save the tab-separated data as a regular CSV for later cells.
    dataset = pd.read_csv(tsv_file_path, delimiter='\t')
    dataset.to_csv(csv_file_path, index=False)
finally:
    # ZIP cleaning up: remove the temporary files even when extraction or
    # parsing failed, so a re-run starts from a clean state.
    for leftover_path in (zip_file_path, tsv_file_path):
        if os.path.exists(leftover_path):
            os.remove(leftover_path)

In [None]:
import torch
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5Tokenizer


# Load the pretrained T5 checkpoint together with its matching tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Hold out 10% of the rows for validation; the fixed seed keeps the split
# identical across runs.
train_data, val_data = train_test_split(
    dataset,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Tokenize and preprocess the data
def preprocess_data(data):
    """Turn a frame of sentence pairs into tokenized model inputs and targets.

    Each value of the ``reference`` column is prefixed with the task tag
    ``"detoxify: "`` and each value of the ``translation`` column is used as
    the target; both get an explicit ``" </s>"`` suffix. The texts are encoded
    with the notebook-level ``tokenizer`` as padded PyTorch tensors, truncated
    to at most 128 tokens.

    Args:
        data: Table-like object exposing ``reference`` and ``translation``
            columns of strings.

    Returns:
        A ``(inputs, targets)`` tuple holding the tokenizer output for the
        source texts and for the target texts respectively.
    """
    source_texts = ["detoxify: " + text + " </s>" for text in data['reference']]
    target_texts = [text + " </s>" for text in data['translation']]

    # Same encoding settings for both sides of the pair.
    encode_options = dict(return_tensors="pt", padding=True, truncation=True, max_length=128)
    encoded_sources = tokenizer(source_texts, **encode_options)
    encoded_targets = tokenizer(target_texts, **encode_options)

    return encoded_sources, encoded_targets

# Encode both splits with the same preprocessing: training first, then validation.
(train_inputs, train_targets), (val_inputs, val_targets) = (
    preprocess_data(split) for split in (train_data, val_data)
)

In [None]:
# Helper for `train`: slices the pre-tokenized tensors into mini-batches
def _iter_batches(inputs, targets, batch_size, device):
    """Yield ``(model_inputs, labels)`` mini-batches moved to ``device``.

    Padding positions of the targets (where their ``attention_mask`` is 0) are
    replaced by ``-100`` in the labels so that the loss skips them instead of
    rewarding the model for predicting padding. If the targets carry no
    ``attention_mask`` the label ids are passed through unchanged.
    """
    for start in range(0, len(inputs['input_ids']), batch_size):
        stop = start + batch_size
        model_inputs = {key: value[start:stop].to(device) for key, value in inputs.items()}

        labels = targets['input_ids'][start:stop].to(device)
        if 'attention_mask' in targets:
            padding = targets['attention_mask'][start:stop].to(device) == 0
            labels = labels.masked_fill(padding, -100)

        yield model_inputs, labels


# Define a function for training the model
def train(model, train_inputs, train_targets, val_inputs, val_targets, num_epochs=5, batch_size=32, learning_rate=1e-4):
    """Fine-tune ``model`` on pre-tokenized pairs and report validation loss.

    The model is moved to CUDA when available (CPU otherwise) and optimized
    with AdamW. Every epoch makes one sequential pass over the training
    batches, then evaluates on the validation batches without gradients and
    prints the mean of the per-batch validation losses.

    Args:
        model: Module that accepts the keys of ``train_inputs`` plus ``labels``
            as keyword arguments and returns an object with a ``.loss`` tensor.
        train_inputs: Mapping of tensors (must contain ``input_ids``) indexed
            by example along the first dimension.
        train_targets: Mapping with the target ``input_ids`` and, optionally,
            the matching ``attention_mask`` used to mask padding in the labels.
        val_inputs: Same layout as ``train_inputs``, for validation.
        val_targets: Same layout as ``train_targets``, for validation.
        num_epochs: Number of passes over the training data.
        batch_size: Number of examples per mini-batch.
        learning_rate: Learning rate handed to AdamW.

    Returns:
        None. Progress is reported through ``print``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        model.train()
        for input_batch, labels in _iter_batches(train_inputs, train_targets, batch_size, device):
            optimizer.zero_grad()
            loss = model(**input_batch, labels=labels).loss
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss, num_val_batches = 0.0, 0
        with torch.no_grad():
            for input_batch, labels in _iter_batches(val_inputs, val_targets, batch_size, device):
                val_loss += model(**input_batch, labels=labels).loss.item()
                num_val_batches += 1

        # Average over the batches actually processed: dividing by
        # len / batch_size is only right when the last batch is full, and it
        # divides by zero for an empty validation set.
        val_loss = val_loss / num_val_batches if num_val_batches else float("nan")
        print(f"Validation Loss: {val_loss:.4f}")

# Train the model
train(
    model,
    train_inputs=train_inputs,
    train_targets=train_targets,
    val_inputs=val_inputs,
    val_targets=val_targets,
    num_epochs=20,
)

Epoch 1/20
Validation Loss: 1.6122
Epoch 2/20
Validation Loss: 1.5038
Epoch 3/20
Validation Loss: 1.4839
Epoch 4/20
Validation Loss: 1.4191
Epoch 5/20
Validation Loss: 1.3536
Epoch 6/20
Validation Loss: 1.2999
Epoch 7/20
Validation Loss: 1.2604
Epoch 8/20
Validation Loss: 1.1958
Epoch 9/20
Validation Loss: 1.1483
Epoch 10/20
Validation Loss: 1.1075
Epoch 11/20
Validation Loss: 1.0806
Epoch 12/20
Validation Loss: 1.0617
Epoch 13/20
Validation Loss: 1.0501
Epoch 14/20
Validation Loss: 1.0404
Epoch 15/20
Validation Loss: 1.0361
Epoch 16/20
Validation Loss: 1.0318
Epoch 17/20
Validation Loss: 1.0294
Epoch 18/20
Validation Loss: 1.0297
Epoch 19/20
Validation Loss: 1.0313
Epoch 20/20
Validation Loss: 1.0296


In [None]:
# Detoxify a sample sentence
def detoxify_sentence(sentence):
    """Generate a detoxified paraphrase of ``sentence`` with the fine-tuned model.

    The sentence is wrapped in the same ``"detoxify: ... </s>"`` template that
    is used for the training inputs, encoded with the notebook-level
    ``tokenizer`` and decoded from the notebook-level ``model``.

    Args:
        sentence: The text to rewrite.

    Returns:
        The generated text with special tokens stripped.
    """
    input_text = "detoxify: " + sentence + " </s>"
    # truncation=True makes the 128-token cap explicit; max_length on its own
    # relies on an implicit fallback and emits a warning.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=128).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            encoded.input_ids,
            # Forward the mask explicitly so generate() does not have to guess it.
            attention_mask=encoded.attention_mask,
            max_length=128,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
        )

    detoxified_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return detoxified_text

# Load the cropped dataset shipped with the cloned repository. The path is built
# from `repo_dir` (where the first cell cloned it) rather than a hard-coded
# absolute `/content/...` location, so the cell also works outside Colab.
dataset = pd.read_csv(os.path.join(repo_dir, "data", "interim", "dataset_cropped.csv"))

# Show a few before/after pairs; the fixed seed keeps the demo reproducible.
for example in dataset['reference'].sample(3, random_state=42):
    print("------")
    print(example)
    print("-->", detoxify_sentence(example))
    print("------\n\n")

------
Great idea, yeah, Cyril. Let's give an M-16 to a bunch of wild Indians!
--> great idea, yeah, cyril. we're going to give a bunch of wild Indians a sixteenth.
------


------
Looks like she left in a hurry, or she's just a filthy pig.
--> looks like she left in a hurry, or she's just a a terrible mess.
------


------
You're stepping on it! - Shut up.
--> be out! you're stepping on it!
------


