In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import os
# TODO: change this to the path to your homework folder
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'style_transfer'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

Mounted at /content/drive
['sample_data.tsv', 'checkpoints', 'model_saved', '__pycache__', 'data', 'output', 'network_unsupervised.py', 'process_dataset.py', 'supervised_test.py', 'process_nonparallel_dataset.py', 'supervised_train.py', 'network.py', 'Untitled0.ipynb', 'supervised_output_mid24.txt', 'output_test', 'detoxification_metrics.ipynb']


In [2]:
# Load the autoreload extension
%load_ext autoreload
%autoreload 2

import sys
sys.path.append(GOOGLE_DRIVE_PATH)

In [3]:
from process_dataset import *
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

import nltk
nltk.download("punkt")


def load_nonparallel_data(filename):
    train = pd.read_csv(filename)
    print(f"{train.info()}")
    # print(f"{train['comment_text'].isnull().sum()}")
    # for i in range(5):
    #    print(f"{train['comment_text'][i]}")
    text = train["comment_text"].tolist()
    # get all columns other than id and comment_text, i.e. all
    all_labels = train[train.columns.difference(["id", "comment_text"])]
    merged_labels = all_labels.astype(bool).any(axis=1).astype(int).tolist()
    print(f"{text[0]=}, {merged_labels[0]=}")
    print(f"{text[6]=}, {merged_labels[6]=}")
    return text, merged_labels


def print_longer_than(tokenized_data, length):
    num_longer = sum([len(lst) > length for lst in tokenized_data])
    print(f"There are {num_longer} strings of length > {length}")


def generate_nonparallel_dataset(filename, device, discard_threshold=-1):
    """
    discard_threshold is the max number of token a string can contain. Any string
    longer than this will be discarded to reduce the memory requirement.
    """
    texts, labels = load_nonparallel_data(filename)
    print(f"{len(texts)=}, {len(labels)=}")
    tokenized_texts = tokenize(texts)

    for i in range(4):
        print_longer_than(tokenized_texts, 10**i)
        print_longer_than(tokenized_texts, 3 * 10**i)

    # discard all strings that are too long.
    if discard_threshold != -1:
        tokenized_texts, labels = zip(
            *[
                (x, y)
                for x, y in zip(tokenized_texts, labels)
                if len(x) <= discard_threshold
            ]
        )

    w2id, id2w, vocab = get_word_mapping(tokenized_texts)

    toxic_texts = []
    nontoxic_texts = []

    for x, y in zip(tokenized_texts, labels):
        if y == 1:
            toxic_texts.append(x)
        else:
            nontoxic_texts.append(x)

    toxic_text_tensors = sentences_to_tensor(toxic_texts, w2id, vocab, device)
    nontoxic_text_tensors = sentences_to_tensor(nontoxic_texts, w2id, vocab, device)

    print(f"{toxic_text_tensors.size()=}, {nontoxic_text_tensors.size()=}")

    toxic_dataset = TensorDataset(toxic_text_tensors)
    nontoxic_dataset = TensorDataset(nontoxic_text_tensors)
    return toxic_dataset, nontoxic_dataset, w2id, id2w, vocab


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
!pip install detoxify

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting detoxify
  Downloading detoxify-0.5.1-py3-none-any.whl (12 kB)
Collecting transformers==4.22.1
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece>=0.1.94
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (2

In [20]:
from network import *
from supervised_train import *
for _ in range(1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_checkpoint = False
    filename = "/content/drive/MyDrive/style_transfer/data/paradetox/paradetox.tsv"
    dataset, w2id, id2w, vocab = generate_dataset(filename, device)
    train_loader, val_loader, test_loader = split_dataset(dataset)
    vocab_size = len(w2id)
    print(vocab_size)

len(texts)=16, len(labels)=16
sentences[0]='he had steel balls too !'
sentences[0]='he was brave too!'
text_tensors.size()=torch.Size([16, 21]), label_tensors.size()=torch.Size([16, 19])
63


In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define the hyperparameters
# There are problems related to EMBEDDING_DIM and HIDDEN_DIM and their related models below
EMBEDDING_DIM = 19
HIDDEN_DIM = 19
NUM_LAYERS = 2
BATCH_SIZE = 4
SEQ_LEN = 21
OUTPUT_LEN = 19
LEARNING_RATE = 0.01
EPOCHS = 200
device = "cuda:0"
# Define the model architecture
class Autoencoder(nn.Module):
    def __init__(self, vocab_size):
        super(Autoencoder, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, device = device)
        self.encoder = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, 
                               num_layers=NUM_LAYERS, batch_first=True, device = device)
        self.attention = nn.Linear(HIDDEN_DIM, HIDDEN_DIM, device = device)
        self.decoder = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=EMBEDDING_DIM,
                               num_layers=NUM_LAYERS, batch_first=True, device = device)
        self.output = nn.Linear(EMBEDDING_DIM, vocab_size, device= device)

    def forward(self, x):
        embedded = self.embedding(x)
        encoder_output, (hidden, cell) = self.encoder(embedded)
        energy = self.attention(encoder_output)
        attention_scores = torch.softmax(energy, dim=1)
        context_vector = torch.bmm(attention_scores.permute(0, 2, 1), encoder_output)
        decoder_output, _ = self.decoder(context_vector, (hidden, cell))
        output = self.output(decoder_output)
        return output
    
# Instantiate the model and the optimizer
model = Autoencoder(vocab_size=vocab_size)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train the model
for epoch in range(EPOCHS):
    running_loss = 0
    for i, [inputs, targets] in enumerate(train_loader):
        print(inputs)
        print(targets)
        optimizer.zero_grad()
        outputs = model(inputs)
        print(outputs.shape)
        loss = nn.CrossEntropyLoss()(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    print(f"Epoch {epoch + 1}, Loss: {running_loss / (i + 1)}")



[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
tensor([[ 5,  6,  7,  8,  9, 62,  1, 62, 10, 11, 12, 62,  6, 62, 62, 12,  4,  0,
          0],
        [62, 56, 18, 57,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0],
        [56, 18, 57,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0]], device='cuda:0')
torch.Size([3, 19, 63])
Epoch 96, Loss: 0.7839555939038595
tensor([[21, 62, 40, 46, 47, 34, 62, 12,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0],
        [48,  9, 59, 27, 49, 50, 33, 49, 50, 51, 52, 53, 12,  4,  0,  0,  0,  0,
          0,  0,  0],
        [18, 39, 40, 62, 41, 28, 42, 43, 28, 62, 62, 62, 44, 45, 12,  4,  0,  0,
          0,  0,  0],
        [48,  9, 59, 27, 49, 50, 33, 49, 50, 51, 52, 53, 12,  4,  0,  0,  0,  0,
          0,  0,  0]], device='cuda:0')
tensor([[13, 62, 40, 46, 47, 34,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0],
        [48,  9, 62, 27, 49, 50, 33, 49, 50, 51, 52, 53,  4,  0,  0,  0,

In [46]:
from supervised_train import *
predict(model, train_loader, device, id2w, w2id)

finished prediction
