<a href="https://colab.research.google.com/github/HuaiyuZhang/DeepLearning/blob/main/proto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [3]:
import torch
import torch.nn as nn
from transformers import AutoModel

class CustomTransformer(nn.Module):
    def __init__(self, transformer_model, num_classes=2):
        super(CustomTransformer, self).__init__()
        self.transformer = AutoModel.from_pretrained(transformer_model)
        self.event_embedding = nn.Embedding(num_embeddings=len("abcdef") + 1, embedding_dim=self.transformer.config.hidden_size)
        self.time_embedding = nn.Linear(1, self.transformer.config.hidden_size)
        self.classifier = nn.Linear(self.transformer.config.hidden_size, num_classes)

    def forward(self, event_sequences, time_sequences, attention_mask=None):
        event_embeds = self.event_embedding(event_sequences)
        time_embeds = self.time_embedding(time_sequences.unsqueeze(-1))
        combined_embeds = event_embeds + time_embeds

        transformer_outputs = self.transformer(inputs_embeds=combined_embeds, attention_mask=attention_mask)
        pooled_output = transformer_outputs.last_hidden_state[:, 0]
        logits = self.classifier(pooled_output)

        return logits

transformer_model = "bert-base-uncased"
model = CustomTransformer(transformer_model)


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import AdamW
from sklearn.model_selection import train_test_split
import numpy as np

import random
import numpy as np

def generate_fake_event_sequence(length):
    return ''.join(random.choices("abcdef", k=length))

def generate_fake_time_sequence(length):
    return [random.uniform(0, 1) for _ in range(length)]

def generate_fake_dataset(num_samples, sequence_length, true_ratio=0.001):
    event_sequences = [generate_fake_event_sequence(sequence_length) for _ in range(num_samples)]
    time_sequences = [generate_fake_time_sequence(sequence_length) for _ in range(num_samples)]
    labels = np.random.choice([0, 1], size=num_samples, p=[1-true_ratio, true_ratio])

    return event_sequences, time_sequences, labels

num_samples = 10000
sequence_length = 6

event_sequences, time_sequences, labels = generate_fake_dataset(num_samples, sequence_length)

# Encode the event sequences into integers
event_to_idx = {event: idx + 1 for idx, event in enumerate("abcdef")}
event_sequences_encoded = [[event_to_idx[event] for event in sequence] for sequence in event_sequences]

event_sequences_tensor = torch.tensor(event_sequences_encoded, dtype=torch.long)
time_sequences_tensor = torch.tensor(time_sequences, dtype=torch.float)
labels_tensor = torch.tensor(labels, dtype=torch.long)

class BinaryClassificationDataset(Dataset):
    def __init__(self, event_sequences, time_sequences, labels):
        self.event_sequences = event_sequences
        self.time_sequences = time_sequences
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.event_sequences[idx], self.time_sequences[idx], self.labels[idx]

dataset = BinaryClassificationDataset(event_sequences_tensor, time_sequences_tensor, labels_tensor)

# Split the data into training and validation sets
train_indices, val_indices = train_test_split(np.arange(len(labels)), test_size=0.2, stratify=labels, random_state=42)

train_sampler = RandomSampler(train_indices)
val_sampler = RandomSampler(val_indices)

train_dataloader = DataLoader(dataset, batch_size=32, sampler=train_sampler)
val_dataloader = DataLoader(dataset, batch_size=32, sampler=val_sampler)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the loss function, optimizer, and training parameters
num_epochs = 3
class_weights = torch.tensor([1000, 1], dtype=torch.float).to(device) # Adjust weights according to your class imbalance
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for batch in train_dataloader:
        event_batch, time_batch, labels_batch = tuple(t.to(device) for t in batch)

        optimizer.zero_grad()
        logits = model(event_batch, time_batch)
        loss = criterion(logits, labels_batch)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Validation
    model.eval()
    total_val_loss = 0

    for batch in val_dataloader:
        event_batch, time_batch, labels_batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(event_batch, time_batch)
            loss = criterion(logits, labels_batch)

        total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")




Epoch 1/3: Train Loss = 0.0047, Val Loss = 0.0000
Epoch 2/3: Train Loss = 0.0001, Val Loss = 0.0000
Epoch 3/3: Train Loss = 0.0000, Val Loss = 0.0000
