In [None]:
import re
import random
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

def _read_book(path):
    """Return the whole content of the UTF-8 encoded text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as book_file:
        return book_file.read()


# Raw text of the two novels; each one becomes a class of the classifier.
anna_karenina_text = _read_book('anna_karenina.txt')
jane_eyre_text = _read_book('jane_eyre.txt')

def split_text(text, max_length=100):
    """Group consecutive sentences of ``text`` into fragments.

    The text is split into sentences at a whitespace character that follows
    a word character and one of ``.``, ``?`` or ``!``. Sentences are then
    accumulated until the space-joined fragment is at least ``max_length``
    characters long; at that point the fragment is emitted and a new one is
    started. Despite its name, ``max_length`` is therefore the length a
    fragment must *reach*; fragments may be longer because sentences are
    never cut.

    Args:
        text: The text to split.
        max_length: Number of characters a fragment has to reach before it
            is emitted.

    Returns:
        A list of fragment strings in document order. Sentences left over at
        the end of the text are returned as a final, possibly shorter,
        fragment instead of being discarded.
    """
    sentences = re.split(r'(?<=\w[.?!])\s', text)  # split into sentences
    fragments = []
    fragment = []

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            # Empty or whitespace-only pieces (e.g. after the last sentence)
            # carry no text and would only add stray spaces when joined.
            continue
        fragment.append(sentence)
        joined = ' '.join(fragment)
        if len(joined) >= max_length:  # the fragment is long enough
            fragments.append(joined)
            fragment = []

    if fragment:
        # Flush the remainder so the end of the text is not silently lost.
        fragments.append(' '.join(fragment))
    return fragments

# One list of fragments per book; label 0 marks Anna Karenina fragments and
# label 1 marks Jane Eyre fragments.
anna_fragments = split_text(anna_karenina_text)
jane_fragments = split_text(jane_eyre_text)

anna_labels = [0] * len(anna_fragments)
jane_labels = [1] * len(jane_fragments)

texts = anna_fragments + jane_fragments
labels = anna_labels + jane_labels

# Shuffle the (text, label) pairs together. A dedicated, seeded generator
# makes the order reproducible without touching the global random state.
combined = list(zip(texts, labels))
random.Random(42).shuffle(combined)
texts, labels = zip(*combined)

# A fixed random_state yields the same split on every run; stratify keeps
# the proportion of the two books equal in the train and test parts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


In [None]:
# Tokenizer of the 'bert-base-uncased' checkpoint, used to encode all texts.
TOKENIZER_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def encode_data(texts, tokenizer, max_length=100):
    """Tokenize texts with padding and truncation, requesting PyTorch tensors.

    Args:
        texts: A single string or an iterable of strings. Iterables are
            materialised into a list before being handed to the tokenizer,
            so tuples, generators and similar containers are accepted; a
            single string is passed through unchanged.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, max_length=max_length, return_tensors="pt")``.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    # A str is iterable too; list() would break it into single characters.
    batch = texts if isinstance(texts, str) else list(texts)
    return tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Encode the training split first, then the test split, with the same tokenizer.
train_encodings, test_encodings = [
    encode_data(split_texts, tokenizer) for split_texts in (train_texts, test_texts)
]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class BookDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name (for example ``input_ids`` or
            ``attention_mask``) to a batch of values indexable by sample
            position.
        labels: Sequence of labels indexable by sample position; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor holding ``value``.

        Tensors are copied with ``clone().detach()``; passing a tensor to
        ``torch.tensor`` would also copy it but emits a UserWarning for
        every item fetched.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus its ``labels`` entry."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

BOOK_BATCH_SIZE = 8

# Training split: dataset plus a loader created with shuffle=True.
train_dataset = BookDataset(train_encodings, train_labels)
train_loader = DataLoader(train_dataset, batch_size=BOOK_BATCH_SIZE, shuffle=True)

# Test split: dataset plus a loader created without shuffling.
test_dataset = BookDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=BOOK_BATCH_SIZE)


In [None]:
# BERT with a sequence-classification head for the two book classes.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

from torch.optim import AdamW

# All model parameters are optimised with a single learning rate.
LEARNING_RATE = 5e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

def train(model, train_loader, optimizer):
    """Run one training pass over ``train_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        train_loader: Iterable of batches; each batch is a mapping holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors. It
            does not have to support ``len()``.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        The loss averaged over the batches seen, or ``0.0`` when
        ``train_loader`` yields no batch.
    """
    model.train()
    # Batches are moved to the device the model lives on, so the loop also
    # works after the model has been moved to an accelerator.
    device = next(model.parameters()).device
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1

        loss.backward()
        optimizer.step()

    # Batches are counted instead of calling len(): not every loader defines
    # a length, and an empty loader must not cause a division by zero.
    return total_loss / num_batches if num_batches else 0.0

def evaluate(model, test_loader):
    """Compute the classification accuracy of ``model`` on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``logits`` with one row per sample.
        test_loader: Iterable of batches; each batch is a mapping holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.

    Returns:
        The fraction of samples whose highest-scoring class equals the
        label, or ``0.0`` when ``test_loader`` yields no sample.
    """
    model.eval()
    # Batches are moved to the device the model lives on, so evaluation also
    # works after the model has been moved to an accelerator.
    device = next(model.parameters()).device
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)

            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    if total_predictions == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return correct_predictions / total_predictions

# Fine-tune for a fixed number of passes, reporting the mean loss of each one.
epochs = 3
for epoch in range(epochs):
    avg_loss = train(model=model, train_loader=train_loader, optimizer=optimizer)
    print(f"Epoch {epoch+1}, Loss: {avg_loss}")

# Accuracy of the fine-tuned model on the held-out fragments.
accuracy = evaluate(model=model, test_loader=test_loader)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Loss: 0.11661796902592511


Zadanie 2

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader

df = pd.read_csv("sample.csv")  # the dataset contains "comment_text" and "target"

# Binarise the target: 1 where the raw value is above 0.4, otherwise 0.
# assign() builds a new frame instead of overwriting the column in place.
df = df.assign(target=(df['target'] > 0.4).astype(int))

# Stratify on the binary target so that the positive class, which is rare
# in this data, keeps the same share in the train and test parts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['comment_text'], df['target'], test_size=0.2, random_state=42, stratify=df['target']
)


In [None]:
# Tokenizer of the 'bert-base-uncased' checkpoint, used to encode the comments.
COMMENT_TOKENIZER_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(COMMENT_TOKENIZER_CHECKPOINT)

def encode_data(texts, tokenizer, max_length=128):
    """Tokenize an iterable of texts with padding and truncation.

    Args:
        texts: Iterable of texts; it is converted to a list before being
            passed to the tokenizer.
        tokenizer: Callable invoked with the list of texts and the keyword
            options ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors``.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the list of texts.
    """
    text_batch = list(texts)
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text_batch, **tokenizer_options)

# Encode the training comments first, then the test comments, with the same tokenizer.
train_encodings, test_encodings = [
    encode_data(split_texts, tokenizer) for split_texts in (train_texts, test_texts)
]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class ToxicCommentsDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-comment labels.

    Args:
        encodings: Mapping from field name (for example ``input_ids`` or
            ``attention_mask``) to a batch of values indexable by sample
            position.
        labels: Object offering positional access through ``.iloc`` (such
            as a pandas Series); its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor holding ``value``.

        Tensors are copied with ``clone().detach()``; passing a tensor to
        ``torch.tensor`` would also copy it but emits a UserWarning for
        every item fetched.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus its ``labels`` entry."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        # .iloc selects by position, independent of the index values carried
        # by the labels object.
        item['labels'] = self._as_tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

COMMENT_BATCH_SIZE = 8

# Training split: dataset plus a loader created with shuffle=True.
train_dataset = ToxicCommentsDataset(train_encodings, train_labels)
train_loader = DataLoader(train_dataset, batch_size=COMMENT_BATCH_SIZE, shuffle=True)

# Test split: dataset plus a loader created without shuffling.
test_dataset = ToxicCommentsDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=COMMENT_BATCH_SIZE)


In [None]:
# Load BERT with a sequence-classification head for two classes.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

from torch.optim import AdamW

# All model parameters are optimised with a single learning rate.
COMMENT_LEARNING_RATE = 5e-5
optimizer = AdamW(model.parameters(), lr=COMMENT_LEARNING_RATE)

def train(model, train_loader, optimizer, epochs=3):
    """Train ``model`` for ``epochs`` passes and print the mean loss of each.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        train_loader: Re-iterable collection of batches; each batch is a
            mapping holding ``input_ids``, ``attention_mask`` and ``labels``
            tensors. It does not have to support ``len()``.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. One line ``Epoch i/epochs, Loss: value`` is printed per pass;
        the value is ``0.0`` for a pass that yields no batch.
    """
    model.train()
    # Batches are moved to the device the model lives on, so the loop also
    # works after the model has been moved to an accelerator.
    device = next(model.parameters()).device
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()

        # Batches are counted instead of calling len(): not every loader
        # defines a length, and an empty pass must not divide by zero.
        avg_loss = total_loss / num_batches if num_batches else 0.0
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

# Fine-tune the classifier with the default number of epochs.
train(model=model, train_loader=train_loader, optimizer=optimizer)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/3, Loss: 0.24563876730855555
Epoch 2/3, Loss: 0.22244788762414827
Epoch 3/3, Loss: 0.2510823638623115


In [None]:
def evaluate(model, test_loader):
    """Print a per-class classification report for ``model`` on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``logits`` with one row per sample.
        test_loader: Iterable of batches; each batch is a mapping holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.

    Returns:
        None. The report is printed with class 0 named "Neutral" and class 1
        named "Toxic".
    """
    model.eval()
    # Batches are moved to the device the model lives on, so evaluation also
    # works after the model has been moved to an accelerator.
    device = next(model.parameters()).device
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            predictions.extend(preds.tolist())
            true_labels.extend(labels.tolist())

    # Pin the label set so the two target names always line up with classes
    # 0 and 1, even when one class is absent from both the labels and the
    # predictions.
    print(classification_report(true_labels, predictions, labels=[0, 1], target_names=["Neutral", "Toxic"]))

# Report precision, recall and F1 on the held-out comments.
evaluate(model=model, test_loader=test_loader)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

     Neutral       0.93      1.00      0.96      1848
       Toxic       0.83      0.03      0.06       152

    accuracy                           0.93      2000
   macro avg       0.88      0.52      0.51      2000
weighted avg       0.92      0.93      0.89      2000



In [None]:
# Print every test comment together with the class predicted for it.
test_texts = list(test_texts)
model.eval()
# Batches are moved to the device the model lives on.
device = next(model.parameters()).device
# Position in test_texts of the first comment of the current batch. Tracking
# it from the actual batch sizes avoids hard-coding the loader's batch size.
# The mapping relies on test_loader yielding samples in test_texts order.
offset = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).tolist()

        for idx, pred in enumerate(preds):
            verdict = "Toxic" if pred == 1 else "Neutral"
            print(f"{verdict} Comment: {test_texts[offset + idx]}")
        offset += len(preds)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Neutral Comment: Ya, its almost like we need to do something besides lay off all the state workers. Entitlements cost almost all of that $3.7 billion. So we have 2 choices. reduce entitlements and spend my money on my family, or increase taxes and spend my money on someone else's.
Neutral Comment: Trump is under investigation for his Russian ties, and he just proved that he's a White Supremacist sympathizer, if he isn't one himself.
Neutral Comment: That argument makes no sense, WM. Society moves forward, those that choose not to shouldn't think that those that did have to pay for their defunct lifestyle.
Neutral Comment: Well then I certainly hope you are going to go to your local university the next time a men's rights group or conservative speaker is coming and the SJW's (or "peacocks" as Scott Adams calls them) are screaming and shouting and making threats to try and shut the event down. If you are at Dalhousie the young woman you mention is likely to be there with her pals trying 