<a href="https://colab.research.google.com/github/Lisavetti/DaTaAnalys_1/blob/main/lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization, MultiHeadAttention, LayerNormalization, Dense, Embedding, Dropout
from tensorflow.keras.models import Model
import re
import string
import random

In [15]:
text_file = 'ukr.txt'

In [16]:
# Load and prepare the data
with open("ukr.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

text_pairs = []
for line in lines:
    eng, ukr, _ = line.strip().split("\t")
    ukr = "[start] " + ukr + " [end]"
    text_pairs.append((eng, ukr))

In [17]:
for _ in range(5):
    print(random.choice(text_pairs))

('Who was Tom afraid of?', '[start] Кого боявся Том? [end]')
('What a hot day!', '[start] Який спекотний день! [end]')
('Why are you so good at cooking?', '[start] Чому ти так добре готуєш? [end]')
("I'm still not sure I can help you.", '[start] Я й досі не певен, що можу вам допомогти. [end]')
("That's Tom's watch.", '[start] Це годинник Тома. [end]')


In [18]:

random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]


print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

158705 total pairs
111095 training pairs
23805 validation pairs
23805 test pairs


In [19]:

# Vectorization
vocab_size = 15000
sequence_length = 20

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length)
ukr_vectorization = TextVectorization(max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length + 1, standardize=custom_standardization)

train_eng_texts = [pair[0] for pair in train_pairs]
train_ukr_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
ukr_vectorization.adapt(train_ukr_texts)

In [20]:
# Transformer Encoder Layer
class TransformerEncoderLayer(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.dense_proj(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [21]:
# Transformer Decoder Layer
class TransformerDecoderLayer(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.attention1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, inputs, enc_output, training):
        attn_output1 = self.attention1(inputs, inputs)
        attn_output1 = self.dropout1(attn_output1, training=training)
        out1 = self.layernorm1(inputs + attn_output1)
        attn_output2 = self.attention2(out1, enc_output)
        attn_output2 = self.dropout2(attn_output2, training=training)
        out2 = self.layernorm2(out1 + attn_output2)
        ffn_output = self.dense_proj(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)


In [23]:
# Build the Transformer model
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoderLayer(embed_dim, num_heads, latent_dim)(x)

decoder_inputs = layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = layers.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(decoder_inputs)
x = TransformerDecoderLayer(embed_dim, num_heads, latent_dim)(x, encoder_outputs)
decoder_outputs = Dense(vocab_size, activation="softmax")(x)

transformer = Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")

# Compile the model
transformer.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Prepare datasets
def format_dataset(eng, ukr):
    eng = eng_vectorization(eng)
    ukr = ukr_vectorization(ukr)
    return ({"encoder_inputs": eng, "decoder_inputs": ukr[:, :-1]}, ukr[:, 1:])

# Reduce the number of epochs
epochs = 3

# Increase batch size if your hardware can handle it
batch_size = 128

def make_dataset(pairs, batch_size):
    eng_texts, ukr_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(ukr_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)

train_ds = make_dataset(train_pairs, batch_size)
val_ds = make_dataset(val_pairs, batch_size)

transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7ef163d9cfa0>

In [27]:


# Assuming ukr_vectorization is for Ukrainian and eng_vectorization is for English
ukr_vocab = ukr_vectorization.get_vocabulary()
ukr_index_lookup = dict(zip(range(len(ukr_vocab)), ukr_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ukr_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ukr_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence.replace("[start]", "").strip()

# Test the model with a few examples
test_eng_texts = [pair[0] for pair in test_pairs]  # Assuming test_pairs is a list of English-Ukrainian pairs
for _ in range(5):  # Testing with 5 random sentences
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print(f"EN: {input_sentence}\nUKR: {translated}\n")

# Example translation
print("Example translation of 'hi':")
print(decode_sequence('hi'))


EN: Is this book yours?
UKR: Так твоя книжка [end]

EN: I'm surprised at your behavior.
UKR: Боюся ваш улюблений напій [end]

EN: Let him do it.
UKR: Дайно ним [end]

EN: Learning a foreign language is a waste of time.
UKR: Час вивчати мови [end]

EN: Nobody saw anything.
UKR: Ніхто нічого бачив [end]

Example translation of 'hi':
Привіт Томе [end]


In [1]:
pip install transformers



In [4]:
pip install sentencepiece


Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [6]:
from transformers import pipeline

# Список моделей
uk_models = [
    "csebuetnlp/mT5_multilingual_XLSum", # mT5 для багатомовної сумаризації, включаючи українську
    "Geotrend/bert-base-uk-cased",        # BERT специфічно для української мови
    "IlyaGusev/rut5_base_sum_gazeta_uk",  # T5 для сумаризації українських новин
    "sberbank-ai/ruRoberta-large"         # RoBERTa для російської, але може бути застосована до української
]

# Створення пайплайнів
pipelines = {}
for model in uk_models:
    try:
        if "sum" in model:
            pipelines[model] = pipeline("summarization", model=model)
        else:
            pipelines[model] = pipeline("fill-mask", model=model)
    except Exception as e:
        print(f"Не вдалося створити пайплайн для моделі {model}: {e}")

# Приклад використання пайплайну для сумаризації
text_for_summarization = "Новий рік принесе щастя і мир"
try:
    summarized_text = pipelines["csebuetnlp/mT5_multilingual_XLSum"](text_for_summarization)
    print("Сумаризований текст:", summarized_text)
except Exception as e:
    print(f"Помилка при сумаризації тексту: {e}")

# Приклад використання пайплайну для заповнення пропущених слів
text_for_fill_mask = "Вітаю з [MASK] роком"
try:
    filled_text = pipelines["Geotrend/bert-base-uk-cased"](text_for_fill_mask)
    print("Текст із заповненими пропусками:", filled_text)
except Exception as e:
    print(f"Помилка при заповненні пропущених слів: {e}")



Не вдалося створити пайплайн для моделі csebuetnlp/mT5_multilingual_XLSum: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
Не вдалося створити пайплайн для моделі IlyaGusev/rut5_base_sum_gazeta_uk: IlyaGusev/rut5_base_sum_gazeta_uk is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
Помилка при сумаризації тексту: 'csebuetnlp/mT5_multilingual_XLSum'
Текст із заповненими пропусками: [{'score': 0.26641497015953064, 'token': 10709, 'token_str': 'новим', 'sequence': 'Вітаю з новим роком'}, {'score': 0.2534734308719635, 't

In [1]:
pip install torch torchvision




In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.utils import save_image

# Hyperparameters
batch_size = 128
image_size = 28
nz = 100  # Size of z latent vector (i.e., size of generator input)
ngf = 64  # Size of feature maps in generator
ndf = 64  # Size of feature maps in discriminator
num_epochs = 5
lr = 0.0002
beta1 = 0.5

# Check for device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image preprocessing
transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Load the dataset
dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the Generator
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            # Input is Z, going into a convolution
            nn.ConvTranspose2d(nz, ngf * 4, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # State size. (ngf*4) x 4 x 4
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 3, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # State size. (ngf*2) x 7 x 7
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # State size. (ngf) x 14 x 14
            nn.ConvTranspose2d(ngf, 1, 4, 2, 1, bias=False),
            nn.Tanh()
            # State size. (1) x 28 x 28
        )

    def forward(self, input):
        return self.main(input)

# Define the Discriminator
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            # Input size. (1) x 28 x 28
            nn.Conv2d(1, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # State size. (ndf) x 14 x 14
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # State size. (ndf*2) x 7 x 7
            nn.Conv2d(ndf * 2, 1, 7, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input).view(-1, 1).squeeze(1)

# Create the generator and discriminator
netG = Generator().to(device)
netD = Discriminator().to(device)

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors
fixed_noise = torch.randn(64, nz, 1, 1, device=device)

# Establish convention for real and fake labels
real_label = 1
fake_label = 0

# Setup Adam optimizers
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))
# Training Loop
for epoch in range(num_epochs):
    for i, data in enumerate(dataloader, 0):
        # Train with all-real batch
        netD.zero_grad()
        real_cpu = data[0].to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
        output = netD(real_cpu).view(-1)
        errD_real = criterion(output, label)
        errD_real.backward()
        D_x = output.mean().item()

        # Train with all-fake batch
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        fake = netG(noise)
        label.fill_(fake_label).type(torch.float)
        output = netD(fake.detach()).view(-1)
        errD_fake = criterion(output, label)
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        errD = errD_real + errD_fake
        optimizerD.step()

        # Update G network
        netG.zero_grad()
        label.fill_(real_label).type(torch.float)
        output = netD(fake).view(-1)
        errG = criterion(output, label)
        errG.backward()
        D_G_z2 = output.mean().item()
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(dataloader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Check how the generator is doing by saving G's output on fixed_noise
        if (epoch % 1 == 0) or ((epoch == num_epochs-1) and (i == len(dataloader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            save_image(fake, f'output_epoch_{epoch}.png', nrow=8, normalize=True)

print("Training Finished.")



[0/5][0/469]	Loss_D: 1.3723	Loss_G: 1.1063	D(x): 0.4579	D(G(z)): 0.4342 / 0.3396
[0/5][50/469]	Loss_D: 0.1411	Loss_G: 3.4175	D(x): 0.9439	D(G(z)): 0.0759 / 0.0343
[0/5][100/469]	Loss_D: 0.7024	Loss_G: 2.1098	D(x): 0.7765	D(G(z)): 0.3369 / 0.1382
[0/5][150/469]	Loss_D: 0.9629	Loss_G: 1.3009	D(x): 0.6121	D(G(z)): 0.3478 / 0.2830
[0/5][200/469]	Loss_D: 0.8009	Loss_G: 1.3481	D(x): 0.6668	D(G(z)): 0.3006 / 0.2751
[0/5][250/469]	Loss_D: 0.6211	Loss_G: 2.3467	D(x): 0.8150	D(G(z)): 0.3283 / 0.1081
[0/5][300/469]	Loss_D: 0.6651	Loss_G: 1.8665	D(x): 0.7698	D(G(z)): 0.3114 / 0.1701
[0/5][350/469]	Loss_D: 0.5416	Loss_G: 1.8864	D(x): 0.7617	D(G(z)): 0.2232 / 0.1634
[0/5][400/469]	Loss_D: 0.7310	Loss_G: 2.0892	D(x): 0.8399	D(G(z)): 0.4160 / 0.1352
[0/5][450/469]	Loss_D: 0.6749	Loss_G: 1.5568	D(x): 0.6722	D(G(z)): 0.2224 / 0.2258
[1/5][0/469]	Loss_D: 0.8306	Loss_G: 0.9983	D(x): 0.5840	D(G(z)): 0.2170 / 0.3858
[1/5][50/469]	Loss_D: 0.7757	Loss_G: 1.1618	D(x): 0.6755	D(G(z)): 0.2942 / 0.3344
[1/5][100/