In [None]:
import os
import shutil

def delete_folder_and_files(folder_path):
    """Delete ``folder_path`` and everything inside it, printing the outcome.

    Args:
        folder_path: Path of a directory, a regular file, or a symlink.

    Behaviour:
        * A path that does not exist is reported and left alone.
        * A regular file or a symlink is removed directly; a symlink's
          target is never touched.
        * A directory is emptied entry by entry (errors on individual
          entries are reported and skipped) and then removed if it ended
          up empty.

    Returns:
        None. All results are reported through ``print``.
    """
    # lexists (unlike exists) also reports dangling symlinks, so they get cleaned up too.
    if not os.path.lexists(folder_path):
        print(f"Путь {folder_path} не существует.")
        return

    # A file or symlink cannot be listed with os.listdir; remove it directly.
    if os.path.islink(folder_path) or not os.path.isdir(folder_path):
        try:
            os.remove(folder_path)
        except OSError as e:
            print(f'Ошибка при удалении {folder_path}. Причина: {e}')
        else:
            print(f"Файл {folder_path} успешно удалён.")
        return

    # Remove every entry inside the directory.
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.remove(file_path)  # files and symlinks
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)  # nested directories
        except Exception as e:
            # Best effort: report the failure and keep going with the other entries.
            print(f'Ошибка при удалении {file_path}. Причина: {e}')

    # Only remove the directory itself when nothing was left behind.
    if not os.listdir(folder_path):
        os.rmdir(folder_path)
        print(f"Папка {folder_path} успешно удалена.")
    else:
        print(f"Папка {folder_path} не пуста, удаление невозможно.")

# Example usage: clear artifacts left in the project directory by earlier runs.
folder_path = [
    '/home/jupyter/datasphere/project/model.ckpt',
    '/home/jupyter/datasphere/project/DeepSeek_1,3b',
    '/home/jupyter/datasphere/project/models',
    '/home/jupyter/datasphere/project/checkpoint_best_1',
    '/home/jupyter/datasphere/project/tb_logs',
    '/home/jupyter/datasphere/project/lightning_logs',
    '/home/jupyter/datasphere/project/datasetscache',
    '/home/jupyter/datasphere/project/modelcache',
]
for elem in folder_path:
    delete_folder_and_files(elem)

In [None]:
%pip install comet-ml
import comet_ml

In [None]:
%pip install -q transformers datasets

In [None]:
%pip install --upgrade awscli
%pip install --upgrade boto3
%pip install --upgrade git+https://github.com/dask/s3fs

In [None]:
from datasets import  DatasetDict,load_dataset
def _load_php_split(split):
    """Load one split expression of the PHP code-to-text dataset."""
    return load_dataset("code_x_glue_ct_code_to_text", "php", split=split)


# Full training split; the evaluation splits are truncated via slice syntax.
train_dataset = _load_php_split("train")
test_dataset = _load_php_split("test[:3000]")
validation_dataset = _load_php_split("validation[:2000]")

# Bundle the three splits so later cells can map/format them together.
dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset,
})

In [None]:
from transformers import AutoTokenizer
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base")
# Every encoded source snippet / target docstring is padded or truncated to
# exactly this many tokens by preprocess_examples.
max_input_length = 115
max_target_length = 115
# Instruction text prepended to every code snippet before tokenization.
# NOTE(review): "Summerize" looks like a typo for "Summarize"; changing it alters
# the model input, so only fix it together with a retrain.
prefix = 'Summerize this PHP code:'
def preprocess_examples(examples):
    """Tokenize a batch of code/docstring pairs into model inputs and labels.

    Args:
        examples: Batch mapping with a 'code' list and a 'docstring' list.

    Returns:
        The tokenizer output for the prefixed code, with an added "labels"
        entry holding the tokenized docstrings in which selected token ids
        are replaced by -100.

    NOTE(review): inputs (code) and labels (docstring) are tokenized as two
    independent sequences. A causal LM normally scores labels against the
    same token sequence it receives as input — confirm this pairing is intended.
    """
    # Token ids replaced by -100 in the labels.
    # Presumably padding/EOS plus some punctuation/newline ids of this vocabulary — TODO confirm.
    ignored_label_ids = {0, 32014, 185, 31, 13}

    snippets = examples['code']
    summaries = examples['docstring']

    # Encode the prompts: instruction prefix followed by the source code.
    prompts = [prefix + snippet for snippet in snippets]
    model_inputs = tokenizer(prompts, max_length=max_input_length, padding="max_length", truncation=True)

    # Encode the summaries and mask the ignored ids.
    target_ids = tokenizer(summaries, max_length=max_target_length, padding="max_length", truncation=True).input_ids
    model_inputs["labels"] = [
        [-100 if token_id in ignored_label_ids else token_id for token_id in sequence]
        for sequence in target_ids
    ]
    return model_inputs

In [None]:
# Tokenize every split in batches; the result replaces `dataset`, so the
# untokenized splits are only reachable through train/validation/test_dataset.
dataset = dataset.map(preprocess_examples, batched=True)

In [None]:
from torch.utils.data import DataLoader
# Expose only the columns the model consumes, as torch tensors.
dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

# Shuffle the training data each epoch so consecutive batches are not taken in
# the dataset's stored order; evaluation loaders keep a fixed order so their
# metrics stay comparable between runs.
train_dataloader = DataLoader(dataset['train'], batch_size=1, shuffle=True)
valid_dataloader = DataLoader(dataset['validation'], batch_size=1)
test_dataloader = DataLoader(dataset['test'], batch_size=2)

In [None]:
%pip install sacrebleu

In [None]:
from transformers import AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup
import sacrebleu
from sklearn.metrics import accuracy_score
import torch
import nltk
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from nltk.translate.bleu_score import SmoothingFunction

In [None]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base")

In [None]:
import time
import torch
import nltk
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.optim import AdamW
from torch.optim import SGD
from torch.optim.lr_scheduler import CyclicLR
from torch.optim.lr_scheduler import OneCycleLR
from nltk.translate.bleu_score import corpus_bleu,SmoothingFunction
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pytorch_lightning as pl
from torch.nn.utils import weight_norm

class DeepSeek(pl.LightningModule):
    """LightningModule that fine-tunes the notebook's causal language model.

    The wrapped network and tokenizer are taken from the notebook-level
    ``model`` and ``tokenizer`` variables when the module is constructed, and
    the dataloader hooks return the notebook-level dataloaders.

    Args:
        momentum: Stored as a hyperparameter; not read by this class.
        lr: Learning rate passed to AdamW.
        weight_decay: Weight decay passed to AdamW.
        num_train_epochs: Stored as a hyperparameter; not read by this class.
        warmup_steps: Stored as a hyperparameter; not read by this class.
        max_lr: Stored as a hyperparameter; not read by this class.
    """

    # Label value marking positions that are excluded from the loss and from BLEU.
    IGNORE_INDEX = -100

    def __init__(self, momentum = 0.88, lr=8e-5, weight_decay = 1e-1, num_train_epochs=50, warmup_steps=1000,max_lr=2e-4):
        super().__init__()
        self.tokenizer = tokenizer
        # The notebook later rebinds the global `model` to a DeepSeek instance.
        # Unwrap it in that case so a second DeepSeek() does not nest a wrapper
        # inside a wrapper (which would also change the state_dict key names).
        self.model = model.model if isinstance(model, DeepSeek) else model
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output object unchanged."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def common_step(self, batch, batch_idx):
        """Forward one batch and return ``(loss, logits)``."""
        outputs = self(**batch)
        loss = outputs.loss
        return loss, outputs.logits

    def _timed_step(self, stage, batch, batch_idx, **log_kwargs):
        """Shared body of the train/validation/test steps.

        Runs the batch, then logs ``<stage>_loss``, ``<stage>_bleu`` and
        ``<stage>_time_per_batch`` (milliseconds). ``log_kwargs`` are forwarded
        to the loss and BLEU ``self.log`` calls.
        """
        # CUDA events only give meaningful timings on a CUDA device; fall back
        # to a wall-clock timer so the step also works on CPU.
        on_cuda = self.device.type == "cuda"
        if on_cuda:
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record()
        else:
            started_at = time.perf_counter()

        loss, logits = self.common_step(batch, batch_idx)
        preds = torch.argmax(logits, dim=-1)
        bleu = self.calculate_bleu(batch["labels"], preds)

        if on_cuda:
            end_event.record()
            # Wait for the recorded events before reading the elapsed time.
            torch.cuda.synchronize()
            elapsed_ms = start_event.elapsed_time(end_event)
        else:
            elapsed_ms = (time.perf_counter() - started_at) * 1000.0

        self.log(f"{stage}_loss", loss, **log_kwargs)
        self.log(f"{stage}_bleu", bleu, **log_kwargs)
        # Each stage logs under its own key instead of all sharing "testing_time_per_batch".
        self.log(f"{stage}_time_per_batch", elapsed_ms)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute the training loss; logs training_loss/bleu/time_per_batch."""
        return self._timed_step("training", batch, batch_idx, on_epoch=True)

    def prediction_step(self,batch,batch_idx):
        """Print decoded greedy predictions, the labels and the batch BLEU.

        NOTE(review): Lightning's prediction hook is named ``predict_step``;
        this method only runs when called explicitly.
        """
        loss, logits = self.common_step(batch, batch_idx)
        labels = batch["labels"]
        preds = torch.argmax(logits, dim=-1)
        print(self.ids_to_text(preds) ,labels)
        print(self.calculate_bleu(labels, preds))

    def ids_to_text(self, token_ids):
        """Decode a batch of token-id sequences into a list of strings."""
        return self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss; logs validation_loss/bleu/time_per_batch."""
        return self._timed_step("validation", batch, batch_idx, on_epoch=True)

    def test_step(self, batch, batch_idx):
        """Compute the test loss; logs testing_loss/bleu/time_per_batch."""
        return self._timed_step("testing", batch, batch_idx)

    def bleu_score_metric(self, epoch=None, step=None):
        """Return the latest logged ``validation_bleu`` (0.0 if none yet).

        ``epoch`` and ``step`` are accepted for call compatibility and ignored.
        """
        return self.trainer.callback_metrics.get('validation_bleu', 0.0)

    def calculate_bleu(self, targets, predictions):
        """Average smoothed BLEU over a batch of token-id sequences.

        Args:
            targets: Label ids per example, with IGNORE_INDEX at masked positions.
            predictions: Arg-max token ids per example, same length as the labels.

        Returns:
            Mean BLEU over the examples that have at least one unmasked label,
            or 0.0 when there is none.
        """
        smoother = SmoothingFunction()
        bleu_scores = []
        for target, prediction in zip(targets, predictions):
            # A causal LM's output at position t predicts the token at t + 1,
            # so align prediction[:-1] with target[1:] before comparing.
            shifted_target = target[1:]
            shifted_prediction = prediction[:-1]
            # Masked label positions are not real reference tokens; drop them
            # from both sequences.
            keep = shifted_target != self.IGNORE_INDEX
            reference = shifted_target[keep].cpu().tolist()
            hypothesis = shifted_prediction[keep].cpu().tolist()
            if not reference:
                continue
            bleu_scores.append(
                corpus_bleu([[reference]], [hypothesis], smoothing_function=smoother.method5)
            )
        if not bleu_scores:
            return 0.0
        return sum(bleu_scores)/len(bleu_scores)

    def configure_optimizers(self):
        """Create the AdamW optimizer and an epoch-level cosine-annealing schedule."""
        optimizer = AdamW(
            self.model.parameters(),
            lr=self.hparams.lr,
            betas = (0.91,0.9999),
            weight_decay=self.hparams.weight_decay,
        )
        # `verbose=True` is omitted: the argument is deprecated in recent
        # PyTorch releases, and the learning rate can be read from the scheduler.
        lr_scheduler = CosineAnnealingLR(optimizer, T_max=5, eta_min = 1e-5, last_epoch=-1)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": "epoch",
                "frequency": 1,
                "monitor": "validation_bleu"
            }
        }

    # The dataloader hooks return the notebook-level loaders of the same name.
    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return valid_dataloader

    def test_dataloader(self):
        return test_dataloader

In [None]:
# Wrap the pretrained network in the DeepSeek LightningModule.
# NOTE(review): this rebinds `model`, the same global that DeepSeek.__init__
# reads as the network to wrap, so any later DeepSeek() call (or a re-run of
# this cell) sees this wrapper instead — consider a separate name.
model = DeepSeek()

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
import torch
from pytorch_lightning.callbacks.stochastic_weight_avg import StochasticWeightAveraging
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CometLogger
# Stop training once validation BLEU has failed to improve for `patience` checks.
early_stop = EarlyStopping(
    monitor='validation_bleu',
    patience=2,
    mode='max',
)

# Keeps the single checkpoint with the highest validation BLEU under models/best.
# NOTE(review): this callback is not passed to the Trainer below, and the Trainer
# has enable_checkpointing=False, so no "best" checkpoint is written during
# fit() — confirm whether that is intentional before wiring it in.
checkpoint_callback = ModelCheckpoint(
    monitor='validation_bleu',
    mode='max',
    save_top_k=1,
    filename='best',
    dirpath='models',
    verbose=True,
)

# The Comet API key is read from the environment instead of being stored in the
# notebook. When COMET_API_KEY is unset, None is passed and Comet's own
# configuration lookup applies. The key that used to be hard-coded here should
# be treated as leaked and revoked.
logger = CometLogger(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="DeepSeek_1,3b",
)

lr_monitor = LearningRateMonitor(logging_interval='step')
trainer = Trainer(
    gradient_clip_algorithm="norm",
    gradient_clip_val=1,
    log_every_n_steps=150,
    min_epochs=10,
    enable_checkpointing=False,
    accumulate_grad_batches=16,
    callbacks=[early_stop, lr_monitor],
    logger=logger,
)
trainer.fit(model)

Epoch 1:  56%|█████▋    | 136216/241241 [5:22:30<4:08:39,  7.04it/s, v_num=2828]

In [None]:
# Run a validation pass with the trainer; the module's validation_step logs
# validation_loss and validation_bleu.
trainer.validate(model)

In [None]:
# Run a test pass with the trainer; the module's test_step logs testing_loss
# and testing_bleu.
trainer.test(model)

In [None]:
# Switch the module to evaluation mode before the manual forward passes below.
model.eval()

In [None]:
# For the first example of every split, collect the decoded arg-max prediction
# and the decoded reference text.
to_test = ['train', 'test', 'validation']
pred = []
truth = []
for elem in to_test:
    test_final_dataloader = DataLoader(dataset[elem], batch_size=1)
    example = next(iter(test_final_dataloader))
    # The batch must live on the same device as the model's weights.
    example = {key: value.to(model.device) for key, value in example.items()}

    # Forward pass without gradient tracking; calling the module (rather than
    # .forward directly) keeps module hooks active.
    with torch.no_grad():
        output = model(**example)

    # Arg-max token id at every position.
    pred_ids = torch.argmax(output.logits, dim=-1)
    ground_truth = example['labels']

    # Drop ids 0 and 32014 per token before decoding. The original per-row check
    # compared a whole list with an int and therefore never filtered anything.
    # (32014 is presumably the tokenizer's padding/end-of-sequence id — TODO confirm.)
    pred_text = [
        tokenizer.decode(
            [token_id for token_id in row.tolist() if token_id not in (0, 32014)],
            skip_special_tokens=True,
        )
        for row in pred_ids
    ]

    # Keep only real label ids: -100 marks masked positions. A plain list is
    # passed to decode so an empty result does not turn into a float tensor.
    ground_truth_list = [
        value
        for value in ground_truth.reshape(-1).tolist()
        if value not in (32014, -100, 0)
    ]
    decoded_text = tokenizer.decode(ground_truth_list)

    pred.append(''.join(pred_text))
    truth.append(decoded_text)

In [None]:
# Show each sampled prediction (newlines stripped, with its raw length)
# next to the reference text.
for i in range(3):
    single_line_prediction = pred[i].replace('\n', '')
    print('Generated:', single_line_prediction, len(pred[i]))
    print('Correct: ', truth[i])
    print('\n')


In [None]:
# Save the trainer's current state to model.ckpt (relative to the working directory).
trainer.save_checkpoint("model.ckpt")

In [None]:
# Write the sampled predictions and their references to a text file.
# The encoding is explicit because decoded model text can contain characters
# that a platform-default encoding cannot represent.
with open('output.txt', 'w', encoding='utf-8') as file:
    # Iterate over whatever was collected instead of assuming exactly three entries.
    for prediction, correct_text in zip(pred, truth):
        # The printed text has newlines stripped; the length is that of the raw prediction.
        generated_text = prediction.replace('\n', '')
        generated_length = len(prediction)
        output_string = f'Generated: {generated_text}, Length: {generated_length}\nCorrect: {correct_text}\n\n'

        file.write(output_string)

In [None]:
checkpoint_path = '/home/jupyter/datasphere/project/model.ckpt'

# Create a module instance.
# NOTE(review): in the visible notebook cells the checkpoint weights are loaded
# into `model`, not into `model_best` — confirm which one is meant to be used.
model_best = DeepSeek()

# Read the checkpoint onto the CPU. Without map_location, tensors are restored
# to the device they were saved from, which fails on a machine without that
# device and needlessly occupies GPU memory; load_state_dict copies the values
# into the module's parameters wherever they live.
checkpoint = torch.load(checkpoint_path, map_location='cpu')

In [None]:
# Copy the checkpoint's weights into the existing `model` object.
# NOTE(review): `model_best`, created in the previous cell, is not the target here — confirm.
model.load_state_dict(checkpoint['state_dict'])

In [None]:
# Evaluation mode for the forward passes that follow the weight reload.
model.eval()

In [None]:
# Repeat the sampling with the reloaded weights: for the first example of every
# split, collect the decoded arg-max prediction and the decoded reference text.
to_test = ['train', 'test', 'validation']
pred = []
truth = []
for elem in to_test:
    test_final_dataloader = DataLoader(dataset[elem], batch_size=1)
    example = next(iter(test_final_dataloader))
    # The batch must live on the same device as the model's weights.
    example = {key: value.to(model.device) for key, value in example.items()}

    # Forward pass without gradient tracking; calling the module (rather than
    # .forward directly) keeps module hooks active.
    with torch.no_grad():
        output = model(**example)

    # Arg-max token id at every position.
    pred_ids = torch.argmax(output.logits, dim=-1)
    ground_truth = example['labels']

    # Drop ids 0 and 32014 per token before decoding. The original per-row check
    # compared a whole list with an int and therefore never filtered anything.
    # (32014 is presumably the tokenizer's padding/end-of-sequence id — TODO confirm.)
    pred_text = [
        tokenizer.decode(
            [token_id for token_id in row.tolist() if token_id not in (0, 32014)],
            skip_special_tokens=True,
        )
        for row in pred_ids
    ]

    # Keep only real label ids: -100 marks masked positions. A plain list is
    # passed to decode so an empty result does not turn into a float tensor.
    ground_truth_list = [
        value
        for value in ground_truth.reshape(-1).tolist()
        if value not in (32014, -100, 0)
    ]
    decoded_text = tokenizer.decode(ground_truth_list)

    pred.append(''.join(pred_text))
    truth.append(decoded_text)

In [None]:
# Show each sampled prediction (newlines stripped, with its raw length)
# next to the reference text.
for i in range(3):
    single_line_prediction = pred[i].replace('\n', '')
    print('Generated:', single_line_prediction, len(pred[i]))
    print('Correct: ', truth[i])
    print('\n')


In [None]:
# Write the predictions sampled after the weight reload, with their references.
# The encoding is explicit because decoded model text can contain characters
# that a platform-default encoding cannot represent.
with open('output_best.txt', 'w', encoding='utf-8') as file:
    # Iterate over whatever was collected instead of assuming exactly three entries.
    for prediction, correct_text in zip(pred, truth):
        # The printed text has newlines stripped; the length is that of the raw prediction.
        generated_text = prediction.replace('\n', '')
        generated_length = len(prediction)
        output_string = f'Generated: {generated_text}, Length: {generated_length}\nCorrect: {correct_text}\n\n'

        file.write(output_string)

In [None]:
# Load the first 2600 training examples for the length statistics below.
info_dataset = load_dataset("code_x_glue_ct_code_to_text", "php", split="train[:2600]")

In [None]:
# Length of every code entry as returned by len() (characters, not tokens,
# assuming the 'code' column holds strings — TODO confirm).
length = [len(i) for i in info_dataset['code']]

In [None]:
# Index of the first entry with the maximum length.
length.index(max(length))

In [None]:
# Display one code sample by position.
# NOTE(review): 2086 is presumably the index printed by the previous cell; it
# goes stale if the slice changes — consider length.index(max(length)) instead.
info_dataset['code'][2086]

In [None]:
import numpy as np

# Work on an ascending copy of the lengths.
sorted_length = sorted(length)

# Size and extremes.
num_elements = len(sorted_length)
min_value = np.min(sorted_length)
max_value = np.max(sorted_length)
range_value = max_value - min_value

# Central tendency and spread.
median = np.median(sorted_length)
mean = np.mean(sorted_length)
std_dev = np.std(sorted_length)

# Quartiles (25th, 50th and 75th percentiles).
Q1, Q2, Q3 = (np.percentile(sorted_length, quantile) for quantile in (25, 50, 75))

# Report the statistics in a fixed order, one per line.
summary_rows = [
    ("Медианное значение:", median),
    ("Минимальное значение:", min_value),
    ("Максимальное значение:", max_value),
    ("Среднее значение:", mean),
    ("Стандартное отклонение:", std_dev),
    ("Диапазон значений:", range_value),
    ("Количество элементов в массиве:", num_elements),
    ("Первый квартиль (Q1):", Q1),
    ("Второй квартиль (Q2):", Q2),
    ("Третий квартиль (Q3):", Q3),
]
for label, value in summary_rows:
    print(label, value)