In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List

import seaborn

In [4]:
# Fetch the Punkt models that nltk's sent_tokenize relies on.
nltk.download('punkt')
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# This dataset is built by a loader script hosted on the Hub. `datasets` warns that
# trust_remote_code=True will become mandatory for such datasets, so pass it
# explicitly; the revision stays pinned so the executed script does not change.
dataset = load_dataset('IlyaGusev/gazeta', revision='v1.0', trust_remote_code=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52400 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5770 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5265 [00:00<?, ? examples/s]

In [5]:
# Split every validation article into sentences, lower-case them and keep only the
# sentences that are shorter than 128 characters.
sent = []
for article in tqdm(dataset['validation']['text']):
  sent.extend(
      x.lower() for x in sent_tokenize(article, language='russian') if len(x) < 128
  )

# Count how often each character occurs across the kept sentences.
# `tqdm` here is tqdm.auto.tqdm; the deprecated tqdm.tqdm_notebook alias is avoided.
chars = Counter()
for sentence in tqdm(sent):
  chars.update(sentence)

# Vocabulary = the four special tokens plus every character seen more than 500 times;
# rarer characters are mapped to <unk> later on.
vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
for char, cnt in chars.items():
  if cnt > 500:
    vocab.add(char)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(dataset['validation']['text']):


  0%|          | 0/5265 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(sent):


  0%|          | 0/122237 [00:00<?, ?it/s]

In [None]:
# Iterating a set of strings has no stable order between interpreter runs, so the
# vocabulary is sorted first: the same vocabulary then always yields the same ids,
# which keeps indices meaningful across kernel restarts.
char2ind = {char: i for i, char in enumerate(sorted(vocab))}
# Inverse lookup used to turn predicted ids back into characters.
ind2char = {i: char for char, i in char2ind.items()}

In [17]:
# Total number of characters counted across all kept sentences.
sum(chars.values())

9777898

In [None]:
class CharDataset:
    """Indexable collection of sentences encoded as character ids.

    Each item is the sentence at that index wrapped as
    ``[<bos>, id(c_1), ..., id(c_n), <eos>]``; characters that are missing from
    the module-level ``char2ind`` table are encoded with the ``<unk>`` id.
    """

    def __init__(self, sentences):
        self.data = sentences
        # Look the special-token ids up once, from the global char2ind table.
        self.unk_id, self.bos_id, self.eos_id, self.pad_id = (
            char2ind[token] for token in ('<unk>', '<bos>', '<eos>', '<pad>')
        )

    def __getitem__(self, idx: int) -> List[int]:
        """Return the id sequence (with <bos>/<eos>) for sentence ``idx``."""
        text = self.data[idx]
        body = [char2ind.get(symbol, self.unk_id) for symbol in text]
        return [self.bos_id, *body, self.eos_id]

    def __len__(self) -> int:
        """Number of sentences held by the dataset."""
        return len(self.data)

def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=char2ind['<pad>']) -> dict:
    """Pad a batch of id sequences to equal length and build LM inputs/targets.

    Args:
        input_batch: token-id sequences of possibly different lengths.
        pad_id: id used to right-pad shorter sequences (defaults to the
            ``<pad>`` id taken from ``char2ind`` when the function is defined).

    Returns:
        A dict with two LongTensors on the global ``device``:
        ``'input_ids'`` is every padded sequence without its last position and
        ``'target_ids'`` is the same sequence shifted left by one position.
    """
    max_seq_len = max(len(sequence) for sequence in input_batch)

    # Build fresh padded lists rather than appending to the caller's sequences:
    # the objects handed in by the caller must not grow as a side effect.
    padded_batch = [
        list(sequence) + [pad_id] * (max_seq_len - len(sequence))
        for sequence in input_batch
    ]

    sequences = torch.LongTensor(padded_batch).to(device)

    return {
        'input_ids': sequences[:, :-1],
        'target_ids': sequences[:, 1:],
    }

# Hold out 20% of the sentences for evaluation. A fixed random_state makes the split
# (and therefore the reported perplexities) reproducible across runs.
train_sentences, eval_sentences = train_test_split(sent, test_size=0.2, random_state=42)

train_dataset = CharDataset(train_sentences)
eval_dataset = CharDataset(eval_sentences)

# Reshuffle the training sentences every epoch so batches are not composed of the
# same sentences in the same order each time.
train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=256, shuffle=True)

# Evaluation order does not matter, so the eval loader is left unshuffled.
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=256)

In [None]:
class CharLM(nn.Module):
    """Character-level language model: embedding -> GRU -> MLP head -> vocab logits."""

    def __init__(self, hidden_dim: int, vocab_size: int):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_dim)
        self.rnn = nn.GRU(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.projection = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_batch) -> torch.Tensor:
        """Return one vector of vocabulary logits for every input position."""
        token_vectors = self.embedding(input_batch)
        # Only the per-step GRU outputs are used; the final hidden state is dropped.
        rnn_states, _last_hidden = self.rnn(token_vectors)
        hidden = self.linear(self.non_lin(rnn_states))
        hidden = self.dropout(hidden)
        return self.projection(self.non_lin(hidden))

import torch.optim as optim

# Character LM with a 256-wide hidden state, placed on the selected device.
model = CharLM(vocab_size=len(vocab), hidden_dim=256)
model = model.to(device)
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=char2ind['<pad>'])
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

In [None]:
def evaluate(model, criterion) -> float:
    """Compute the perplexity of ``model`` over the global ``eval_dataloader``.

    Perplexity is ``exp`` of the mean per-token negative log-likelihood, so the
    batch losses are combined weighted by their number of scored tokens and
    exponentiated once at the end. Averaging ``exp(loss)`` per batch instead
    would overweight small batches and inflate the result.

    Args:
        model: module mapping ``input_ids`` to per-position vocabulary logits.
        criterion: loss returning the *mean* loss over the scored targets. If
            it exposes ``ignore_index``, targets equal to it are not counted.

    Returns:
        The corpus-level perplexity, or ``nan`` when there is nothing to score.
        The model is left in eval mode.
    """
    import math

    model.eval()
    ignore_index = getattr(criterion, 'ignore_index', None)
    total_nll = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            targets = batch['target_ids'].flatten()
            if ignore_index is None:
                n_tokens = targets.numel()
            else:
                n_tokens = int((targets != ignore_index).sum().item())
            if n_tokens == 0:
                # A batch made only of ignored targets has no defined mean loss.
                continue

            logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
            # Undo the per-batch mean so batches are weighted by their token count.
            total_nll += criterion(logits, targets).item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        return float('nan')

    return math.exp(total_nll / total_tokens)

In [None]:
# Train for a fixed number of epochs, recording one mean training loss and one
# evaluation perplexity per epoch.
num_epoch = 10
losses = []
perplexities = []

for epoch in range(num_epoch):
    epoch_losses = []
    # evaluate() switches the model to eval mode, so re-enable training mode
    # (dropout active) at the start of every epoch.
    model.train()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        # Merge the first two logit dimensions so the logits line up with the
        # flattened targets passed to the criterion.
        logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
        loss = criterion(
            logits, batch['target_ids'].flatten())
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.item())

    # Unweighted mean of the batch losses of this epoch.
    losses.append(sum(epoch_losses) / len(epoch_losses))
    perplexities.append(evaluate(model, criterion))

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

In [None]:
def generate_sequence(model, starting_seq: str, max_seq_len: int = 128) -> str:
    """Greedily continue ``starting_seq`` one character at a time.

    The prompt is encoded with ``char2ind`` (unknown characters become
    ``<unk>``) and prefixed with ``<bos>``. At every step the most likely next
    character is appended, until ``<eos>`` is produced or ``max_seq_len`` new
    characters have been generated.

    Generation runs on whatever device the model's parameters already live on.
    The model is not moved, so calling this function does not relocate the
    caller's model and break a later training step on the GPU.

    Args:
        model: language model mapping a 1-D id tensor to per-position logits.
        starting_seq: text to continue.
        max_seq_len: maximum number of generated characters.

    Returns:
        The decoded sequence, including the literal special-token strings
        (e.g. ``<bos>`` / ``<eos>``) for the special ids it contains.
        The model is left in eval mode.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    unk_id = char2ind['<unk>']
    eos_id = char2ind['<eos>']
    input_ids = [char2ind['<bos>']] + [char2ind.get(char, unk_id) for char in starting_seq]
    input_ids = torch.LongTensor(input_ids).to(device)

    model.eval()
    with torch.no_grad():
        for _ in range(max_seq_len):
            # Logits of the last position give the distribution of the next character.
            next_char = model(input_ids)[-1].argmax()
            input_ids = torch.cat([input_ids, next_char.unsqueeze(0)])

            if next_char.item() == eos_id:
                break

    return ''.join(ind2char[idx.item()] for idx in input_ids)

In [None]:
# Continue a short prompt with the trained character model to inspect its output.
generate_sequence(model, starting_seq='по системе ')

'<bos>по системе произошедшего с настоящее время на себя в состоянии в состоянии в состоянии в состоянии.<eos>'

Модель недообучена и зацикливается на повторении N-грамм. Пробовал поиграть с beam search, но не получилось, поэтому решил взять предобученную модель.

In [1]:
!pip install langchain peft datasets

Collecting langchain
  Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.0-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.9/307.9 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitte

In [2]:
pip install zstandard jsonlines

Collecting zstandard
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: zstandard, jsonlines
Successfully installed jsonlines-4.0.0 zstandard-0.22.0


In [None]:
import os

# Terminate the kernel process immediately with exit status 0.
# NOTE(review): presumably a deliberate runtime restart so the packages installed
# by the cells above are picked up — confirm.
os._exit(0)

In [3]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer,AutoModel
import torch.nn.functional as F
from langchain.prompts import PromptTemplate

# Hub ids of the LoRA adapter and of the model the tokenizer is taken from.
adapt_model_name = 'IlyaGusev/saiga_mistral_7b_lora'
base_model_name = 'Open-Orca/Mistral-7B-OpenOrca'

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# The empty-string key assigns the whole model to device 0.
device_map = {'': 0}
model = AutoPeftModelForCausalLM.from_pretrained(
    adapt_model_name,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/623 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_model.safetensors:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

In [4]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer,AutoModel
import torch.nn.functional as F
from langchain.prompts import PromptTemplate

def print_number_of_trainable_model_parameters(model):
    """Summarise how many of ``model``'s parameters are trainable.

    Despite its name the function prints nothing: it returns a three-line
    report (trainable count, total count, trainable percentage) as a string.

    Args:
        model: any module exposing ``parameters()``.

    Returns:
        The formatted report. A model without parameters reports ``0.00%``
        instead of raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for param in model.parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Show the trainable / total parameter counts of the loaded model.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 0
all model parameters: 7255379968
percentage of trainable model parameters: 0.00%


In [5]:
# Sentence encoder used to embed questions; tokenizer and model come from the same id.
sent_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
sent_tokenizer = AutoTokenizer.from_pretrained(sent_model_name)
sent_model = AutoModel.from_pretrained(sent_model_name)

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Необязательная часть, если нужно подготовить датасет к дообучению

In [None]:
from datasets import Dataset
import pandas as pd

def generate_promt(data_point):
  """Render one training example as a system / user / bot conversation.

  Every turn uses the ``<s>{role}\\n{content}</s>`` layout, the same message
  template the ``Conversation`` class in this notebook uses at inference time.
  The prompt is assembled from explicit pieces so that the indentation of the
  source code does not leak into it as stray leading spaces.

  Args:
    data_point: mapping with the user request under ``'text'`` and the
      expected reply under ``'ans'``.

  Returns:
    The full prompt string.
  """
  return (
      "<s>system\nотвечай на запросы пользователей</s>"
      f"<s>user\n{data_point['text']}</s>"
      f"<s>bot\n{data_point['ans']}</s>"
  )

def generate_tokenize(data_point, add_eos_token = True, max_length = 3500):
  """Tokenize one example for causal-LM fine-tuning.

  The example is rendered with ``generate_promt`` and tokenized with the
  global ``tokenizer`` (truncated to ``max_length``, no padding). If there is
  room left and the sequence does not already end with EOS, an EOS id is
  appended. Labels are a copy of the input ids.

  Args:
    data_point: mapping with ``'text'`` and ``'ans'`` entries.
    add_eos_token: whether to append the EOS id when it is missing.
    max_length: truncation limit in tokens; also the limit that decides
      whether an EOS id still fits (previously hard-coded as 3500 twice).

  Returns:
    The tokenizer output extended with a ``'labels'`` list.
  """
  full_prompt = generate_promt(data_point)
  result = tokenizer(
      full_prompt,
      truncation = True,
      max_length = max_length,
      padding = False,
      return_tensors = None,
  )

  input_ids = result["input_ids"]
  needs_eos = (
      add_eos_token
      and len(input_ids) < max_length
      # An empty sequence has no last token to compare with; it still needs EOS.
      and (not input_ids or input_ids[-1] != tokenizer.eos_token_id)
  )
  if needs_eos:
    input_ids.append(tokenizer.eos_token_id)
    # Keep the attention mask the same length as the ids.
    result["attention_mask"].append(1)

  result["labels"] = result["input_ids"].copy()

  return result

# NOTE(review): assumes `dataset` is an instruction dataset exposing 'instruction'
# and 'output' columns in its train split, loaded elsewhere — TODO confirm.
train_frame = pd.DataFrame({
    "text": dataset['train']['instruction'],
    "ans": dataset['train']['output'],
})

# Wrap the frame as a datasets.Dataset and tokenize every row.
train_dataset = Dataset.from_pandas(train_frame).map(generate_tokenize)

In [None]:
import torch
# Ask PyTorch's CUDA caching allocator to release the memory it holds but no longer uses.
torch.cuda.empty_cache()

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling,TrainingArguments,Trainer

# The adapter model is loaded with every parameter frozen. Re-enable gradients only
# for the LoRA adapter weights (PEFT names them with a "lora_" prefix) and keep the
# base-model weights frozen; unfreezing all of them would turn this into a full
# fine-tune of the entire base model.
for name, param in model.named_parameters():
    param.requires_grad = 'lora_' in name

# Plain-text corpus cut into blocks of 128 tokens.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path = "/content/Sidorova_Realnye-istorii-ot-idei-do-biznesa_RuLit_Me.txt",
    block_size = 128
)

# mlm=False selects the causal (next-token) language-modelling objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, mlm = False
)

training_args = TrainingArguments(
    output_dir = "./noth",
    overwrite_output_dir = True,
    num_train_epochs = 3,
    per_device_train_batch_size = 4,
    save_steps = 10000,
    save_total_limit = 2,
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
)

trainer.train()

model.save_pretrained("your_finetuned_model")

In [13]:
from transformers import GenerationConfig
import time

# Wall-clock start of this cell; the elapsed time is printed at the end of the cell.
start_time = time.time()

def get_embedding(sentence):
  """Embed one sentence with the global ``sent_tokenizer`` / ``sent_model`` pair.

  The token embeddings are mean-pooled over the positions marked in the
  attention mask and the pooled vector is L2-normalised.

  Returns:
    A tensor with one row holding the sentence embedding.
  """
  batch = sent_tokenizer([sentence], padding = True, truncation = True, return_tensors = 'pt')

  with torch.no_grad():
    model_output = sent_model(**batch)

  # The first element of the model output holds the per-token embeddings.
  token_embeddings = model_output[0]
  mask = batch['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()

  # Masked mean; the clamp keeps the divisor away from zero.
  summed = torch.sum(token_embeddings * mask, 1)
  counts = torch.clamp(mask.sum(1), min=1e-9)
  pooled = summed / counts

  return F.normalize(pooled, p = 2, dim = 1)

# Generated answers, appended in the same order as the rows of emb_database.
answers = []
# Question-embedding store, starting with zero rows.
# NOTE(review): 384 is presumably the embedding width of sent_model — confirm.
emb_database = torch.empty((0,384),dtype = torch.float32)

# Prompt wrapper for a single question. get_answer() cuts the model output at the
# trailing marker phrase, so the text here (misspelling included) must stay
# byte-identical to the string used there.
info_prompt = PromptTemplate.from_template("user: Описание работы {question}\nbot:Твоя заадча отвечать на вопросы пользователей:")

class Conversation:
  """Chat history that can be rendered into a single prompt string.

  The history starts with one system message; user and bot messages are
  appended in order. ``get_prompt`` formats every message with
  ``message_template`` and finishes with an opening bot turn.
  """

  def __init__(
      self,
      message_template = '<s>{role}\n{content}</s>',
      system_promt = 'Ты - Ольга, помогай людям с их проблемами и отвечай на вопросы',
      response_template = '<s>{bot}<s>'
  ):
    self.message_template = message_template
    self.response_template = response_template
    self.messages = [dict(role = 'system', content = system_promt)]

  def _add_message(self, role, content):
    """Append one message with the given role to the history."""
    self.messages.append(dict(role = role, content = content))

  def add_user_message(self,message):
    """Record a message written by the user."""
    self._add_message('user', message)

  def add_bot_message(self,message):
    """Record a reply produced by the bot."""
    self._add_message('bot', message)

  def get_prompt(self, tokenizer):
    """Render the history followed by an open bot turn; ``tokenizer`` is not used."""
    rendered = [self.message_template.format(**message) for message in self.messages]
    rendered.append("<s>bot\n")
    # strip() removes surrounding whitespace, including the newline after "bot".
    return ''.join(rendered).strip()

# Make sure the language model is on the GPU; the generation inputs are moved to 'cuda' too.
model = model.to('cuda')
def get_answer(info_prompt,question,generate_config):
  """Generate an answer to ``question`` with the global ``model`` / ``tokenizer``.

  The question is wrapped with ``info_prompt``, sampled from the model, and only
  the newly generated tokens are decoded. This replaces splitting the decoded
  text on the prompt's marker phrase, which raised ``IndexError`` whenever the
  marker did not survive decoding. Special tokens such as ``</s>`` are dropped
  from the decoded text, and anything after a following ``bot:`` turn is cut off.

  Args:
    info_prompt: object whose ``format(question=...)`` returns the prompt text.
    question: the user question.
    generate_config: accepted for interface compatibility; NOTE(review): it is
      currently not applied, the sampling parameters below are hard-coded.

  Returns:
    The generated answer text.
  """
  prompt = info_prompt.format(question = question)
  inputs = tokenizer(prompt, return_tensors = 'pt', add_special_tokens = False)
  # Number of prompt tokens; the generated sequence starts with a copy of them.
  prompt_length = len(inputs['input_ids'][0])

  outputs = model.generate(
      input_ids = inputs['input_ids'].to('cuda'),
      attention_mask = inputs['attention_mask'].to('cuda'),
      do_sample = True,
      top_p = 0.5,
      temperature = 0.3,
      max_new_tokens = 500,
      pad_token_id = tokenizer.eos_token_id,
  )[0]

  # Decode only what the model added after the prompt.
  generated = outputs[prompt_length:]
  parsed_answer = tokenizer.decode(generated, skip_special_tokens = True)

  # Keep only the first bot turn if the model went on to write another one.
  return parsed_answer.split('bot:')[0]

def get_cos_sim(question):
  """Cosine similarity between ``question`` and every stored question embedding.

  The question is embedded here with ``get_embedding`` so the result depends on
  the argument rather than on whichever embedding happens to be held in a
  global variable. The comparison runs on the GPU when one is available and on
  the CPU otherwise.

  Args:
    question: the question text to compare against ``emb_database``.

  Returns:
    A 1-D tensor with one similarity per row of the global ``emb_database``
    (empty when the database has no rows).
  """
  query = get_embedding(question)
  target = 'cuda' if torch.cuda.is_available() else 'cpu'
  return F.cosine_similarity(emb_database.to(target), query.to(target), dim=1, eps=1e-8)

question = "Что такое инновационный процесс?"
# Disabled alternative: build the prompt through the Conversation helper instead.
#conversation = Conversation()
#conversation.add_user_message(question)
#prompt = conversation.get_prompt(sent_tokenizer)

generation_config = GenerationConfig.from_pretrained('IlyaGusev/saiga_mistral_7b_lora')
emb = get_embedding(question)
# The similarity against the stored questions is computed, but its result is not used here.
get_cos_sim(question)

answer = get_answer(info_prompt,question,generation_config)
# Store the question embedding and its answer at matching positions.
emb_database = torch.cat((emb_database,emb),0)
answers.append(answer)

# Elapsed wall-clock time since start_time, rounded to whole seconds.
end_time = time.time()
execution_time = round(end_time - start_time)

print(f'{execution_time} секунд')
print(answer)



46 секунд


Инновационный процесс - это система управления, которая предполагает постоянное создание, разработку и внедрение новых идей, технологий и продуктов. Этот процесс включает в себя все этапы, начиная с идеи и заканчивая внедрением и коммерциализацией.

Основные этапы инновационного процесса:

1. Идея: создание и формулирование идей для новых продуктов, технологий или процессов.
2. Разработка: разработка и тестирование идей, чтобы определить их потенциал и возможность внедрения.
3. Финансирование: получение финансовой поддержки для реализации идеи.
4. Внедрение: внедрение идеи в реальность, включая производство, маркетинг и продажи.
5. Коммерциализация: коммерциализация продукта или технологии, то есть получение дохода от их использования.

Инновационный процесс может быть применен в различных сферах, таких как образование, наука, бизнес и т.д.</s><s> 
