<a href="https://colab.research.google.com/github/Hoshi54/projects_ML/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Классификация текста на датасете ag_news

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import get_scheduler
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
!pip install pymorphy2
!pip install datasets

In [None]:
import torch
from datasets import load_dataset

# Load the AG News dataset and keep only the first 20,000 training rows and the
# first 1,000 test rows (via `select`) so the experiments below stay small.
data = load_dataset('ag_news')
train_data,test_data = data['train'].select(range(20000)),data['test'].select(range(1000))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
import torch
import string
from nltk.corpus import stopwords
import nltk
import pymorphy2
nltk.download('stopwords')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Heavy, reusable objects for preprop_text; filled lazily on the first call.
_PREPROP_RESOURCES = {}


def _get_preprop_resources():
    """Build (once) and return the objects that preprop_text reuses on every call."""
    if not _PREPROP_RESOURCES:
        _PREPROP_RESOURCES['punct_table'] = str.maketrans('', '', string.punctuation)
        _PREPROP_RESOURCES['tokenizer'] = nltk.WordPunctTokenizer()
        _PREPROP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROP_RESOURCES['morph'] = pymorphy2.MorphAnalyzer()
    return _PREPROP_RESOURCES


def preprop_text(text):
    """Normalize raw text and prefix it with the ``[CLS]`` marker.

    Steps: lowercase, strip ASCII punctuation, tokenize with NLTK's
    ``WordPunctTokenizer``, drop English stop words, replace each remaining
    token with the ``normal_form`` of its first pymorphy2 parse.

    The morphological analyzer and the stop-word set are created once and
    cached; previously both were rebuilt for every single token, which made
    each call orders of magnitude slower without changing the result.

    Args:
        text: Input string.

    Returns:
        A string of the form ``"[CLS] tok1 tok2 ..."``.
    """
    resources = _get_preprop_resources()
    morph = resources['morph']
    stop_words = resources['stop_words']

    punkt_text = text.lower().translate(resources['punct_table'])
    tokens = resources['tokenizer'].tokenize(punkt_text)
    # NOTE(review): the pip log shows pymorphy2 installing Russian dictionaries,
    # so English tokens are presumably returned unchanged - confirm whether an
    # English lemmatizer was intended here.
    norm_tokens = [morph.parse(token)[0].normal_form for token in tokens if token not in stop_words]

    return f"[CLS] {' '.join(norm_tokens)}"

from transformers import BertTokenizer,BertModel
# Pretrained cased BERT tokenizer and model; the model is moved to `device`.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
embedder_bert = BertModel.from_pretrained('bert-base-cased').to(device)
# Replace the pooler with a no-op and delete every encoder layer in place, so
# what remains of the model is its embedding module plus an empty encoder stack.
embedder_bert.pooler = nn.Identity()
del embedder_bert.encoder.layer[:]

def length_to_mask(length, max_len=None, dtype=None):
    """Turn a 1-D tensor of sequence lengths into a (batch, max_len) mask.

    Entry ``[i, j]`` is true when ``j < length[i]``.

    Args:
        length: 1-D tensor of lengths.
        max_len: Width of the mask; when falsy, the largest value in ``length`` is used.
        dtype: Optional dtype to cast the mask to; the boolean mask is returned when None.

    Returns:
        The mask tensor, located on ``length.device``.
    """
    assert length.ndim == 1, 'Length shape should be 1 dimensional.'
    if not max_len:
        max_len = length.max().item()

    # Row vector of positions broadcast against a column vector of lengths.
    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    mask = positions.unsqueeze(0) < length.unsqueeze(1)

    if dtype is None:
        return mask
    return mask.to(dtype=dtype, device=length.device)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print(pymorphy2.MorphAnalyzer())

<pymorphy2.analyzer.MorphAnalyzer object at 0x79f772c60340>


In [None]:
from torch.utils.data import Dataset,DataLoader
class dataset(Dataset):
  """Torch dataset over a Hugging Face dataset with 'text' and 'label' columns.

  Each text is normalized with ``preprop_text`` and converted to token ids with
  the supplied tokenizer. When an embedder is given, the token ids are passed
  through it and the embedder output is returned instead of the raw ids.
  """

  def __init__(self, hug_dataset, tokenizer, embedder=None, device=device):
    """Store the data source and the helpers used to encode each item.

    Args:
      hug_dataset: Indexable dataset whose items are dicts with 'text' and 'label'.
      tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
      embedder: Optional callable applied to a (1, seq_len) tensor of token ids.
      device: Device on which the token-id tensors are created.
    """
    self.hug_dataset = hug_dataset # hugging dataset object
    self.tokenizer = tokenizer
    self.embedder = embedder
    self.device = device

  def __getitem__(self, idx):
    """Return ``(embedder_output, label, n_tokens)`` or, without an embedder, ``(token_ids, label)``."""
    item_dict = self.hug_dataset[idx]
    normalized_text = preprop_text(item_dict['text'])
    target = item_dict['label']

    token_ids = self.text_to_tokens_ids(normalized_text)

    if self.embedder is not None:
        # The embedder is only used as a fixed feature extractor here, so skip
        # building an autograd graph for every sample.
        with torch.no_grad():
            embedded = self.embedder(token_ids.unsqueeze(0))
        return embedded, target, len(token_ids)

    return token_ids, target

  def text_to_tokens_ids(self, text):
    """Tokenize ``text`` and return its vocabulary ids as a tensor on ``self.device``."""
    tokens = self.tokenizer.tokenize(text)

    # Use the device this instance was configured with, not the notebook global.
    return torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)).to(self.device)

  def __len__(self):
    """Number of items in the underlying dataset."""
    return len(self.hug_dataset)


In [None]:
from torch.nn.utils.rnn import pad_sequence
def collate_batch_functional(batch):
  """Collate ``(embedder_output, label, n_tokens)`` items into padded batch tensors.

  Args:
    batch: Iterable of triples whose first element exposes ``last_hidden_state``
      with a leading batch dimension of size 1.

  Returns:
    ``(embeddings, targets, lengths)``: embeddings padded to the longest
    sequence in the batch (batch first), int64 targets and int64 lengths. All
    tensors live on the device of the incoming embeddings, and the embeddings
    are detached from any autograd graph.
  """
  targets_list, embeddings_list, lengths_list = [], [], []

  for (_embed_output, _target, _text_len_in_tokens) in batch:
    _embed = _embed_output.last_hidden_state
    targets_list.append(_target)
    embeddings_list.append(_embed[0])  # drop the size-1 batch dimension
    lengths_list.append(_text_len_in_tokens)

  # Keep everything on the device the embeddings already live on instead of
  # relying on a notebook-level global.
  target_device = embeddings_list[0].device

  targets_tensor = torch.tensor(targets_list, dtype=torch.int64).to(target_device)
  # Padded time steps are filled with zeros. The previously referenced PAD_IND
  # is not defined anywhere in this notebook, and a vocabulary index is not a
  # meaningful fill value for float embeddings; padded positions are masked
  # out downstream via the returned lengths.
  embeddings_tensor = pad_sequence(embeddings_list, batch_first=True, padding_value=0.0).to(target_device)
  lengths_tensor = torch.tensor(lengths_list, dtype=torch.int64).to(target_device)

  return embeddings_tensor.detach(), targets_tensor.detach(), lengths_tensor

# Wrap both splits so that every item is embedded by `embedder_bert` on access.
train_dataset = dataset(train_data, tokenizer = tokenizer, embedder = embedder_bert)
test_dataset = dataset(test_data, tokenizer = tokenizer, embedder = embedder_bert)

# Both loaders use the custom collate function and drop the last incomplete
# batch; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, shuffle = True, batch_size = 100, collate_fn = collate_batch_functional, drop_last = True)
test_dataloader = DataLoader(test_dataset, shuffle = False, batch_size = 3, collate_fn = collate_batch_functional, drop_last = True)

In [None]:
import torch.nn as nn
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hid_dim: Size of the last dimension of query/key/value and of the output.
        n_heads: Number of attention heads; must divide ``hid_dim``.
        dropout: Dropout probability applied to the attention weights.
        device: Device on which the scaling constant is initially created.
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        # sqrt(head_dim) as a non-persistent buffer: it follows the module on
        # .to()/.cuda() (a plain tensor attribute would not) while staying out
        # of state_dict so existing checkpoints remain loadable.
        scale = torch.sqrt(torch.tensor([float(self.head_dim)], device=device))
        self.register_buffer('scale', scale, persistent=False)

    def forward(self, query, key, value, mask = None):
        """Apply attention.

        Args:
            query, key, value: Tensors whose first dimension is the batch and
                whose last dimension is ``hid_dim``.
            mask: Optional (batch, key_len) tensor; key positions where it
                equals 0 receive (numerically) zero attention weight.

        Returns:
            ``(x, attention)`` where ``x`` is (batch, query_len, hid_dim) and
            ``attention`` is (batch, n_heads, query_len, key_len).
        """
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        # (batch, len, hid_dim) -> (batch, n_heads, len, head_dim)
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        if mask is not None:
            # Broadcast the per-key mask over heads and query positions.
            mask = mask[:, None, None, :]
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim = -1)

        x = torch.matmul(self.dropout(attention), V)
        # (batch, n_heads, query_len, head_dim) -> (batch, query_len, hid_dim)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)

        return x, attention

In [None]:
import torch.nn as nn
class SelfAttentionBasedClassifier(nn.Module):
  """Self-attention over a sequence followed by a linear classifier on position 0.

  Args:
    hid_dim: Size of the input embeddings.
    cnt_class: Number of output classes.
    device: Device forwarded to the attention layer.
    n_heads: Number of attention heads.
  """

  def __init__(self, hid_dim, cnt_class = 4, device = device, n_heads = 1):
    super().__init__()
    self.hid_dim = hid_dim
    self.device = device
    self.cnt_class = cnt_class
    self.attn = MultiHeadAttentionLayer(hid_dim=self.hid_dim, n_heads = n_heads, dropout = 0, device = self.device)
    self.classifier_head = nn.Linear(self.hid_dim, self.cnt_class)

  def forward(self, x, mask = None):
    """Return ``(logits, attention)`` for a batch of embedded sequences.

    Args:
      x: Tensor passed as query, key and value to the attention layer.
      mask: Optional mask forwarded to the attention layer.

    Returns:
      ``logits`` of shape (batch, cnt_class) and the attention weights
      produced by the attention layer.
    """
    x, attention = self.attn(x, x, x, mask = mask)
    # Indexing position 0 already yields (batch, hid_dim). The former
    # .squeeze() also removed the batch dimension when batch == 1, producing
    # logits of shape (cnt_class,) that no longer line up with the labels.
    x = x[:, 0, :]
    x = self.classifier_head(x)

    return x, attention

In [None]:
test_data

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [None]:
import numpy as np
import torch.optim as optim
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.utils import compute_class_weight
from tqdm.auto import tqdm

# Balanced class weights computed from the training labels.
y = np.asarray(train_data['label'])
classes = np.unique(y)
weight = compute_class_weight(classes = classes,class_weight = 'balanced',y = y)
class_weight = dict(zip(classes,weight))


model = SelfAttentionBasedClassifier(768).to(device)
# compute_class_weight returns a float64 NumPy array; the loss needs the
# weights as float32 on the same device as the model outputs.
loss_fn = torch.nn.CrossEntropyLoss(weight = torch.tensor(weight, dtype = torch.float32, device = device))
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Multiply the learning rate by 0.96 after every epoch.
lambda_scheduler = lambda x: 0.96 ** x
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_scheduler)

for epoch in range(10):
    # ---- training pass over the whole training set ----
    model.train()
    train_acc, total_train, test_acc, total_test = 0,0,0,0
    for i,(input_data, labels, lengths) in enumerate(tqdm(train_dataloader, desc = 'train epoch {}'.format(epoch))):

        mask = length_to_mask(lengths)

        optimizer.zero_grad()
        input_data = input_data.to(device)
        labels = labels.to(device).long()
        outputs, attention = model(input_data, mask = mask)

        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        _,pred = torch.max(outputs, 1)
        total_train += labels.shape[0]
        train_acc += (pred == labels).sum().item()

        if i % 100 == 0:
            # F1 of the current batch only; accuracy is the running epoch value.
            f1_train = f1_score(labels.cpu(), pred.cpu(), average='weighted')
            print('loss_train:{} train_acc:{} f1_train:{}'.format(loss.item(),train_acc / total_train,f1_train))

    # The scheduler was previously created but never stepped.
    scheduler.step()

    # ---- evaluation: once per epoch, not once per training batch ----
    model.eval()
    test_loss_sum, test_batches = 0.0, 0
    epoch_labels, epoch_preds, epoch_probs = [], [], []
    with torch.no_grad():
        for (input_data, labels, lengths) in tqdm(test_dataloader, desc = 'test epoch {}'.format(epoch)):
            mask = length_to_mask(lengths)

            input_data = input_data.to(device)
            labels = labels.to(device).long()
            outputs, attention = model(input_data, mask=mask)
            _,pred = torch.max(outputs,1)

            test_loss_sum += loss_fn(outputs, labels).item()
            test_batches += 1
            total_test += labels.shape[0]
            test_acc += (pred == labels).sum().item()

            epoch_labels.append(labels.cpu())
            epoch_preds.append(pred.cpu())
            # float64 so every row sums to 1 within scikit-learn's tolerance
            epoch_probs.append(torch.softmax(outputs.double(), dim = 1).cpu())

    y_true = torch.cat(epoch_labels).numpy()
    y_pred = torch.cat(epoch_preds).numpy()
    y_prob = torch.cat(epoch_probs).numpy()

    f1_test = f1_score(y_true, y_pred, average = 'weighted')
    try:
        # Multiclass ROC-AUC needs class probabilities and an explicit
        # multi_class strategy; hard predictions always raised ValueError,
        # which used to be silently discarded.
        roc_auc_test = roc_auc_score(y_true, y_prob, multi_class = 'ovr', labels = classes)
    except ValueError as err:
        roc_auc_test = float('nan')
        print('roc_auc_test unavailable: {}'.format(err))

    print('loss_test:{} test_acc:{} f1_test:{} roc_auc_test:{}'.format(
        test_loss_sum / max(test_batches, 1), test_acc / max(total_test, 1), f1_test, roc_auc_test))



# Курсовая работа

# Поиск по лучам

Пример 1: Принуждение слова

Допустим, мы пытаемся перевести предложение «How old are you?» на немецкий язык.

Переводом будет «Wie alt bist du?» — так вы сказали бы в неформальной обстановке, или «Wie alt sind Sie?» — так вы сказали бы в официальной обстановке.

В зависимости от контекста нам может понадобиться одна степень формальности, а не другая, но как сообщить об этом модели?

**Традиционный поиск луча**



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Plain (unconstrained) beam search with T5 as the baseline for the
# constrained examples that follow.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')

# The surrounding text and the following cells discuss the German translation;
# the prompt previously said "English to France", which does not name a
# language and contradicts the German example.
encoder_input_str = 'translate English to German: How old are you?'

input_ids = tokenizer(encoder_input_str, return_tensors = 'pt').input_ids

outputs = model.generate(
    input_ids,
    num_beams = 10,
    num_return_sequences = 1,
    no_repeat_ngram_size = 1,
    remove_invalid_values = True
)

tokenizer.decode(outputs[0], skip_special_tokens = True)

# Лучевой поиск с ограничениями

Но что если бы мы знали, что хотим формальный результат, а не неформальный? Поэтому мы добавляем ограничения через слово Sie

In [None]:
# Same translation task as before, but beam search is now forced to include
# the word "Sie" in the generated sequence.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')

encoder_input_str = 'translate English to German: How old are you?'

force_words = ['Sie']

input_ids = tokenizer(encoder_input_str, return_tensors = 'pt').input_ids
# Token ids of the forced words, encoded without special tokens.
force_words_ids = tokenizer(force_words,add_special_tokens = False).input_ids

outputs = model.generate(
    input_ids,
    force_words_ids = force_words_ids,
    num_beams = 5,
    num_return_sequences = 1,
    no_repeat_ngram_size = 1,
    remove_invalid_values =  True
)

tokenizer.decode(outputs[0],skip_special_tokens = True)

# Пример 2: Дизъюнктивные ограничения

Выше мы рассмотрели случай использования, когда мы знаем, какие слова хотим включить в окончательный результат.

Но что, если мы не знаем, какую именно словоформу использовать, и хотим, чтобы любая из нескольких форм (например, «scream», «screams», «screaming», «screamed») была одинаково допустима? В более общем смысле, всегда есть случаи, когда нам не нужно точное слово дословно.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# One word that must appear verbatim, plus a group of word forms of which any
# single one satisfies the constraint.
force_word = 'happy'
force_flexible = ['scream','screams','screaming','screamed']

# The keyword is `add_special_tokens`; it was misspelled `add_specials_tokens`,
# so the intended option never reached the tokenizer.
force_words_ids = [
    tokenizer([force_word],add_prefix_space = True,add_special_tokens = False).input_ids,
    tokenizer(force_flexible,add_prefix_space = True,add_special_tokens = False).input_ids
]

starting_text = ['The dog','The cat']

input_ids = tokenizer(starting_text,return_tensors = 'pt').input_ids
outputs = model.generate(
    input_ids,
    force_words_ids = force_words_ids,
    num_beams = 10,
    num_return_sequences = 1,
    no_repeat_ngram_size = 1,
    remove_invalid_values = True
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
tokenizer.decode(outputs[1], skip_special_tokens=True)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

encoder_input_str = "translate English to German: How old are you?"

# The same "Sie" requirement as in the force_words_ids example, expressed as
# an explicit constraint object built from the word's token ids.
constraints = [
    PhrasalConstraint(
        tokenizer("Sie", add_special_tokens=False).input_ids
    )
]

input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids


outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
)


print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

raw_dataset = load_dataset('banking77')

In [None]:
print(len(raw_dataset['train']['text']))
print(len(raw_dataset['test']['text']))

10003
3080


In [None]:
# Для генерации запроса воспользуемся
import random

random_id = random.randrange(len(raw_dataset['train']))
raw_dataset['train'][random_id]

{'text': 'how do i dispute a payment I didnt make', 'label': 16}

In [None]:
# To train the model, the text has to be converted into tokens. This is done by a tokenizer specific to each model.
# It also maps the tokens to their corresponding ids in the pretrained vocabulary.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize(batch):
  """Tokenize the 'text' column of a batch, padding to max length and truncating."""
  return tokenizer(batch['text'],padding = 'max_length',truncation = True,return_tensors = 'pt')

#raw_dataset = raw_dataset.rename_column('label','labels')
# Tokenize every split in batches and drop the raw 'text' column.
tokenized_dataset = raw_dataset.map(tokenize,batched = True,remove_columns = ['text'])

tokenized_dataset['train'].features.keys()

Map:   0%|          | 0/10003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

dict_keys(['label', 'input_ids', 'token_type_ids', 'attention_mask'])

# Fine - tune
После подготовки данных можно начинать обучение модели. Первым шагом будет загрузка модели с помощью класса AutoModelForSequenceClassification. Так мы создаём модель с весами предобученной модели BERT, но с классификационной головой сверху, специально под нашу задачу. Здесь мы передаём количество классов (77) из нашего набора данных и имена меток.

In [None]:
!pip install evaluate

In [None]:
# Fine-tune and evaluate a BERT model with the Hugging Face Trainer
from transformers import AutoModelForSequenceClassification

labels = tokenized_dataset['train'].features['label'].names
num_labels = len(labels)
# Name <-> integer id maps. The previous loop reused `labels` as its loop
# variable (leaving it bound to the last class name instead of the list) and
# stored the ids as strings rather than ints.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = num_labels,
                                                           label2id = label2id,id2label = id2label)
import evaluate
import numpy as np

metric = evaluate.load('f1')

def compute_metrics(eval_pred):
  """Return the weighted F1 score for a ``(logits, labels)`` evaluation pair.

  Args:
    eval_pred: Pair of arrays: per-class scores with classes on axis 1, and
      the reference label ids.

  Returns:
    The result of ``metric.compute`` with ``average='weighted'``.
  """
  predictions, references = eval_pred
  predictions = np.argmax(predictions,axis = 1)
  return metric.compute(predictions = predictions,references = references,average = 'weighted')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments

# Id for remote repository
repository_id = "bert-base-banking77-pt2"

# Define training args
training_args = TrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    # PyTorch 2.0 specifics
    bf16=True, # bfloat16 training
    torch_compile=True, # optimizations
    optim="adamw_torch_fused", # improved optimizer
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=200,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),

)

# Create a Trainer instance.
# `args` must be passed explicitly: without it the Trainer falls back to
# default TrainingArguments and every setting configured above (batch sizes,
# epochs, bf16, evaluation/save strategy, hub upload) is silently ignored.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)

The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here.


In [None]:
trainer.train()

In [None]:
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
trainer.push_to_hub()

In [None]:
from transformers import pipeline

# Load the fine-tuned model by repository id. The model predicts banking
# intents, so the generic "text-classification" task name is used
# ("sentiment-analysis" is an alias for the same pipeline). Run on the first
# GPU when one is available and fall back to CPU (-1) otherwise, instead of
# failing on machines without CUDA.
classifier = pipeline(
    "text-classification",
    model=repository_id,
    tokenizer=repository_id,
    device=0 if torch.cuda.is_available() else -1,
)

sample = "I have been waiting longer than expected for my bank card, could you provide information on when it will arrive?"


pred = classifier(sample)
print(pred)

In [None]:
import torch
from transformers import AutoModelForCausalLM,AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Chat turns must be dicts with "role" and "content" keys. They were written
# as one-element sets of strings, which the chat template cannot render.
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

encodeds = tokenizer.apply_chat_template(messages,return_tensors = 'pt')
model_inputs = encodeds.to(device)
model.to(device)

generated_ids = model.generate(model_inputs,max_new_tokens = 1000,do_sample = True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
!pip install peft

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.29.2 peft-0.10.0


In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Single source of truth for the adapter repository; the literal was
# previously repeated in every call while this constant went unused.
MODEL_NAME = "IlyaGusev/saiga_mistral_7b_lora"

# Load the base model named in the adapter config in half precision, then
# attach the LoRA adapter on top of it.
config = PeftConfig.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto"
)
model = PeftModel.from_pretrained(
    model,
    MODEL_NAME,
    torch_dtype=torch.float16
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)

def generate(model, tokenizer, prompt, generation_config):
    """Generate a completion for ``prompt`` and return only the newly produced text.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, generation_config=...)``.
        tokenizer: Callable that encodes the prompt and offers ``decode``.
        prompt: Fully formatted prompt string (special tokens are not added).
        generation_config: Generation settings forwarded to ``model.generate``.

    Returns:
        The decoded continuation with special tokens skipped and surrounding
        whitespace stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    # Move every encoded tensor to the device the model lives on.
    data = {}
    for key, tensor in encoded.items():
        data[key] = tensor.to(model.device)

    prompt_len = len(data["input_ids"][0])
    sequences = model.generate(**data, generation_config=generation_config)

    # The first returned sequence starts with the prompt tokens; cut them off.
    completion_ids = sequences[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Prompt layout with system, user and bot sections; `{inp}` is replaced by the
# user's question and the trailing open bot section is what the model continues.
PROMT_TEMPLATE = '<s>system\n Ты — МегаДом, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им с их вопросами.</s><s>user\n{inp}</s><s>bot\n'
inp = 'Какое расстояние до Луны?'
prompt = PROMT_TEMPLATE.format(inp=inp)

output = generate(model, tokenizer, prompt, generation_config)

print(output)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Model from Hugging Face
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = "cuda" if torch.cuda.is_available() else 'cpu'
print(device)

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Conversation history as role/content dicts, ending with the user turn the
# model should answer.
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

# Render the conversation with the tokenizer's chat template into input ids.
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)
model.to(device)

# Sample up to 1000 new tokens and print the first decoded sequence.
generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.33-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.43-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.1/289.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from langchain import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import os


In [None]:
!pip install -U langchain
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.33-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.43-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.1/289.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloa

 Langchain - это библиотека Python для работы с моделями обработки естественного языка (NLP). Она предоставляет инструменты для загрузки, обучения и использования моделей NLP от Hugging Face Hub. Langchain позволяет создавать цепочки обработки текста (chains), которые могут состоять из нескольких моделей для выполнения различных задач NLP.

Hugging Face Hub - это платформа и репозиторий моделей машинного обучения для обработки естественного языка. Она содержит широкий выбор предобученных моделей NLP от различных авторов, которые можно использовать для решения задач обработки текста, классификации, перевода и других.

LLMChain - это класс в библиотеке Langchain, который представляет собой цепочку обработки текста на основе больших языковых моделей (Large Language Models). Он позволяет объединять несколько моделей NLP в цепочку для выполнения сложных задач обработки текста.

PromptTemplate - это модуль в библиотеке Langchain, который предоставляет шаблоны запросов (prompts) для использования с моделями NLP. Он содержит предопределенные шаблоны запросов, которые можно использовать для формулирования вопросов или задач обработки текста.


In [None]:
import pandas as pd

# Read the news table, drop the saved index column, add a whitespace-token
# count per article, then shuffle the rows reproducibly.
df = (
    pd.read_csv('/content/df_news.csv')
    .drop(columns = 'Unnamed: 0')
    .assign(text_len = lambda frame: frame['Text'].apply(lambda x: len(str(x).split())))
    .sample(frac = 1,random_state = 42)
    .reset_index(drop = True)
)
# Replace the numeric class ids with readable category names.
df['Label'] = df['Label'].map({0:'Politics',1:'Sport',2:'Technology',3:'Entertainment',4:'Business'})

In [None]:
df

Unnamed: 0,Text,Label,text_len
0,David Blunkett in quotes\n \n David Blunkett -...,Politics,289
1,Benitez issues warning to Gerrard\n \n Liverpo...,Sport,158
2,Brookside creator's Channel 4 bid\n \n The cre...,Entertainment,164
3,Brown visits slum on Africa trip\n \n Chancell...,Politics,258
4,Gritty return for Prince of Persia\n \n Still ...,Technology,523
...,...,...,...
2220,Moreno debut makes Oscar mark\n \n Catalina Sa...,Entertainment,220
2221,Musical future for phones\n \n Analyst Bill Th...,Technology,923
2222,What's next for next-gen consoles?\n \n The ne...,Technology,847
2223,Humanoid robot learns how to run\n \n Car-make...,Technology,399


In [None]:
df['Label'].value_counts()

Label
1    511
4    510
0    417
2    401
3    386
Name: count, dtype: int64

In [None]:
import getpass

# Do not hard-code credentials in the notebook: reuse a token already present
# in the environment and only prompt (without echo) when none is set. The
# previous unconditional assignment overwrote a real token with a placeholder.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass.getpass('Hugging Face Hub API token: ')

# Hosted Mixtral instruct model accessed through the Hugging Face Hub wrapper.
llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
                                        model_kwargs={"temperature":0.9,
                                                      "top_p":0.95,
                                                      "max_new_tokens": 250,
                                                      "repetition_penalty":1.1})

# Classification prompt; `{news}` is replaced by the article text.
template = """
Act as a highly intelligent news chatbot and classify the given news text into one of the following categories only 1. Politics 2.Sport 3.Technology 4.Entertainment 5.Business
Do not code. Return only one word answer with only the category name that the given news text belongs to
News text: {news} """

promt = PromptTemplate(input_variables = ['news'],template = template)

chain = LLMChain(llm = llm,prompt = promt)

categories = ['Politics','Sport','Technology','Entertainment','Business']

def output(answer):
  """Return the first known category mentioned in ``answer``, ignoring case.

  Categories are checked in the order of the ``categories`` list; ``None`` is
  returned when none of them occurs in the answer.
  """
  lowered = answer.lower()
  return next((name for name in categories if name.lower() in lowered), None)

# Classify the first article of the shuffled frame; the chain's raw answer is
# displayed as the cell result.
news_text = df.iloc[0]['Text']
chain.run(news_text)

In [None]:
!pip install datasets
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=a2a599413ee8c642f9967be74e06c6b5e83db5f7e0026e088a71bf523cd18c96
  Stored in directory: /root/.

In [None]:
device

'cuda'