# Работа с датасетом

Был выбран датасет на английском языке, содержащий твиты, связанные с финансами:

https://huggingface.co/datasets/zeroshot/twitter-financial-news-topic

In [None]:
import pandas as pd

# Both CSV splits are read from the same Hugging Face dataset repository;
# `df` holds the training split and `df_valid` the validation split.
splits = {'train': 'topic_train.csv', 'validation': 'topic_valid.csv'}
df, df_valid = (
    pd.read_csv("hf://datasets/zeroshot/twitter-financial-news-topic/" + splits[split_name])
    for split_name in ("train", "validation")
)

df.head(3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0


In [None]:
# Lower-case every tweet; the result seeds the "tokens" column that the
# following preprocessing steps keep rewriting.
for frame in (df, df_valid):
    frame["tokens"] = frame["text"].map(str.lower)

df_valid.head()

Unnamed: 0,text,label,tokens
0,Analyst call of the day for @CNBCPro subscribe...,0,analyst call of the day for @cnbcpro subscribe...
1,"Loop upgrades CSX to buy, says it's a good pla...",0,"loop upgrades csx to buy, says it's a good pla..."
2,BofA believes we're already in a recession — a...,0,bofa believes we're already in a recession — a...
3,JPMorgan sees these derivative plays as best w...,0,jpmorgan sees these derivative plays as best w...
4,Morgan Stanley's Huberty sees Apple earnings m...,0,morgan stanley's huberty sees apple earnings m...


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")


def _lemmatize_all(texts):
    """Return one list of lemmas per text in *texts*.

    The texts are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of being invoked once per row.
    """
    return [[token.lemma_ for token in doc] for doc in nlp.pipe(texts)]


# Tokenisation + lemmatisation: each row of "tokens" becomes a list of lemmas.
df["tokens"] = _lemmatize_all(df["tokens"])
df_valid["tokens"] = _lemmatize_all(df_valid["tokens"])
df.head()

Unnamed: 0,text,label,tokens
0,Here are Thursday's biggest analyst calls: App...,0,"[here, be, thursday, 's, big, analyst, call, :..."
1,Buy Las Vegas Sands as travel to Singapore bui...,0,"[buy, las, vegas, sand, as, travel, to, singap..."
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,"[piper, sandler, downgrade, docusign, to, sell..."
3,"Analysts react to Tesla's latest earnings, bre...",0,"[analyst, react, to, tesla, 's, late, earning,..."
4,Netflix and its peers are set for a ‘return to...,0,"[netflix, and, its, peer, be, set, for, a, ', ..."


In [None]:
import string

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

stops = stopwords.words("english")

# Sets give O(1) membership tests inside the per-token filter below.
noise_chars = frozenset(string.punctuation + string.digits)
stop_set = frozenset(stops)


def _clean_tokens(tokens):
    """Join the lemmas in *tokens* that are neither noise nor stop words.

    A token is noise when every one of its characters is ASCII punctuation
    or a digit (for example "...", "--", "7" or "2022"). The previous check
    tested whether the token was a *substring* of
    ``string.punctuation + string.digits + '...'``, which dropped "12" but
    kept "2022" or "--".

    Returns the surviving lemmas as a single space-separated string.
    """
    kept = [
        token
        for token in tokens
        if not all(char in noise_chars for char in token) and token not in stop_set
    ]
    return " ".join(kept)


# Strip punctuation/number tokens and English stop words, then turn each
# lemma list back into a plain string for the vectorizer and tokenizers.
df["tokens"] = df.tokens.apply(_clean_tokens)
df_valid["tokens"] = df_valid.tokens.apply(_clean_tokens)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,text,label,tokens
0,Here are Thursday's biggest analyst calls: App...,0,thursday 's big analyst call apple amazon tesl...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,buy las vegas sand travel singapore builds wel...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,piper sandler downgrade docusign sell cite ele...
3,"Analysts react to Tesla's latest earnings, bre...",0,analyst react tesla 's late earning break next...
4,Netflix and its peers are set for a ‘return to...,0,netflix peer set return growth analyst say giv...


# LinearSVC

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
# Import only what this cell uses instead of `from sklearn.metrics import *`,
# which floods the notebook namespace with every metric name.
from sklearn.metrics import classification_report

# TF-IDF features over 1- to 3-grams, fitted on the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectorized_x_train = vectorizer.fit_transform(df["tokens"])

clf = LinearSVC()
clf.fit(vectorized_x_train, df["label"])

# The validation split is transformed with the already-fitted vectorizer.
vectorized_x_test = vectorizer.transform(df_valid["tokens"])
pred = clf.predict(vectorized_x_test)
print(classification_report(df_valid["label"], pred))

              precision    recall  f1-score   support

           0       0.88      0.62      0.73        73
           1       0.88      0.83      0.85       214
           2       0.78      0.90      0.84       852
           3       0.95      0.73      0.82        77
           4       0.97      0.98      0.97        97
           5       0.89      0.95      0.92       242
           6       0.77      0.82      0.80       146
           7       0.85      0.83      0.84       160
           8       0.83      0.75      0.79        32
           9       0.83      0.72      0.77       336
          10       0.58      0.54      0.56        13
          11       0.92      0.79      0.85        14
          12       0.87      0.79      0.83       119
          13       0.88      0.57      0.69       116
          14       0.80      0.85      0.82       415
          15       0.85      0.75      0.80       125
          16       0.92      0.93      0.93       249
          17       0.92    

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Classification quality on the validation split.
y_true = df_valid["label"]
accuracy = accuracy_score(y_true, pred)
# 'weighted' averaging is used because the task is multiclass.
precision, recall, f1 = (
    metric(y_true, pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true, pred)

print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    "Confusion Matrix:",
    sep="\n",
)
print(conf_matrix)

Accuracy: 0.83
Precision: 0.84
Recall: 0.83
F1 Score: 0.83
Confusion Matrix:
[[ 45   0  11   0   0   2   0   1   0   0   0   0   0   0   3   2   0   0
    3   6]
 [  1 177   7   1   0   0   0   0   0   3   0   0   0   0  13   1   4   2
    5   0]
 [  1   2 767   0   0   6   6   4   0  11   1   1   3   4  12   1   1   5
   24   3]
 [  0   4   5  56   0   0   2   0   0   2   0   0   0   2   1   1   0   0
    4   0]
 [  0   0   0   0  95   2   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   2   0   1 230   0   2   0   0   0   0   0   0   1   1   0   0
    4   1]
 [  0   0   4   0   0   0 120   1   0   1   1   0   0   0  13   3   1   0
    2   0]
 [  0   1   5   1   0   9   1 133   0   0   1   0   0   0   4   0   0   0
    3   2]
 [  0   2   0   0   0   0   0   0  24   0   0   0   0   0   2   0   0   0
    4   0]
 [  0   4  42   0   0   0   6   0   4 241   0   0   2   0  19   2   5   0
   11   0]
 [  0   0   0   0   0   0   2   0   0   0   7   0   0   0   2   1   1   0

# BERT без дообучения

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.svm import LinearSVC
# Explicit import instead of `from sklearn.metrics import *`; the report
# printed at the end of this cell only needs classification_report.
from sklearn.metrics import classification_report

# Load the pretrained BERT tokenizer and encoder (used as-is, no fine-tuning).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

def get_bert_embeddings(text, tokenizer, model):
    """Return the [CLS] embedding of *text* as a NumPy array.

    Args:
        text: The text to encode; it is passed straight to *tokenizer* and
            truncated to at most 128 tokens.
        tokenizer: Callable producing PyTorch tensors accepted by *model*.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        The hidden state at position 0 (the [CLS] token), squeezed and moved
        to the CPU as a NumPy array.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Keep the inputs on the same device as the model; otherwise the forward
    # pass fails as soon as the model has been moved to a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # The encoder is not being trained here, so gradients are not needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of the last hidden layer serves as the text representation.
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_embedding

# Encode every tweet of both splits into a BERT embedding, one text at a time.
train_embeddings, valid_embeddings = (
    [get_bert_embeddings(text, tokenizer, model) for text in frame["tokens"]]
    for frame in (df, df_valid)
)

# Fit a LinearSVC on top of the frozen BERT embeddings.
new_clf = LinearSVC()
new_clf.fit(train_embeddings, df["label"])

# Score the classifier on the validation split.
pred = new_clf.predict(valid_embeddings)
report = classification_report(df_valid["label"], pred)
print(report)



              precision    recall  f1-score   support

           0       0.50      0.51      0.50        73
           1       0.69      0.69      0.69       214
           2       0.76      0.77      0.76       852
           3       0.64      0.51      0.57        77
           4       0.95      0.97      0.96        97
           5       0.85      0.88      0.87       242
           6       0.68      0.73      0.70       146
           7       0.73      0.71      0.72       160
           8       0.74      0.62      0.68        32
           9       0.62      0.63      0.63       336
          10       0.50      0.62      0.55        13
          11       0.64      0.50      0.56        14
          12       0.79      0.81      0.80       119
          13       0.66      0.54      0.60       116
          14       0.75      0.73      0.74       415
          15       0.69      0.66      0.67       125
          16       0.86      0.86      0.86       249
          17       0.80    

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Classification quality on the validation split.
y_true = df_valid["label"]
accuracy = accuracy_score(y_true, pred)
# 'weighted' averaging is used because the task is multiclass.
precision, recall, f1 = (
    metric(y_true, pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true, pred)

print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    "Confusion Matrix:",
    sep="\n",
)
print(conf_matrix)

Accuracy: 0.74
Precision: 0.73
Recall: 0.74
F1 Score: 0.73
Confusion Matrix:
[[ 37   2   7   0   0   3   0   1   0   2   0   1   0   1   0   2   0   2
    7   8]
 [  1 147   8   2   0   2   5   2   0   5   0   0   1   0  18   3   7   5
    5   3]
 [  8   5 660   5   3  16   7  13   0  28   2   1   7  20  11   3   3   7
   32  21]
 [  0   7   5  39   0   1   3   1   0   3   1   0   1   3   5   2   0   0
    4   2]
 [  0   0   1   0  94   1   0   1   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2   0   8   0   2 214   0   1   0   2   0   0   0   0   4   1   0   0
    7   1]
 [  1   3   2   1   0   1 106   1   0   6   1   0   2   0  10   3   0   1
    6   2]
 [  3   4  16   1   0   2   0 113   0   1   3   0   0   0   3   1   0   0
    6   7]
 [  0   3   0   3   0   0   0   0  20   2   0   0   0   0   2   0   0   0
    1   1]
 [  4   3  37   2   0   0   8   3   2 211   0   0   5   1  21   1  11   1
   22   4]
 [  1   1   0   0   0   0   0   0   0   0   8   0   0   0   1   1   0   0

# BERT с дообучением

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from sklearn.metrics import accuracy_score

# Tokenizer and classification model are loaded from the same checkpoint;
# the classification head is created with 20 output classes.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=20)

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, parallel to *texts*.
        tokenizer: Callable that returns a mapping of PyTorch tensors.
        max_len: Length every item is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text at *idx* plus its label under "labels"."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis added by return_tensors="pt".
        # A bare squeeze() would also collapse the sequence axis when
        # max_len == 1 and produce 0-d tensors.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap both splits (preprocessed "tokens" text + integer labels) in datasets.
train_dataset, valid_dataset = (
    CustomDataset(frame["tokens"].tolist(), frame["label"].tolist(), tokenizer)
    for frame in (df, df_valid)
)

# Training hyper-parameters.
# NOTE(review): no `report_to` is given; the recorded run prompted for a
# Weights & Biases API key, so set it explicitly if that is unwanted.
# NOTE(review): no `metric_for_best_model` is given; in the recorded run the
# final evaluation matches the epoch-2 checkpoint (lowest validation loss),
# not the epoch-3 one with the highest accuracy.
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def compute_metrics(p):
    """Return the accuracy of the arg-max predictions held by *p*.

    *p* must expose ``predictions`` (per-class scores, arg-maxed along
    axis 1) and ``label_ids`` (the reference labels).
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    score = accuracy_score(p.label_ids, predicted_ids)
    return {'accuracy': score}

# Everything the Trainer needs: model, hyper-parameters, both datasets and
# the metric callback that adds accuracy to each evaluation.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

# Run fine-tuning.
trainer.train()

# Evaluate the resulting model on the validation split.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")




model.safetensors:  95%|#########5| 419M/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6692,0.579358,0.840661


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6692,0.579358,0.840661
2,0.3459,0.535677,0.877338
3,0.2124,0.568967,0.883167


Evaluation results: {'eval_loss': 0.5356768369674683, 'eval_accuracy': 0.8773378673791595, 'eval_runtime': 31.8116, 'eval_samples_per_second': 129.418, 'eval_steps_per_second': 16.189, 'epoch': 3.0}


# zero-shot-classification

Используем transformers.pipeline для инициализации zero-shot классификатора и задаем классы для классификации:

In [None]:
from transformers import pipeline
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Build the zero-shot classifier. Use the GPU when one is present and fall
# back to the CPU otherwise, so the cell also runs on machines without CUDA.
zero_shot_device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=zero_shot_device)

# Human-readable class names. The position of a name in this list is later
# used as the predicted label id, so the order has to follow the dataset's
# integer labels (NOTE(review): confirm against the dataset card).
candidate_labels = [
    "Analyst Update", "Fed | Central Banks", "Company | Product News", "Treasuries | Corporate Debt",
    "Dividend", "Earnings", "Energy | Oil", "Financials", "Currencies", "General News | Opinion",
    "Gold | Metals | Materials", "IPO", "Legal | Regulation", "M&A | Investments", "Macro",
    "Markets", "Politics", "Personnel Change", "Stock Commentary", "Stock Movement"
]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Теперь классифицируем каждый твит валидационного датасета:

In [None]:
predictions = []
for text in tqdm(df_valid["tokens"].tolist()):
    # The pipeline scores one (text, label) pair per candidate label.
    # Passing batch_size lets it push all of a tweet's pairs through the
    # model together instead of one pair per forward pass.
    result = classifier(
        text,
        candidate_labels,
        multi_label=False,  # pick exactly one class per tweet
        batch_size=len(candidate_labels),
    )
    # result["labels"] is sorted by score; store the index of the top label.
    predictions.append(candidate_labels.index(result["labels"][0]))


  0%|          | 0/4117 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Сравним предсказания с истинными метками валидационного датасета:

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Classification quality on the validation split.
y_true = df_valid["label"]
accuracy = accuracy_score(y_true, predictions)
# 'weighted' averaging is used because the task is multiclass.
# Some classes are never predicted by the zero-shot model, which leaves their
# precision undefined; zero_division=0 scores them as 0 explicitly instead of
# emitting an UndefinedMetricWarning (the resulting value is the same).
precision = precision_score(y_true, predictions, average='weighted', zero_division=0)
recall = recall_score(y_true, predictions, average='weighted')
f1 = f1_score(y_true, predictions, average='weighted')
conf_matrix = confusion_matrix(y_true, predictions)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.33
Precision: 0.46
Recall: 0.33
F1 Score: 0.31
Confusion Matrix:
[[  7   1   5   0   2  15   0   3   1   0   0   0   0   0  15   4   0   2
    4  14]
 [  1  61   2   0   0   0   4   6  15   1   0   0   3   0  66  40   7   6
    0   2]
 [  6   1 220   0  13   7   5  14  20   1   2  19  61   7 354  57   3  21
    7  34]
 [  1   2   3   0   0   0   3   1   3   0   0   1   1   1  21  30   2   0
    0   8]
 [  0   0   0   0  89   0   0   6   0   0   0   0   0   0   1   0   0   0
    0   1]
 [  1   0   3   0   2 172   0  37   0   0   0   0   0   0  17   4   0   0
    2   4]
 [  1   0   1   0   0   1  70   0   0   0   0   3   3   0  37  19   1   0
    0  10]
 [  0   1   5   0   0  99   0  15   0   0   1   0   1   0  22   8   0   1
    0   7]
 [  0   0   0   0   0   0   0   1  19   0   0   0   1   0   2   5   1   0
    0   3]
 [  3   4  11   0   1   1   5   7  24   2   0   3  12   3 216  30   5   5
    1   3]
 [  0   0   0   0   0   0   0   0   1   1   1   0   0   0   2   4   0   0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Вывод

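Лучшей точности (accuracy) на валидационной выборке мы достигли с дообученным BERT — 0.88. Для сравнения: LinearSVC на TF-IDF — 0.83, LinearSVC на эмбеддингах BERT без дообучения — 0.74, zero-shot-классификация — 0.33.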