# Imports

In [1]:
from typing import Any, Union
from pathlib import Path
import sys
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from tqdm import tqdm
import torch
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.calibration import LabelEncoder
import sklearn
import transformers

# Resolve the repository root through git so that every path below is independent
# of the directory the notebook kernel was started in (requires a git checkout).
git_toplevel = subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
repo_dir = Path(git_toplevel.decode().strip())
data_dir = repo_dir / "data"

# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it no longer appends the same entry again.
if str(repo_dir) not in sys.path:
    sys.path.append(str(repo_dir))
from src import bert, train, optim, engine

# Figure appearance: named matplotlib style sheet, "retina" inline figure format,
# and a higher default DPI for every figure created afterwards.
plt.style.use('Solarize_Light2')
%config InlineBackend.figure_format='retina'
plt.rcParams["figure.dpi"] = 150

# autoreload mode 2: re-import all modules before executing each cell, so edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data

In [2]:
data_fp = data_dir / "all-data.csv"
model_dir = repo_dir / "models/sentiment"
# parents=True: the target is nested ("models/sentiment"), so without it mkdir
# raises FileNotFoundError whenever "models/" does not exist yet.
model_dir.mkdir(parents=True, exist_ok=True)

# `names=` supplies the column labels, so the first line of the file is read as data.
# NOTE(review): cp866 is a DOS Cyrillic code page; confirm it matches the file's
# real encoding, otherwise non-ASCII characters may be mis-decoded without an error.
data = pd.read_csv(
    data_fp,
    names=["label", "text"],
    encoding="cp866",
)
data

Unnamed: 0,label,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [3]:
# Keep only texts with at least MIN_WORDS whitespace-separated tokens.
MIN_WORDS = 5

# `.str.len()` is the vectorized equivalent of `.apply(len)` and propagates NaN
# for missing texts; NaN >= MIN_WORDS is False, so such rows are dropped instead
# of raising TypeError from len() on a float.
text_words_amount = data["text"].str.split().str.len()
data = data[text_words_amount >= MIN_WORDS]

# Train

In [4]:
data_sample = data

# X stays a one-column DataFrame (double brackets) because the ColumnTransformer
# used later selects the "text" column by name; y is the label Series.
X = data_sample[["text"]]
y = data_sample["label"]

# Hold-out split. A later cell calls `pipeline.predict(X_test)`, so these names
# must exist on a fresh kernel (Restart & Run All); with the call commented out
# that cell fails with NameError.
# NOTE(review): the cross-validation cells still receive the full X / y, so
# X_test is not excluded from them — confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## BERT + boosting

In [25]:
# Feature step: the project's BertVectorizer (checkpoint name
# "distilbert-base-uncased") applied to the "text" column only.
bert_vectorizer = bert.BertVectorizer(bert_name="distilbert-base-uncased")
col_transformer = ColumnTransformer(
    transformers=[("bert_vectorizer", bert_vectorizer, "text")],
)

# Classifier step: LightGBM with class_weight="balanced".
model = LGBMClassifier(class_weight="balanced")

# Features first, classifier second.
pipeline = Pipeline(
    steps=[
        ("features", col_transformer),
        ("model", model),
    ],
)

# Cross-validate the whole pipeline on the full X / y; the helper returns the
# per-fold metrics that the following cell averages.
metrics_dict = train.train_and_cross_validate(
    pipeline,
    X,
    y,
    split_type="kfold",
)

Bert inference on 3224 texts: 100%|██████████| 33/33 [00:04<00:00,  6.74it/s]
Bert inference on 1612 texts: 100%|██████████| 17/17 [00:01<00:00, 16.35it/s]
Bert inference on 3224 texts: 100%|██████████| 33/33 [00:00<00:00, 237.27it/s]
Bert inference on 1612 texts: 100%|██████████| 17/17 [00:00<00:00, 1115.19it/s]
Bert inference on 3224 texts: 100%|██████████| 33/33 [00:01<00:00, 29.30it/s] 
Bert inference on 1612 texts: 100%|██████████| 17/17 [00:00<00:00, 17.38it/s]


defaultdict(list,
            {'acc': [0.565136476426799,
              0.7779156327543424,
              0.5942928039702233],
             'f1': [0.45035519426846204,
              0.5929626383428066,
              0.38225300031016324]})

In [26]:
# Average of the per-fold f1 scores collected by the cross-validation run.
np.asarray(metrics_dict["f1"]).mean()

0.4751902776404773

In [77]:
# Persist the current `pipeline` object under model_dir; the call returns the
# list of written file names, which is what the cell displays.
joblib.dump(value=pipeline, filename=model_dir / "model.pkl")

['/home/natitov/workflow/hse-financial-tonality/models/tonality/model.pkl']

In [78]:
from sklearn.metrics import confusion_matrix

# Reload the persisted pipeline (this rebinds `pipeline`) and predict labels for
# the hold-out texts.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files that
# were produced by a trusted run of this project.
pipeline = joblib.load(filename=model_dir / "model.pkl")
pred_test = pipeline.predict(X=X_test)

Bert inference on 485 texts:   0%|          | 0/5 [00:00<?, ?it/s]

In [27]:
# confusion_matrix = sklearn.metrics.confusion_matrix(y_test, pred_test)

# confusion_matrix_display = sklearn.metrics.ConfusionMatrixDisplay(
#     confusion_matrix,
#     display_labels=pipeline.named_steps['model']._classes,
# ).plot()

# confusion_matrix_display.ax_.grid(False);

## Rule-based

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


class VaderPredictor(TransformerMixin, BaseEstimator):
    """Rule-based sentiment predictor driven by VADER's compound polarity score.

    Each sentence is scored with ``SentimentIntensityAnalyzer.polarity_scores``
    and the ``"compound"`` value is mapped to one of the string labels
    ``"positive"``, ``"negative"`` or ``"neutral"``.

    Parameters
    ----------
    pos_threshold : float, default=0.05
        Compound scores greater than or equal to this value are "positive".
    neg_threshold : float, default=-0.05
        Compound scores less than or equal to this value are "negative".
        Anything strictly between the two thresholds is "neutral".
    """

    def __init__(self, pos_threshold: float = 0.05, neg_threshold: float = -0.05) -> None:
        super().__init__()
        # Stored under the same names as the arguments so that BaseEstimator's
        # get_params / set_params / clone can discover them.
        self.pos_threshold = pos_threshold
        self.neg_threshold = neg_threshold
        self.vader = SentimentIntensityAnalyzer()

    def fit(self, X, y=None):
        """No-op: the predictor is purely rule-based. Returns ``self``."""
        return self

    def predict(self, X: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
        """Predict a sentiment label for every sentence in ``X``.

        Parameters
        ----------
        X : array-like of str
            A single-column DataFrame, a Series, an ndarray or a plain sequence
            of sentences. The input is flattened before scoring.

        Returns
        -------
        np.ndarray
            1-D object array with one label per flattened element of ``X``.
        """
        # np.asarray handles pandas objects as well as lists/arrays, so callers
        # are no longer required to pass something exposing `.values`.
        sentences = np.asarray(X, dtype="object").ravel()
        return np.array(
            [self.get_sentiment(sentence) for sentence in sentences],
            dtype="object",
        )

    def get_sentiment(self, sentence):
        """Return "positive", "negative" or "neutral" for a single sentence."""
        compound = self.vader.polarity_scores(sentence)["compound"]

        if compound >= self.pos_threshold:
            return "positive"
        if compound <= self.neg_threshold:
            return "negative"
        return "neutral"

In [20]:
# Evaluate the rule-based predictor with the same cross-validation helper used
# for the other models, then show each metric's mean rounded to two decimals.
pipeline = VaderPredictor()
metrics_dict = train.train_and_cross_validate(
    pipeline,
    X,
    y,
    split_type="kfold",
)
{name: round(np.mean(fold_values), 2) for name, fold_values in metrics_dict.items()}

{'acc': 0.5430107526881721, 'f1': 0.4308794343578661}

## BERT fine-tuning

In [16]:
from focal_loss import FocalLoss

# Use the GPU when one is available and fall back to CPU otherwise, so the cell
# does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = bert.BertFineTuned(use_softmax_in_forward=True)
# criterion = torch.nn.CrossEntropyLoss()

# Inverse-frequency class weights: share of each label in `data`, ordered by
# label via sort_index(), then inverted.
# NOTE(review): assumes this label order matches the class-index order used when
# labels are encoded inside train_bert_and_cross_validate — confirm.
label_rates = data["label"].value_counts() / data.shape[0]
label_rates = label_rates.sort_index()
weights = [1 / rate for rate in label_rates]

criterion = FocalLoss(
    gamma=1,
    weights=torch.tensor(weights, device=device),
)

# Optimisation hyperparameters forwarded to the training helper.
# NOTE(review): `milestones` presumably lists epochs for a learning-rate
# schedule — confirm against bert.train_bert_and_cross_validate.
milestones = [15]
lr = 0.005

epochs = 30

In [17]:
# Cross-validated fine-tuning run on the full X / y with the model, loss and
# hyperparameters configured in the previous cell.
metrics_dict = bert.train_bert_and_cross_validate(
    model,
    X=X,
    y=y,
    criterion=criterion,
    lr=lr,
    milestones=milestones,
    epochs=epochs,
    split_type="kfold",
    device=device,
)

Training::   0%|          | 0/30 [00:00<?, ?it/s]

acc: 0.65
f1: 0.54


Training::   0%|          | 0/30 [00:00<?, ?it/s]

acc: 0.70
f1: 0.58


Training::   0%|          | 0/30 [00:00<?, ?it/s]

acc: 0.60
f1: 0.50


In [9]:
# Mean of every metric across folds, rounded to two decimals for display.
{name: round(np.mean(fold_values), 2) for name, fold_values in metrics_dict.items()}

{'acc': 0.69, 'f1': 0.55}