Часть 1

In [1]:
from base64 import decode
import glob
import pandas as pd
from os import listdir
from os.path import isfile, join
import re
import pymorphy2
from pymorphy2 import MorphAnalyzer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np
from gensim.models import Word2Vec 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter, defaultdict
from gensim.models.doc2vec import *

import warnings
warnings.filterwarnings("ignore")

regex = re.compile("[А-Яа-я:=!\)\()A-z\_\%/|]+")

def words_only(text, regex=regex):
    return " ".join(regex.findall(str(text)))


def lemmatize(doc):
    #doc = re.sub(patterns, ' ', doc)
    #print('doc = ',doc)
    tokens = []
    for token in doc.split():
        if token and token not in stopwords_en:
            token = token.strip()
            token = morph.normal_forms(token)[0]
            tokens.append(token)
    if len(tokens) > 2:
        return ' '.join(tokens).strip()
    return None


class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(word2vec.popitem()[1])
        # self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(w2v.popitem()[1])
        # self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

class Doc2VecVectorizer(object):
    def __init__(self, d2v_model):
        self.d2v_model = d2v_model

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([self.d2v_model.infer_vector(text.split()) for text in X])


path = '/home/mikhail/it-academy/модуль 6/bbc/'
dfs = []
for i, dirname in enumerate(listdir(path) ):
    filenames = glob.glob(path + dirname + '/*.txt')
    for filename in filenames:
         text = open(filename, mode="rb").read()
         dfs.append((text, dirname))


df = pd.DataFrame(dfs, columns=['text', 'target'])

#regex = re.compile("[А-Яа-я:=!\)\()A-z\_\%/|]+")
stopwords_en = stopwords.words('english')
morph = MorphAnalyzer()
df.text = df.text.apply(words_only)
df.text = df.text.apply(lemmatize)
df.text = df.text.apply(lambda x: x.replace('\\n',' '))
df.text = df.text.apply(lambda x: x[1:])
df.text = df.text.apply(lambda x: x.replace('\\', ''))
#df.to_csv('/home/mikhail/it-academy/модуль 6/bbc/news.csv')

X = df.text.tolist()
y = df.target.tolist()
X, y = np.array(X), np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)
X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, test_size=0.5)
X_val, X_inference, y_val, y_inference = train_test_split(X_val,y_val, test_size=0.5)

#*********************** MeanEmbeddingVectorizer and TfidfEmbeddingVectorizer **************************************************
texts = [str(df.text.iloc[i].split()) for i in range(len(df))]
word_to_vec = Word2Vec(texts, vector_size=100, window=5, min_count=1, workers=-1)
word_to_vec.save('./word2vec.pk')
w2v = dict(zip(word_to_vec.wv.index_to_key, word_to_vec.wv.vectors))
#*******************************************************************************************************************************

#***************** Doc2VecVectorizer *******************************************************************************************
splitted_texts = [text.split() for text in X]
idx = [str(i) for i in range(len(X))]
docs = []
# Чтобы модель поняла отношения документов по Id нам надо пронумеровать эти документы, это делается следующим образом:
for i in range(len(X)):
    docs.append(TaggedDocument(splitted_texts[i], [idx[i]]))  
model = Doc2Vec(vector_size=300, window=5, min_count=5, workers=8, alpha=0.025, min_alpha=0.01, dm=0)
# плюс в начале теперь нам надо построить словарь id докумеентов
model.build_vocab(docs)
model.train(docs, total_examples=len(docs), epochs=20)
#*******************************************************************************************************************************


rfc_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
                    ("extra trees", RandomForestClassifier(n_estimators=20))])

rfc_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
                          ("extra trees", RandomForestClassifier(n_estimators=20))])

rfc_d2v = Pipeline([
    ("word2vec vectorizer", Doc2VecVectorizer(model)),
    ("extra trees", RandomForestClassifier(n_estimators=20))])



rfc_w2v.fit(X_train,y_train)

pred = rfc_w2v.predict(X_test)
pred_val = rfc_w2v.predict(X_val)
pred_inference = rfc_w2v.predict(X_inference)

print('Test metrics for MeanEmbeddingVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_test, pred, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, pred, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, pred, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, pred)))

print()
print('Val metrics for MeanEmbeddingVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_val, pred_val, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_val, pred_val, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_val, pred_val, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_val, pred_val)))

print()
print('Inference metrics for MeanEmbeddingVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_inference, pred_inference, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_inference, pred_inference, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_inference, pred_inference, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_inference, pred_inference)))


#print(classification_report(y_test, pred))

#abels = rfc_w2v.classes_

#sns.heatmap(data=confusion_matrix(y_test, pred), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
#plt.title("Confusion matrix")
#plt.show()


rfc_w2v_tfidf.fit(X_train,y_train)

pred = rfc_w2v_tfidf.predict(X_test)
pred_val = rfc_w2v_tfidf.predict(X_val)
pred_inference = rfc_w2v_tfidf.predict(X_inference)

print()
print('Test metrics for TfidfEmbeddingVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_test, pred, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, pred, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, pred, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, pred)))

print()
print('Val metrics for TfidfEmbeddingVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_val, pred_val, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_val, pred_val, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_val, pred_val, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_val, pred_val)))

print()
print('Inference metrics for TfidfEmbeddingVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_inference, pred_inference, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_inference, pred_inference, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_inference, pred_inference, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_inference, pred_inference)))

#print(classification_report(y_test, pred))

#labels = rfc_w2v.classes_

#sns.heatmap(data=confusion_matrix(y_test, pred), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
#plt.title("Confusion matrix")
#plt.show()


rfc_d2v.fit(X_train,y_train)

pred = rfc_d2v.predict(X_test)
pred_val = rfc_d2v.predict(X_val)
pred_inference = rfc_d2v.predict(X_inference)

print()
print('Test metrics for Doc2VecVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_test, pred, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, pred, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, pred, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, pred)))

print()
print('Val metrics for Doc2VecVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_val, pred_val, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_val, pred_val, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_val, pred_val, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_val, pred_val)))

print()
print('Inference metrics for Doc2VecVectorizer:')
print("Precision: {0:6.2f}".format(precision_score(y_inference, pred_inference, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_inference, pred_inference, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_inference, pred_inference, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_inference, pred_inference)))

#print(classification_report(y_test, pred))
#labels = rfc_w2v.classes_



#sns.heatmap(data=confusion_matrix(y_test, pred), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
#plt.title("Confusion matrix")
#plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mikhail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Test metrics for MeanEmbeddingVectorizer:
Precision:   0.54
Recall:   0.52
F1-measure:   0.52
Accuracy:   0.53

Val metrics for MeanEmbeddingVectorizer:
Precision:   0.56
Recall:   0.55
F1-measure:   0.55
Accuracy:   0.59

Inference metrics for MeanEmbeddingVectorizer:
Precision:   0.52
Recall:   0.53
F1-measure:   0.52
Accuracy:   0.53

Test metrics for TfidfEmbeddingVectorizer:
Precision:   0.54
Recall:   0.53
F1-measure:   0.53
Accuracy:   0.54

Val metrics for TfidfEmbeddingVectorizer:
Precision:   0.55
Recall:   0.52
F1-measure:   0.53
Accuracy:   0.57

Inference metrics for TfidfEmbeddingVectorizer:
Precision:   0.56
Recall:   0.56
F1-measure:   0.55
Accuracy:   0.56

Test metrics for Doc2VecVectorizer:
Precision:   0.81
Recall:   0.79
F1-measure:   0.79
Accuracy:   0.80

Val metrics for Doc2VecVectorizer:
Precision:   0.80
Recall:   0.80
F1-measure:   0.78
Accuracy:   0.79

Inference metrics for Doc2VecVectorizer:
Precision:   0.83
Recall:   0.82
F1-measure:   0.82
Accuracy:   0

Часть 2  !!! На гитхабе есть версия ноутбука для гугл колаб, так как на моем компе не хватает ресурсов чтобы отладить этот код !!!

In [None]:
import transformers
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DistilBertTokenizer
import pandas as pd
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np
from gensim.models import Word2Vec 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import torch


import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv('/home/mikhail/it-academy/модуль 6/bbc/news.csv')
data=data.sample(frac=1.0, random_state=42)
#data = data.iloc[:500]


labels=data.target.unique()
NUM_LABELS= len(labels)
id2label={i:l for i,l in enumerate(labels)}
label2id={l:i for i,l in enumerate(labels)}
data["labels"]=data.target.map(lambda x: label2id[x])

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", max_length=512)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",  num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
#model.to('cpu')

SIZE= data.shape[0]

df_train, df_test, y_train, y_test = train_test_split(data.text, data.labels, test_size=0.3)
df_test, df_val, y_test, y_val = train_test_split(df_test, y_test, test_size=0.5)
df_val, df_inference, y_val, y_inference = train_test_split(df_val, y_val, test_size=0.5)

train_texts= list(df_train)
val_texts=   list(df_val)
test_texts=  list(df_test)
inference_texts=  list(df_inference)

train_labels= list(y_train)
val_labels=   list(y_val)
test_labels=  list(y_test)
inference_labels=  list(y_inference)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
val_encodings  = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
inference_encodings = tokenizer(inference_texts, truncation=True, padding=True, return_tensors="pt")

class MyDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)
test_dataset = MyDataset(test_encodings, test_labels)
inference_dataset = MyDataset(inference_encodings, inference_labels)

training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='/content/outputs/',
    #  The number of epochs, defaults to 3.0
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    # Number of steps used for a linear warmup
    #warmup_steps=10,
    weight_decay=0.01
    #no_cuda=True
   # TensorBoard log directory
    #fp16=True
)

trainer = Trainer(
    # the pre-trained model that will be fine-tuned
    model=model,
     # training arguments that we defined above
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics= compute_metrics
)

result = trainer.train()

q=[trainer.evaluate(eval_dataset=data) for data in [train_dataset, val_dataset, test_dataset, inference_dataset]]
print(pd.DataFrame(q, index=["train","val","test","inference"]).iloc[:,:5])


"""
def predict(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return probs


inference_texts = [
('Inflation in Turkey has climbed above 83% - a 24-year-high. The transport, food and housing sectors have seen the biggest rise in prices. Independent experts the Inflation Research Group estimate the annual rate is actually 186.27%.',label2id['business']), #business
('The pound has climbed after the chancellor reversed his controversial decision to scrap the top rate of tax.Sterling gained more than 1% to $1.1284 before falling back slightly while government borrowing costs edged lower.Tory MPs had threatened to vote against Kwasi Kwarteng\'s plan, saying it was unfair when living costs were so high.',label2id['business']),    #business
('Australia coach Mal Meninga has named 13 uncapped players in his squad as they chase a third men\'s Rugby League World Cup title in a row.The Kangaroos beat England in the 2017 final but have only played four Tests since with their last match a shock loss to Tonga three years ago.Sydney Roosters full-back James Tedesco, who represented Italy at the last two World Cups, captains the side.',label2id['sport']),  #sport
('England will play India in the group stage of the 2023 Women\'s T20 World Cup in South Africa.The two sides have been placed into Group B alongside West Indies, Pakistan and Ireland.Defending champions Australia, New Zealand, hosts South Africa, Sri Lanka and Bangladesh make up Group A with the top two in each group progressing to the semi-finals.The tournament takes place between 10 and 26 February.',label2id['sport']), #sport
('During World War II, Spitfire pilots described their plane as so responsive it felt like an extension of their limbs.Fighter pilots of the 2030s, however, will have an even closer relationship with their fighter jet.It will read their minds.',label2id['tech']), #tech
('In deep, astonishingly clear, blue-lit ponds some 40m (130ft) beneath the Swedish countryside, lies decades worth of high-level nuclear waste.It is an oddly beautiful and rather disturbing sight. Row upon row of long metal containers, filled with used nuclear fuel from the country\'s reactors, lie below the surface near Oskarshamn, on Sweden\'s Baltic coast.It is both highly lethal and entirely safe.',label2id['tech']) #tech
]

df = pd.DataFrame(inference_texts, columns=['texts', 'labels'])

inference_acc = 0
n_iter = 0
for text, label in zip(df.texts, df.labels):
    probs = predict(text)
    #print(probs.argmax(), label, n_iter)
    inference_acc += (probs.argmax() == label).sum().item()
    n_iter += 1

print('Inference_acc: {}'.format(inference_acc / n_iter ))
"""

Часть 3. Вывод

Подход DistillBert работает намного лучше чем классическое машинное обучение с моделью на RandomForestClassifier. 
Метрика Acc 0.82 для классического машинного обучения c векторным представлением слов doc2vec.
Метрика Acc 0.95 для DistillBert.
Предположу, что это связано с тем, что DistillBert обучался на гораздо большем объеме данных чем у меня есть для данной задачи. И еще вдобавок  DistillBert дообучал на данных, что есть у меня. Но классическая модель обучилась гораздо быстрее чем тяжелый DistillBert (4 часа в гугл колабе), что привело к потере времени при отладке. Так же мне не хватило ресурсов компьютера чтобы дообучить DistillBert и пришлось использовать гугл колаб (пока бесплатно, но для больших датасетов нужно было бы платить за подписку, а это дополнительные расходы для компании). Поэтому если датасет не большой и для задачи нет необходимости высокой точности, то достаточно классического варианта. В противном случаи DistillBert.