In [1]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, T5ForConditionalGeneration
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from time import time
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score, precision_recall_fscore_support
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from datasets import Dataset
import pyarrow as pa
from statistics import mean
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
import itertools
import _pickle as cPickle

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
device = torch.device('cuda')

In [3]:
def train_cv(model, x_train, y_train):

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    scores = []

    x_skf = np.array(x_train)
    y_skf = np.array(y_train)
    for train_index, val_index in skf.split(x_skf, y_skf):
        xfold_train, xfold_val = x_skf[train_index], x_skf[val_index]
        yfold_train, yfold_val = y_skf[train_index], y_skf[val_index]

        model.fit(xfold_train, yfold_train)

        predictions_val =  model.predict(xfold_val)
        # predict_proba.append(model.predict_proba(xfold_val))

        score = f1_score(yfold_val, predictions_val, average='macro')
        print(classification_report(yfold_val, predictions_val, output_dict=True))
        scores.append(score)

    print("Mean  F1 =", round(mean(scores),4))
    print("Max   F1 =", round(max(scores),4))
    print("Min   F1 =", round(min(scores),4))
    print("Std  dev =", round(np.std(scores),4))

    return scores #, predict_proba

In [4]:
def train(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    predictions_test =  model.predict(x_test)
    report = classification_report(y_test, predictions_test, output_dict=True)
    print(report)
    print(report['macro avg']['f1-score'])

# Классификация векторов BERT

## Загрузка модели и данных

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.4.0-py3-none-any.whl (149 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/149.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.4.0


In [None]:
from transformers import AutoModel
from sentence_transformers import SentenceTransformer
num_authors = 26
model_name = 'intfloat/multilingual-e5-large'
#model_name = '/content/drive/MyDrive/classifier_author_texts_18'
#tokenizer = BertTokenizer.from_pretrained(model_name+'/tokenizer')
tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModel.from_pretrained(model_name, output_hidden_states = True, return_dict=True)
model = SentenceTransformer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [5]:
path = 'drive/MyDrive/Colab Notebooks/Классификация авторов/VectorClassification/datasets(texts)/'
train_file = path + 'train_authors_texts.csv'
df_train = pd.read_csv(train_file, encoding="utf-8", sep="\t")
val_file = path + 'val_authors_texts.csv'
df_val = pd.read_csv(val_file, encoding="utf-8", sep="\t")
test_file = path + 'test_authors_texts.csv'
df_test = pd.read_csv(test_file, encoding="utf-8", sep="\t")

In [10]:
labels = 26
def add_multi_label(df):
  labels_matrix = np.zeros((len(df), labels))
  for i, label in enumerate(list(df['label'])):
    labels_matrix[i][label] = 1
  df['labels'] = labels_matrix.tolist()
  return df['labels']

In [11]:
df_train['labels'] = add_multi_label(df_train)
df_val['labels'] = add_multi_label(df_val)
df_test['labels'] = add_multi_label(df_test)

In [12]:
df_train

Unnamed: 0,text,author,lang,label,labels
0,Nach dem Terror-Angriff der Hamas auf Israel i...,de_neutral,de,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
1,На так званій росії генерал-лейтенанта Мурадов...,sternenko,uk,22,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,وقال المتحدث الرسمي باسم الحكومة الأردنية، مهن...,ar_neutral,ar,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Polémica La millonaria cifra que el príncipe A...,es_neutral,es,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,أظهر مقطع فيديو لحظة بكاء طفلة فلسطينية من قطا...,ar_neutral,ar,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
119639,"E3 отменят? VR от Blizzard, новые Prince of P...",puchkov,ru,19,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
119640,Shares of Bharat Forge Ltd. traded 0.7 per cen...,en_neutral,en,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
119641,Aquí podrás encontrar la farmacia de guardia m...,es_neutral,es,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
119642,La cosecha de medallas fue muy buena. Los tres...,es_neutral,es,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
df_val

Unnamed: 0,text,author,lang,label,labels
0,İstanbul Maltepe Belediyesi'nin büyüme çağında...,tr_neutral,tr,23,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Gregg Wallace has revealed the 'troubling' rea...,en_neutral,en,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,"Alle Mitarbeiter in Sean Shermans Restaurant ""...",de_neutral,de,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,История моих предков. Часть I. Происхождение ф...,lebedev,ru,15,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,C'est devenu sa marque de fabrique. Raconter l...,fr_neutral,fr,11,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
14766,The Philadelphia 76ers are basking in the afte...,en_neutral,en,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
14767,"Pendant au moins trois ans, de 2019 à 2022, un...",fr_neutral,fr,11,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
14768,La FIFA et la Confédération sud-américaine de ...,fr_neutral,fr,11,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
14769,Le grand ballet diplomatique français donne pa...,fr_neutral,fr,11,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
df_test

Unnamed: 0,text,author,lang,label,labels
0,صناعة الذرائع في لوم الخرطوم ... تنصل المثقف ا...,ar_neutral,ar,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Ed West is a prominent British conservative jo...,dreher,en,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
2,Как мне жаль Украину! Жаль до слёз эту прекрас...,prohanov,ru,18,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Politico опубликовало традиционный список из 2...,ru_neutral,ru,20,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Український телеведучий Анатолій Анатолич відк...,uk_neutral,uk,24,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
13289,Die Wirtschaftslage in Russland scheint angesi...,de_neutral,de,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
13290,Ce qui ne vous tue pas vous rend plus fort. Te...,fr_neutral,fr,11,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13291,Россия на «Суде мирового презрения».\nПредседа...,puchkov,ru,19,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13292,La loi immigration longtemps repoussée déboule...,fr_neutral,fr,11,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
for author in list(df_test.author.unique()):
  print(f"{author} --- {df_test[df_test['author'] == author].iloc[0]['label']}")

ar_neutral --- 1
dreher --- 7
prohanov --- 18
ru_neutral --- 20
uk_neutral --- 24
fr_neutral --- 11
sternenko --- 22
wirth --- 25
tr_neutral --- 23
melenchon --- 16
en_neutral --- 8
es_neutral --- 9
de_neutral --- 5
getmantsev --- 12
calislar --- 4
lebedev --- 15
moulitsas --- 17
puchkov --- 19
fernandez --- 10
aydin --- 2
hadi --- 14
h16 --- 13
alsalabi --- 0
slavic --- 21
beckedahl --- 3
dominguez --- 6


## Получение представления текстов (векторов BERT)

In [None]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def get_embeddings(df):
    bert_vectors = []
    texts = []
    embeddings = []
    count = 0
    for text in df['text']:
        count += 1
        embedding = model.encode(text, normalize_embeddings=True)
        embeddings.append(embedding)
        print(f'Get {count} vectors')
    return embeddings

In [None]:
def save_vectors(df, embeddings, filename):
    df_vectors = pd.DataFrame()
    for i in list(range(len(embeddings[0]))):
        df_vectors[i] = np.array(embeddings)[:, i]
    df_vectors['label'] = df['label']
    df_vectors.to_csv(filename, encoding="utf-8", sep="\t", index=False)
    print(f"{len(df_vectors)} save to {filename}")

In [None]:
bert_vectors_train = get_embeddings(df_train)
save_vectors(df_train, bert_vectors_train, path+'train_vectors.csv')

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Get 2276 vectors
Get 2277 vectors
Get 2278 vectors
Get 2279 vectors
Get 2280 vectors
Get 2281 vectors
Get 2282 vectors
Get 2283 vectors
Get 2284 vectors
Get 2285 vectors
Get 2286 vectors
Get 2287 vectors
Get 2288 vectors
Get 2289 vectors
Get 2290 vectors
Get 2291 vectors
Get 2292 vectors
Get 2293 vectors
Get 2294 vectors
Get 2295 vectors
Get 2296 vectors
Get 2297 vectors
Get 2298 vectors
Get 2299 vectors
Get 2300 vectors
Get 2301 vectors
Get 2302 vectors
Get 2303 vectors
Get 2304 vectors
Get 2305 vectors
Get 2306 vectors
Get 2307 vectors
Get 2308 vectors
Get 2309 vectors
Get 2310 vectors
Get 2311 vectors
Get 2312 vectors
Get 2313 vectors
Get 2314 vectors
Get 2315 vectors
Get 2316 vectors
Get 2317 vectors
Get 2318 vectors
Get 2319 vectors
Get 2320 vectors
Get 2321 vectors
Get 2322 vectors
Get 2323 vectors
Get 2324 vectors
Get 2325 vectors
Get 2326 vectors
Get 2327 vectors
Get 2328 vectors
Get 2329 vectors


KeyboardInterrupt: 

In [None]:
bert_vectors_val = get_embeddings(df_val)
save_vectors(df_val, bert_vectors_val, path+'val_vectors.csv')

In [None]:
bert_vectors_test = get_embeddings(df_test)
save_vectors(df_test, bert_vectors_test, path+'test_vectors.csv')

## Загрузка векторов

In [17]:
import pandas as pd
#path = 'turkish_authors/e5_vectors/'
path = 'drive/MyDrive/Colab Notebooks/Классификация авторов/VectorClassification/datasets(texts)/'
df_train = pd.read_csv(path+"train_vectors.csv", encoding="utf-8", sep="\t")
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,0.049548,-0.020400,-0.027529,-0.016268,0.020281,-0.013880,-0.024885,0.069601,0.074783,-0.012459,...,-0.009210,0.007953,-0.033918,0.001218,0.001557,0.036626,-0.034014,-0.038638,0.042543,5
1,0.014352,-0.023455,-0.045007,-0.046314,0.020391,-0.038322,-0.012129,0.081324,0.051771,-0.017749,...,-0.009121,-0.004058,0.025236,-0.004582,-0.013741,0.003851,0.000233,0.015133,0.003553,22
2,0.030150,-0.010687,-0.027037,-0.053753,0.042916,-0.005888,-0.002136,0.067005,0.048112,-0.030644,...,-0.027848,0.005278,-0.009729,-0.017662,0.015032,-0.000268,-0.019518,-0.043615,0.018826,1
3,0.028565,-0.034172,-0.047654,-0.043606,0.032001,-0.008881,-0.028124,0.076012,0.053898,-0.036697,...,-0.044766,0.004748,0.007245,-0.024546,0.040605,0.008426,0.027554,-0.042575,0.009297,9
4,0.010649,0.006392,-0.006769,-0.025186,0.019954,-0.015835,0.008966,0.095919,0.047048,-0.017471,...,-0.024249,0.026391,-0.015686,0.014521,0.032558,0.001497,-0.029999,-0.053983,-0.018113,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119639,-0.015078,-0.012648,-0.009050,-0.029399,0.009954,-0.013806,-0.003557,0.111984,0.050541,-0.032721,...,-0.028016,0.015218,-0.005382,-0.019711,0.012766,0.023267,-0.013829,-0.040472,-0.007936,19
119640,0.042109,0.017255,-0.024393,-0.008109,0.004987,-0.038743,0.000594,0.093887,0.065891,-0.013828,...,-0.003819,0.016940,-0.019426,0.007284,-0.008806,0.023666,-0.025524,-0.046690,0.028556,8
119641,0.011157,-0.014598,-0.015736,-0.064569,0.022116,-0.035552,-0.021101,0.094191,0.029341,-0.020253,...,0.007973,0.035960,-0.005713,-0.017259,0.037231,0.018802,-0.021838,-0.022682,-0.022317,9
119642,0.041051,-0.007936,-0.033900,-0.026776,0.031053,-0.024563,-0.023942,0.090796,0.036256,-0.032763,...,-0.011615,0.024615,-0.014983,-0.006754,0.012344,0.022317,0.013367,-0.044166,-0.004096,9


In [18]:
df_val = pd.read_csv(path + "val_vectors.csv",encoding="utf-8", sep="\t")
df_val

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,0.036151,-0.030085,0.016784,-0.031273,0.015556,-0.028578,-0.009960,0.055383,0.034544,-0.040616,...,-0.013368,0.005141,-0.057476,0.011565,0.017015,0.009020,0.004610,-0.047970,0.004311,23
1,0.016543,-0.016406,0.007426,-0.017977,0.040241,-0.017388,0.002207,0.078832,0.050977,-0.021896,...,-0.049953,0.005820,-0.033784,-0.015385,0.028504,-0.006765,-0.033924,-0.022050,-0.002776,8
2,0.013766,-0.020152,-0.020419,-0.042287,0.034894,-0.026104,0.009226,0.096106,0.044564,-0.039826,...,-0.021454,0.013812,-0.024058,0.000385,0.044764,0.017694,-0.019175,-0.020463,0.031551,5
3,0.018962,0.015989,-0.051553,-0.006473,0.044216,-0.021238,0.005617,0.110099,0.053895,-0.020654,...,-0.017837,-0.003442,0.005966,-0.013263,0.020226,-0.001412,-0.022851,-0.035692,0.018188,15
4,0.024092,-0.006889,-0.001940,-0.038384,0.028672,-0.039093,-0.006896,0.044859,0.055177,-0.021256,...,-0.017233,0.004330,0.024712,-0.012178,-0.009761,0.020166,-0.015720,-0.046073,0.007423,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14766,0.038551,-0.033859,-0.004036,-0.023709,0.036776,-0.036333,0.009331,0.081319,0.045602,-0.015489,...,-0.020106,0.023846,-0.020222,-0.003679,0.021492,0.032370,-0.029865,-0.050959,0.025017,8
14767,0.011124,0.001766,-0.030560,-0.041406,0.031346,-0.022633,-0.016616,0.031889,0.078649,-0.010348,...,-0.008196,0.021390,0.000362,-0.020317,-0.013777,-0.003429,-0.007758,-0.026423,-0.018705,11
14768,0.033099,0.012896,-0.011967,-0.026890,0.027344,-0.028306,-0.003025,0.057339,0.063550,-0.037263,...,-0.017526,0.026654,0.006329,0.006977,0.004146,0.005234,-0.005193,0.005416,-0.015434,11
14769,0.040037,-0.012550,0.004039,-0.050461,0.030228,-0.015041,0.006961,0.070151,0.059241,-0.015457,...,-0.012273,-0.000701,0.010288,-0.002324,0.001777,0.004264,-0.024936,-0.049297,0.003382,11


In [19]:
df_test = pd.read_csv(path + "test_vectors.csv",encoding="utf-8", sep="\t")
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,0.024970,-0.013189,-0.002762,-0.047500,0.043571,0.001058,-0.041063,0.072094,0.033540,-0.017108,...,-0.041269,0.003985,-0.006302,0.005119,0.003856,0.011106,-0.008088,-0.051160,0.017796,1
1,0.019129,-0.032067,-0.026443,-0.014677,0.026069,-0.036322,-0.016152,0.085582,0.060161,-0.053815,...,-0.009879,0.014071,0.016639,0.003668,0.013067,-0.001339,0.010888,-0.046496,-0.010419,7
2,0.030154,-0.017756,-0.030906,-0.041982,0.044054,-0.039513,-0.001940,0.101917,0.054583,-0.036269,...,0.000063,-0.015189,0.034607,-0.004162,0.020366,0.007524,-0.021096,-0.046749,0.006795,18
3,0.025446,-0.014192,0.000621,-0.046653,0.030688,-0.006960,-0.025989,0.120119,0.047996,-0.007980,...,-0.026995,-0.004240,-0.035749,-0.023387,0.005120,0.038267,0.033435,-0.044209,-0.020609,20
4,0.032850,-0.021648,-0.016661,-0.026936,0.020951,-0.020157,0.018011,0.093798,0.012013,-0.026423,...,0.000309,0.036714,0.004230,-0.024566,0.020971,0.007264,-0.045502,-0.017684,0.029045,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13289,0.042521,-0.004460,-0.032630,-0.069768,0.050485,-0.002357,-0.004254,0.103064,0.054196,-0.020777,...,0.006365,-0.026848,-0.013830,-0.001211,0.017119,0.031995,0.007703,-0.026189,0.002695,5
13290,0.005562,-0.010892,-0.017003,-0.040933,0.025946,-0.034321,-0.021301,0.075307,0.041664,-0.042644,...,-0.017866,0.003498,0.009371,-0.003990,0.015289,-0.021018,0.005779,-0.018020,0.005093,11
13291,-0.009999,-0.031345,-0.024693,-0.040379,0.016567,-0.029328,-0.018781,0.094340,0.050961,-0.019804,...,-0.030263,0.006378,-0.016450,-0.026167,0.021211,0.005930,0.008648,-0.037354,0.014597,19
13292,0.010538,-0.000301,-0.024017,-0.032304,0.020829,-0.006812,-0.001027,0.069674,0.047510,0.003435,...,-0.031110,0.008995,-0.007827,0.009041,0.015657,0.001099,-0.024447,-0.061848,-0.014628,11


In [24]:
df_train['labels'] = add_multi_label(df_train)
df_val['labels'] = add_multi_label(df_val)
df_test['labels'] = add_multi_label(df_test)

In [45]:
x_train = df_train.drop(columns=['label', 'labels'])
y_train = np.argmax(y_train, axis=1)

In [21]:
x_val = df_val.drop(columns=['label'])
y_val = df_val['label']

In [22]:
x_test = df_test.drop(columns=['label'])
y_test = df_test['label']

## Обучение и оценка моделей на валидационных данных. Подбор гиперпараметров

In [None]:
n_estimators = [100, 200, 300, 400, 500]
max_depth = [None, 5]
for param in  itertools.product(n_estimators, max_depth):
    print(f"n_estimators = {param[0]}, max_depth = {param[1]}")
    model = RandomForestClassifier(random_state=42, n_estimators = param[0], max_depth = param[1])
    train_cv(model, x_train, y_train)

n_estimators = 100, max_depth = None
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 23929}, '

In [None]:
n_estimators = [100, 200, 300, 400, 500]
max_depth = [None, 5]
for param in  itertools.product(n_estimators, max_depth):
    print(f"n_estimators = {param[0]}, max_depth = {param[1]}")
    model = RandomForestClassifier(random_state=42, n_estimators = param[0], max_depth = param[1])
    train_cv(model, x_train, y_train)

In [43]:
from sklearn.svm import SVC
C = [0.1, 1, 10, 100]
kernels = ['poly', 'rbf', 'sigmoid']
for param in  itertools.product(kernels, C):
    print(f"kernel = {param[0]}, C = {param[1]}")
    model = SVC(random_state=42, kernel=param[0], C = param[1])
    train_cv(model, x_train, y_train)

kernel = poly, C = 0.1


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

In [44]:
from sklearn.linear_model import LogisticRegression
C = [0.1, 1, 10, 100, 1000]
for param in  itertools.product(C):
    print(f"C = {param[0]}")
    model = LogisticRegression(random_state=42, C = param[0], max_iter=5000)
    train_cv(model, x_train, y_train)

C = 0.1


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

In [40]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = [3 + i for i in range(2, 26, 2)]
for param in  itertools.product(n_neighbors):
    print(f"n_neighbors = {param[0]}")
    model = KNeighborsClassifier(n_neighbors=param[0])
    train_cv(model, x_train, y_train)

n_neighbors = 5


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

## Обучение итоговой модели и оценка на тестовых данных

Оценка BERT: mean F1 = 0,7426

Оценка Random Forest: mean F1 = 0,7631

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

Данило Гетманцев --- 12 \\
Rod Dreher --- 9 \\
Abel Fernández --- 0 \\
Сергій Стерненко --- 14  \\
Артемий Лебедев --- 11 \\
Jean-Luc Mélenchon --- 4 \\
Александр Проханов --- 10 \\
José García Domínguez --- 5 \\
علي الصلابي --- 16 \\
Akın Aydın --- 1 \\
Gerhard Wirth --- 2 \\
Markos Moulitsas --- 6 \\
Дмитрий Пучков --- 13 \\
Oral Çalışlar --- 8 \\
H16 --- 3 \\
Старославянский --- 15 \\
Markus Beckedahl --- 7 \\
محمود عبد الهادي --- 17

In [None]:
SVC_clf = SVC(random_state=42, C = 10, kernel = 'poly')
train(SVC_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 91}, '1': {'precision': 1.0, 'recall': 0.989010989010989, 'f1-score': 0.994475138121547, 'support': 91}, '2': {'precision': 1.0, 'recall': 0.978021978021978, 'f1-score': 0.9888888888888888, 'support': 91}, '3': {'precision': 0.9888888888888889, 'recall': 1.0, 'f1-score': 0.9944134078212291, 'support': 89}, '4': {'precision': 1.0, 'recall': 0.9866666666666667, 'f1-score': 0.9932885906040269, 'support': 75}, '5': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86}, '6': {'precision': 0.987012987012987, 'recall': 0.9743589743589743, 'f1-score': 0.9806451612903225, 'support': 78}, '7': {'precision': 0.9411764705882353, 'recall': 1.0, 'f1-score': 0.9696969696969697, 'support': 32}, '8': {'precision': 0.9891304347826086, 'recall': 1.0, 'f1-score': 0.994535519125683, 'support': 91}, '9': {'precision': 0.9782608695652174, 'recall': 0.989010989010989, 'f1-score': 0.9836065573770493, 'support': 91}, '10': {'precisi

In [None]:
LR_clf = LogisticRegression(random_state=42, C = 100, penalty = 'l2', max_iter = 5000)
train(LR_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 91}, '1': {'precision': 1.0, 'recall': 0.989010989010989, 'f1-score': 0.994475138121547, 'support': 91}, '2': {'precision': 0.9891304347826086, 'recall': 1.0, 'f1-score': 0.994535519125683, 'support': 91}, '3': {'precision': 0.9888888888888889, 'recall': 1.0, 'f1-score': 0.9944134078212291, 'support': 89}, '4': {'precision': 1.0, 'recall': 0.9866666666666667, 'f1-score': 0.9932885906040269, 'support': 75}, '5': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86}, '6': {'precision': 0.987012987012987, 'recall': 0.9743589743589743, 'f1-score': 0.9806451612903225, 'support': 78}, '7': {'precision': 1.0, 'recall': 0.96875, 'f1-score': 0.9841269841269841, 'support': 32}, '8': {'precision': 0.9891304347826086, 'recall': 1.0, 'f1-score': 0.994535519125683, 'support': 91}, '9': {'precision': 0.9782608695652174, 'recall': 0.989010989010989, 'f1-score': 0.9836065573770493, 'support': 91}, '10': {'precision': 1.0, '

In [None]:
RF_clf = RandomForestClassifier(random_state=42, n_estimators = 500, max_depth = None)
train(RF_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 1.0, 'recall': 0.978021978021978, 'f1-score': 0.9888888888888888, 'support': 91}, '1': {'precision': 0.9888888888888889, 'recall': 0.978021978021978, 'f1-score': 0.9834254143646408, 'support': 91}, '2': {'precision': 0.900990099009901, 'recall': 1.0, 'f1-score': 0.9479166666666667, 'support': 91}, '3': {'precision': 0.8969072164948454, 'recall': 0.9775280898876404, 'f1-score': 0.935483870967742, 'support': 89}, '4': {'precision': 0.9701492537313433, 'recall': 0.8666666666666667, 'f1-score': 0.915492957746479, 'support': 75}, '5': {'precision': 0.9772727272727273, 'recall': 1.0, 'f1-score': 0.9885057471264368, 'support': 86}, '6': {'precision': 0.9487179487179487, 'recall': 0.9487179487179487, 'f1-score': 0.9487179487179487, 'support': 78}, '7': {'precision': 1.0, 'recall': 0.6875, 'f1-score': 0.8148148148148148, 'support': 32}, '8': {'precision': 0.9782608695652174, 'recall': 0.989010989010989, 'f1-score': 0.9836065573770493, 'support': 91}, '9': {'precision': 0.956

In [None]:
KNN_clf = KNeighborsClassifier(n_neighbors=5)
train(KNN_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 0.9891304347826086, 'recall': 1.0, 'f1-score': 0.994535519125683, 'support': 91}, '1': {'precision': 0.967741935483871, 'recall': 0.989010989010989, 'f1-score': 0.9782608695652174, 'support': 91}, '2': {'precision': 0.989010989010989, 'recall': 0.989010989010989, 'f1-score': 0.989010989010989, 'support': 91}, '3': {'precision': 0.9560439560439561, 'recall': 0.9775280898876404, 'f1-score': 0.9666666666666667, 'support': 89}, '4': {'precision': 0.9726027397260274, 'recall': 0.9466666666666667, 'f1-score': 0.9594594594594594, 'support': 75}, '5': {'precision': 1.0, 'recall': 0.9883720930232558, 'f1-score': 0.9941520467836257, 'support': 86}, '6': {'precision': 0.9176470588235294, 'recall': 1.0, 'f1-score': 0.9570552147239264, 'support': 78}, '7': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32}, '8': {'precision': 0.9887640449438202, 'recall': 0.967032967032967, 'f1-score': 0.9777777777777779, 'support': 91}, '9': {'precision': 1.0, 'recall

### Сохранение модели

In [None]:
path_model = path + 'SVC_clf.pkl'
with open(path_model, 'wb') as f:
    cPickle.dump(SVC_clf, f)

In [None]:
path_model = path + 'LR_clf.pkl'
with open(path_model, 'wb') as f:
    cPickle.dump(LR_clf, f)

In [None]:
path_model = path + 'RF_clf.pkl'
with open(path_model, 'wb') as f:
    cPickle.dump(RF_clf, f)

Загрузка модели

In [None]:
path_model = path + 'LR_clf.pkl'

In [None]:
with open(path_model, 'rb') as f:
    rf = cPickle.load(f)

In [None]:
predictions_test =  rf.predict(x_test)
report = classification_report(y_test, predictions_test, output_dict=True)
print(report)
print(report['macro avg']['f1-score'])

ValueError: ignored

Оценка HistGradientBoostingClassifier: mean F1 = 0,7624

In [None]:
# очень долго
GB_clf = GradientBoostingClassifier(random_state=42)#, n_estimators= , max_depth= , learning_rate=
train(GB_clf, x_train, y_train, x_test, y_test)

KeyboardInterrupt: ignored

In [None]:
# This implementation is inspired by LightGBM [https://papers.nips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf]
HGB_clf = HistGradientBoostingClassifier()
train(HGB_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 0.8071065989847716, 'recall': 0.7718446601941747, 'f1-score': 0.7890818858560793, 'support': 206}, '1': {'precision': 0.8229665071770335, 'recall': 0.8349514563106796, 'f1-score': 0.8289156626506025, 'support': 206}, '2': {'precision': 0.61, 'recall': 0.5951219512195122, 'f1-score': 0.6024691358024692, 'support': 205}, '3': {'precision': 0.751219512195122, 'recall': 0.751219512195122, 'f1-score': 0.751219512195122, 'support': 205}, '4': {'precision': 0.54, 'recall': 0.526829268292683, 'f1-score': 0.5333333333333333, 'support': 205}, '5': {'precision': 0.8702702702702703, 'recall': 0.7815533980582524, 'f1-score': 0.823529411764706, 'support': 206}, '6': {'precision': 0.7129629629629629, 'recall': 0.7475728155339806, 'f1-score': 0.7298578199052131, 'support': 206}, '7': {'precision': 0.75, 'recall': 0.6844660194174758, 'f1-score': 0.7157360406091372, 'support': 206}, '8': {'precision': 0.9333333333333333, 'recall': 0.883495145631068, 'f1-score': 0.9077306733167083, 's

# Классификация по статистическим признакам

## Загрузка данных

In [None]:
path = '/content/drive/MyDrive/Colab Notebooks/author_texts/'
train_file = path + 'data_train.csv'
df_train = pd.read_csv(train_file, encoding="utf-8", sep=",")
test_file = path + 'data_test.csv'
df_test = pd.read_csv(test_file, encoding="utf-8", sep=",")

FileNotFoundError: ignored

In [None]:
df_train

Unnamed: 0,label,author,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,PUNCT,SCONJ,SYM,VERB,X,median_lw,mean_lw,h_min,h_median,h_mean
0,14,Юрий Подоляка,0.479371,-0.185367,-0.531557,-0.591598,0.012160,-0.293529,-0.154495,0.318198,...,0.521406,1.830605,-0.137011,-0.642084,-0.081813,-0.063381,-0.335791,-0.230487,-0.947507,-1.187037
1,13,Татьяна Андрющенко,0.486460,0.339884,0.938771,-0.591598,-0.236847,-0.426040,-0.154495,0.281090,...,-1.211950,-0.966596,-0.137011,-0.274048,-0.081813,0.933708,1.606128,-0.230487,-1.181633,-1.069132
2,6,Илья Варламов,0.322112,0.143492,1.883515,2.450471,-1.506780,0.771974,-0.154495,0.485395,...,-0.368617,-0.966596,-0.137011,-0.799078,-0.081813,-0.063381,-0.255747,0.187281,-0.463471,0.471269
3,7,Лимонов,-0.212568,0.383090,-0.677791,-0.591598,1.075418,3.020554,-0.154495,-0.389173,...,-1.175570,-0.966596,-0.137011,0.868986,-0.081813,-0.063381,0.016404,-0.230487,-0.568914,-0.257906
4,16,Яна Франк,-0.187646,-1.359068,1.864363,-0.591598,1.119184,-0.403131,-0.154495,-1.082156,...,1.118777,-0.160623,3.375209,-0.188397,-0.081813,-1.060471,-1.032483,0.086921,0.093118,0.286162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35344,6,Илья Варламов,0.322112,-0.455501,0.188534,-0.591598,-0.626486,0.771974,-0.154495,0.157432,...,-0.368617,1.194878,-0.137011,1.202599,-0.081813,0.435164,0.108741,-0.230487,-0.814474,0.073594
35345,11,Прилепин,-1.258793,0.788562,0.405839,-0.591598,-1.506780,-1.101842,-0.154495,0.258344,...,-0.194035,0.862343,-0.137011,1.856994,-0.081813,-0.063381,0.333041,-0.230487,-1.182993,-0.437117
35346,10,Пелевин,0.040152,0.630174,0.047285,-0.591598,0.914030,-0.457718,-0.154495,0.813358,...,-1.054212,-0.223589,-0.137011,0.983666,-0.081813,0.933708,0.764818,-0.230487,-1.103914,-0.402584
35347,16,Яна Франк,-1.682938,-1.679532,1.195697,-0.591598,0.177262,-1.101842,-0.154495,-2.166826,...,2.394323,2.134649,-0.137011,1.985864,-0.081813,-1.060471,-1.710528,-0.230487,0.319858,-0.013375


In [None]:
df_test

Unnamed: 0,label,author,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,PUNCT,SCONJ,SYM,VERB,X,median_lw,mean_lw,h_min,h_median,h_mean
0,8,Максим Кац,-0.152128,-0.756017,1.256378,0.354723,1.155345,0.629855,-0.147618,0.393432,...,-0.379191,-0.346628,-0.123701,-0.055468,-0.080712,-0.065412,-0.130561,0.106149,-0.163187,0.759608
1,9,Маринина,-1.661324,-1.115637,-1.496690,-0.587854,-1.502438,-1.110248,-0.147618,-0.399241,...,2.935716,-1.012122,-0.123701,0.840275,-0.080712,-2.045342,-1.898980,0.428161,2.022688,3.011519
2,11,Прилепин,-0.517512,1.893817,0.459437,-0.587854,-0.495278,-0.011235,-0.147618,0.226553,...,-1.214149,0.248813,-0.123701,0.651698,-0.080712,-0.065412,-0.377209,-0.127997,-0.523297,-0.573711
3,3,Верола,-0.601206,2.226200,-0.288026,-0.587854,-0.568973,-0.091651,-0.147618,2.442783,...,-0.546669,-1.012122,-0.123701,-1.169686,-0.080712,-0.065412,-0.216825,-0.127997,-1.009234,-2.056520
4,10,Пелевин,0.180407,-0.000205,-0.656771,-0.587854,1.092279,0.305429,-0.147618,0.366562,...,-0.856762,0.612133,-0.123701,0.901004,-0.080712,-0.065412,-0.217483,-0.127997,-0.637274,-0.373389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3491,9,Маринина,0.120023,-0.072149,0.534262,1.637246,-0.247616,-1.110248,-0.147618,-0.906033,...,0.175881,2.915380,-0.123701,-0.275733,-0.080712,-0.065412,-0.211734,-0.127997,0.416321,0.104397
3492,2,Борис Рожин,-0.492914,1.146490,-0.697412,-0.587854,0.143673,-0.661189,-0.147618,0.751414,...,-1.074252,1.048761,-0.123701,0.031217,-0.080712,0.924552,0.342919,-0.127997,-0.318160,-0.592956
3493,13,Татьяна Андрющенко,-0.039501,-0.262210,0.352386,0.425065,-0.359988,0.759714,-0.147618,-0.008820,...,-0.149480,1.133349,-0.123701,0.145071,-0.080712,-0.065412,-0.030263,-0.127997,-1.016918,0.410942
3494,8,Максим Кац,-0.592516,0.352321,0.940452,-0.587854,0.379796,-0.425617,-0.147618,-0.438225,...,-0.103596,-1.012122,-0.123701,-0.275733,-0.080712,0.924552,-0.107369,-0.127997,-0.580466,0.251345


In [None]:
x_train = df_train.drop(columns=['label', 'author'])
y_train = df_train['label']

In [None]:
x_test = df_test.drop(columns=['label', 'author'])
y_test = df_test['label']

## Подбор гиперараметров

In [None]:
n_estimators = [100, 500]
max_depth = [None, 5]
for param in  itertools.product(n_estimators, max_depth):
    print(f"n_estimators = {param[0]}, max_depth = {param[1]}")
    model = RandomForestClassifier(random_state=42, n_estimators = param[0], max_depth = param[1])
    train_cv(model, x_train, y_train)

n_estimators = 100, max_depth = None
{'0': {'precision': 0.27802690582959644, 'recall': 0.44711538461538464, 'f1-score': 0.34285714285714286, 'support': 416}, '1': {'precision': 0.639871382636656, 'recall': 0.47836538461538464, 'f1-score': 0.547455295735901, 'support': 416}, '2': {'precision': 0.283495145631068, 'recall': 0.35096153846153844, 'f1-score': 0.313641245972073, 'support': 416}, '3': {'precision': 0.3418803418803419, 'recall': 0.28846153846153844, 'f1-score': 0.3129074315514993, 'support': 416}, '4': {'precision': 0.20270270270270271, 'recall': 0.14423076923076922, 'f1-score': 0.16853932584269662, 'support': 416}, '5': {'precision': 0.259375, 'recall': 0.39903846153846156, 'f1-score': 0.3143939393939394, 'support': 416}, '6': {'precision': 0.2949438202247191, 'recall': 0.25240384615384615, 'f1-score': 0.27202072538860106, 'support': 416}, '7': {'precision': 0.17586206896551723, 'recall': 0.12259615384615384, 'f1-score': 0.14447592067988668, 'support': 416}, '8': {'precision'

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'0': {'precision': 0.1657010428736964, 'recall': 0.34375, 'f1-score': 0.22361219702892882, 'support': 416}, '1': {'precision': 0.29411764705882354, 'recall': 0.09615384615384616, 'f1-score': 0.14492753623188406, 'support': 416}, '2': {'precision': 0.18429003021148035, 'recall': 0.43990384615384615, 'f1-score': 0.25975869410929736, 'support': 416}, '3': {'precision': 0.20948616600790515, 'recall': 0.12740384615384615, 'f1-score': 0.1584454409566517, 'support': 416}, '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 416}, '5': {'precision': 0.25576923076923075, 'recall': 0.31971153846153844, 'f1-score': 0.2841880341880341, 'support': 416}, '6': {'precision': 0.18994413407821228, 'recall': 0.0819277108433735, 'f1-score': 0.11447811447811447, 'support': 415}, '7': {'precision': 0.14018691588785046, 'recall': 0.036057692307692304, 'f1-score': 0.05736137667304014, 'support': 416}, '8': {'precision': 0.23371647509578544, 'recall': 0.2932692307692308, 'f1-score': 0.2601279317

In [None]:
# очень долго
n_estimators = [100, 500]
max_depth = [None, 5]
learning_rate = [0.1, 0.5]
for param in  itertools.product(n_estimators, max_depth, learning_rate):
    print(f"n_estimators = {param[0]}, max_depth = {param[1]}, learning_rate = {param[2]}")
    model = GradientBoostingClassifier(random_state=42, n_estimators = param[0], max_depth= param[1], learning_rate = param[2])
    train_cv(model, x_train, y_train)

n_estimators = 100, max_depth = None, learning_rate = 0.1


KeyboardInterrupt: ignored

In [None]:
max_iter = [100, 500]
max_depth = [None, 5]
learning_rate = [0.1, 0.5]
for param in  itertools.product(max_iter, max_depth, learning_rate):
    print(f"max_iter = {param[0]}, max_depth = {param[1]}, learning_rate = {param[2]}")
    model = HistGradientBoostingClassifier(random_state=42, max_iter = param[0], max_depth= param[1], learning_rate = param[2])
    train_cv(model, x_train, y_train)

max_iter = 100, max_depth = None, learning_rate = 0.1
{'0': {'precision': 0.3016759776536313, 'recall': 0.3894230769230769, 'f1-score': 0.3399790136411333, 'support': 416}, '1': {'precision': 0.5384615384615384, 'recall': 0.4543269230769231, 'f1-score': 0.4928292046936115, 'support': 416}, '2': {'precision': 0.2951167728237792, 'recall': 0.33413461538461536, 'f1-score': 0.31341600901916566, 'support': 416}, '3': {'precision': 0.3418803418803419, 'recall': 0.28846153846153844, 'f1-score': 0.3129074315514993, 'support': 416}, '4': {'precision': 0.21033210332103322, 'recall': 0.13701923076923078, 'f1-score': 0.165938864628821, 'support': 416}, '5': {'precision': 0.2536764705882353, 'recall': 0.3317307692307692, 'f1-score': 0.28750000000000003, 'support': 416}, '6': {'precision': 0.2693333333333333, 'recall': 0.24278846153846154, 'f1-score': 0.2553729456384324, 'support': 416}, '7': {'precision': 0.19696969696969696, 'recall': 0.15625, 'f1-score': 0.1742627345844504, 'support': 416}, '8': 

## Обучение и оценка на тестовых данных

In [None]:
RF_clf = RandomForestClassifier(random_state=42, n_estimators = 500, max_depth = None)
train(RF_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 0.32081911262798635, 'recall': 0.4563106796116505, 'f1-score': 0.37675350701402804, 'support': 206}, '1': {'precision': 0.7985611510791367, 'recall': 0.5388349514563107, 'f1-score': 0.6434782608695653, 'support': 206}, '2': {'precision': 0.30859375, 'recall': 0.3853658536585366, 'f1-score': 0.34273318872017355, 'support': 205}, '3': {'precision': 0.42702702702702705, 'recall': 0.3853658536585366, 'f1-score': 0.4051282051282052, 'support': 205}, '4': {'precision': 0.24770642201834864, 'recall': 0.13170731707317074, 'f1-score': 0.17197452229299362, 'support': 205}, '5': {'precision': 0.2971014492753623, 'recall': 0.39805825242718446, 'f1-score': 0.3402489626556016, 'support': 206}, '6': {'precision': 0.33031674208144796, 'recall': 0.35436893203883496, 'f1-score': 0.34192037470725994, 'support': 206}, '7': {'precision': 0.23809523809523808, 'recall': 0.14563106796116504, 'f1-score': 0.18072289156626506, 'support': 206}, '8': {'precision': 0.5765765765765766, 'recall': 

In [None]:
# очень долго
GB_clf = GradientBoostingClassifier(random_state=42)#, n_estimators= , max_depth= , learning_rate=
train(GB_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 0.3007518796992481, 'recall': 0.3883495145631068, 'f1-score': 0.3389830508474576, 'support': 206}, '1': {'precision': 0.3225806451612903, 'recall': 0.1941747572815534, 'f1-score': 0.2424242424242424, 'support': 206}, '2': {'precision': 0.2631578947368421, 'recall': 0.2926829268292683, 'f1-score': 0.27713625866050806, 'support': 205}, '3': {'precision': 0.2994011976047904, 'recall': 0.24390243902439024, 'f1-score': 0.2688172043010753, 'support': 205}, '4': {'precision': 0.1941747572815534, 'recall': 0.0975609756097561, 'f1-score': 0.12987012987012989, 'support': 205}, '5': {'precision': 0.3299492385786802, 'recall': 0.3155339805825243, 'f1-score': 0.3225806451612903, 'support': 206}, '6': {'precision': 0.21548821548821548, 'recall': 0.3106796116504854, 'f1-score': 0.2544731610337972, 'support': 206}, '7': {'precision': 0.2413793103448276, 'recall': 0.10194174757281553, 'f1-score': 0.14334470989761094, 'support': 206}, '8': {'precision': 0.3263157894736842, 'recall': 

In [None]:
# This implementation is inspired by LightGBM [https://papers.nips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf]
HGB_clf = HistGradientBoostingClassifier(max_iter = 500, max_depth = None, learning_rate = 0.1)
train(HGB_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 0.35377358490566035, 'recall': 0.3640776699029126, 'f1-score': 0.3588516746411483, 'support': 206}, '1': {'precision': 0.7592592592592593, 'recall': 0.39805825242718446, 'f1-score': 0.5222929936305732, 'support': 206}, '2': {'precision': 0.21122994652406418, 'recall': 0.3853658536585366, 'f1-score': 0.27288428324697755, 'support': 205}, '3': {'precision': 0.3825136612021858, 'recall': 0.34146341463414637, 'f1-score': 0.36082474226804123, 'support': 205}, '4': {'precision': 0.14705882352941177, 'recall': 0.0975609756097561, 'f1-score': 0.11730205278592376, 'support': 205}, '5': {'precision': 0.26865671641791045, 'recall': 0.17475728155339806, 'f1-score': 0.21176470588235294, 'support': 206}, '6': {'precision': 0.25892857142857145, 'recall': 0.2815533980582524, 'f1-score': 0.26976744186046514, 'support': 206}, '7': {'precision': 0.20512820512820512, 'recall': 0.1553398058252427, 'f1-score': 0.1767955801104972, 'support': 206}, '8': {'precision': 0.43884892086330934, '

# Конкатенация

In [None]:
path = '/content/drive/MyDrive/Data/authors_texts/'
df_train_bert = pd.read_csv(path + "train_vectors.csv", encoding="utf-8", sep="\t")
df_test_bert = pd.read_csv(path + "test_vectors.csv", encoding="utf-8", sep="\t")
df_train_stat = pd.read_csv(path + 'data_train.csv', encoding="utf-8", sep=",")
df_test_stat = pd.read_csv(path + 'data_test.csv', encoding="utf-8", sep=",")

In [None]:
df_train = pd.concat([df_train_bert, df_train_stat], axis=1)
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,PUNCT,SCONJ,SYM,VERB,X,median_lw,mean_lw,h_min,h_median,h_mean
0,-0.015962,0.043497,2.045817,0.804832,-1.215801,0.613192,-1.435043,-0.265680,-1.558212,0.548303,...,0.521406,1.830605,-0.137011,-0.642084,-0.081813,-0.063381,-0.335791,-0.230487,-0.947507,-1.187037
1,-0.458946,-0.497003,-0.573734,0.690307,-1.117674,-0.259352,0.403433,-0.387607,-0.532740,-0.930225,...,-1.211950,-0.966596,-0.137011,-0.274048,-0.081813,0.933708,1.606128,-0.230487,-1.181633,-1.069132
2,-0.785291,-0.303447,-0.572255,-0.671411,0.459309,-0.442056,-0.545206,-0.182470,-0.967250,-0.318214,...,-0.368617,-0.966596,-0.137011,-0.799078,-0.081813,-0.063381,-0.255747,0.187281,-0.463471,0.471269
3,0.399307,0.650799,-0.838956,0.786172,1.228968,-0.373421,-0.271988,-0.819036,-0.229187,0.237831,...,-1.175570,-0.966596,-0.137011,0.868986,-0.081813,-0.063381,0.016404,-0.230487,-0.568914,-0.257906
4,0.489961,0.329807,0.509300,-1.310830,0.280187,-0.177067,-0.105956,0.513607,0.172384,-0.195174,...,1.118777,-0.160623,3.375209,-0.188397,-0.081813,-1.060471,-1.032483,0.086921,0.093118,0.286162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35344,-1.162410,-0.326580,-1.035991,-0.063080,0.456615,-0.530780,-0.451795,-0.028448,-1.359183,0.136476,...,-0.368617,1.194878,-0.137011,1.202599,-0.081813,0.435164,0.108741,-0.230487,-0.814474,0.073594
35345,-0.072586,0.102953,-0.280855,0.772763,1.166310,0.009589,0.748822,0.842557,-0.352985,-1.124103,...,-0.194035,0.862343,-0.137011,1.856994,-0.081813,-0.063381,0.333041,-0.230487,-1.182993,-0.437117
35346,0.486672,-0.670375,-0.535247,0.055219,-0.536710,-0.090790,0.533421,0.181335,0.004322,-0.676603,...,-1.054212,-0.223589,-0.137011,0.983666,-0.081813,0.933708,0.764818,-0.230487,-1.103914,-0.402584
35347,0.608807,0.001699,0.677230,-1.397119,0.131737,0.354876,0.307180,0.750435,0.375677,-0.332166,...,2.394323,2.134649,-0.137011,1.985864,-0.081813,-1.060471,-1.710528,-0.230487,0.319858,-0.013375


In [None]:
df_test = pd.concat([df_test_bert, df_test_stat], axis=1)
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,PUNCT,SCONJ,SYM,VERB,X,median_lw,mean_lw,h_min,h_median,h_mean
0,-0.003447,-0.776354,0.065085,-0.535595,-0.445510,-0.004700,-0.138245,-1.148365,-0.066790,-0.897788,...,-0.379191,-0.346628,-0.123701,-0.055468,-0.080712,-0.065412,-0.130561,0.106149,-0.163187,0.759608
1,-0.348356,-0.289477,-1.011756,-0.673179,-0.332883,-0.497255,-0.542841,0.508514,-0.114030,-0.330583,...,2.935716,-1.012122,-0.123701,0.840275,-0.080712,-2.045342,-1.898980,0.428161,2.022688,3.011519
2,0.409852,0.416799,-0.302230,1.354417,1.057421,0.162901,1.086588,-0.481688,-0.248841,0.376028,...,-1.214149,0.248813,-0.123701,0.651698,-0.080712,-0.065412,-0.377209,-0.127997,-0.523297,-0.573711
3,0.055159,-0.217921,-0.443245,-0.049989,0.169027,-0.057065,-0.051385,0.290923,1.266091,-0.457741,...,-0.546669,-1.012122,-0.123701,-1.169686,-0.080712,-0.065412,-0.216825,-0.127997,-1.009234,-2.056520
4,0.872228,0.125411,-0.266779,1.089157,0.351564,0.538745,0.872499,-0.297074,-0.312588,0.281536,...,-0.856762,0.612133,-0.123701,0.901004,-0.080712,-0.065412,-0.217483,-0.127997,-0.637274,-0.373389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3491,-0.191889,-0.486381,-1.604693,-0.406481,-0.732403,-0.081397,-0.103925,0.383288,-0.246501,-0.330857,...,0.175881,2.915380,-0.123701,-0.275733,-0.080712,-0.065412,-0.211734,-0.127997,0.416321,0.104397
3492,0.530255,0.879809,1.685876,0.626312,0.236107,0.354109,-0.799372,-0.278445,-0.744222,1.527706,...,-1.074252,1.048761,-0.123701,0.031217,-0.080712,0.924552,0.342919,-0.127997,-0.318160,-0.592956
3493,-0.041560,-0.453527,-0.466604,0.913720,-1.045459,-0.438795,0.132585,-1.116695,-0.278551,0.249478,...,-0.149480,1.133349,-0.123701,0.145071,-0.080712,-0.065412,-0.030263,-0.127997,-1.016918,0.410942
3494,0.591793,-0.300171,-0.574240,0.580180,-0.654962,-0.107691,-0.735208,-0.434162,0.715621,-0.922819,...,-0.103596,-1.012122,-0.123701,-0.275733,-0.080712,0.924552,-0.107369,-0.127997,-0.580466,0.251345


In [None]:
x_train = df_train.drop(columns=['label', 'author'])
y_train = df_train['label'].iloc[:, 0]

x_test = df_test.drop(columns=['label', 'author'])
y_test = df_test['label'].iloc[:, 0]

In [None]:
print(len(x_train))
print(len(x_train.iloc[0]))
print(len(y_train))

35349
790
35349


In [None]:
print(len(x_test))
print(len(x_test.iloc[0]))
print(len(y_test))

3496
790
3496


In [None]:
RF_clf = RandomForestClassifier(random_state=42, n_estimators = 500, max_depth = None)
train(RF_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 0.8153846153846154, 'recall': 0.7718446601941747, 'f1-score': 0.7930174563591021, 'support': 206}, '1': {'precision': 0.8101851851851852, 'recall': 0.8495145631067961, 'f1-score': 0.8293838862559242, 'support': 206}, '2': {'precision': 0.605, 'recall': 0.5902439024390244, 'f1-score': 0.5975308641975308, 'support': 205}, '3': {'precision': 0.7403846153846154, 'recall': 0.751219512195122, 'f1-score': 0.7457627118644068, 'support': 205}, '4': {'precision': 0.5373134328358209, 'recall': 0.526829268292683, 'f1-score': 0.5320197044334976, 'support': 205}, '5': {'precision': 0.8670212765957447, 'recall': 0.7912621359223301, 'f1-score': 0.8274111675126903, 'support': 206}, '6': {'precision': 0.7183098591549296, 'recall': 0.7427184466019418, 'f1-score': 0.730310262529833, 'support': 206}, '7': {'precision': 0.7272727272727273, 'recall': 0.6601941747572816, 'f1-score': 0.6921119592875319, 'support': 206}, '8': {'precision': 0.916256157635468, 'recall': 0.9029126213592233, 'f1

In [None]:
HGB_clf = HistGradientBoostingClassifier(max_iter = 500, max_depth = None, learning_rate = 0.1)
train(HGB_clf, x_train, y_train, x_test, y_test)

{'0': {'precision': 0.8181818181818182, 'recall': 0.7864077669902912, 'f1-score': 0.801980198019802, 'support': 206}, '1': {'precision': 0.8133971291866029, 'recall': 0.8252427184466019, 'f1-score': 0.8192771084337348, 'support': 206}, '2': {'precision': 0.5776699029126213, 'recall': 0.5804878048780487, 'f1-score': 0.5790754257907542, 'support': 205}, '3': {'precision': 0.7560975609756098, 'recall': 0.7560975609756098, 'f1-score': 0.7560975609756099, 'support': 205}, '4': {'precision': 0.51, 'recall': 0.4975609756097561, 'f1-score': 0.5037037037037037, 'support': 205}, '5': {'precision': 0.8586956521739131, 'recall': 0.7669902912621359, 'f1-score': 0.8102564102564102, 'support': 206}, '6': {'precision': 0.6926605504587156, 'recall': 0.7330097087378641, 'f1-score': 0.7122641509433962, 'support': 206}, '7': {'precision': 0.7157894736842105, 'recall': 0.6601941747572816, 'f1-score': 0.6868686868686869, 'support': 206}, '8': {'precision': 0.914572864321608, 'recall': 0.883495145631068, 'f1

# Итоговый классификатор

In [None]:
from typing import Tuple, Dict, List
from transformers import BertTokenizer, BertModel
import torch

max_len = 212

def author_classifier(text: str,
                      model: str,
                      authors: List[str] = None) -> Tuple[str, Dict[str, float]]:
    """ Функция осуществляет классификацию входного текста `text` по авторам при помощи модели `model`.

    :param text: входной текст для классификации.
    :param model: путь к модели классификации по авторам.
    :param authors: список имен авторов.

    :return:
        кортеж, в котором
        первый элемент author - строка: наиболее вероятный автор.
        второй элемент probabilities - словарь: ключ - имя автора, значение - вероятность классификации текста к данному автору.
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    authors = read_file(model + "/authors.txt")
    tokenizer = BertTokenizer.from_pretrained(model_name+'/tokenizer')
    model_bert = BertModel.from_pretrained(pretrained_model_name_or_path=model, num_labels=len(authors), return_dict = True, output_hidden_states = True)
    model_bert.to(device)
    with open(model + "/RF_clf.pkl", 'rb') as f:
        model_RF = cPickle.load(f)

    input_ids = tokenizer(text, max_length=max_len, add_special_tokens=True, padding="max_length", truncation=True, return_tensors="pt")
    outputs = model_bert(input_ids=input_ids['input_ids'].to(model_bert.device), attention_mask=input_ids['attention_mask'].to(model_bert.device), return_dict = True)
    bert_vector = outputs.hidden_states[12][0][0].cpu().detach().numpy().reshape(1, -1)
    predictions = model_RF.predict_proba(bert_vector)[0]
    author = authors[predictions.argmax()]
    probabilities = dict(zip(authors, predictions))

    return (author, probabilities)

def read_file(data_file):
    list_val = []
    with open(data_file, mode='r',encoding='utf-8') as reader:
        for s in reader:
            list_val.append(s.strip())
    print(f'Read from file {data_file} {len(list_val)} lines')
    return list_val


In [None]:
text = 'Мерещится мне, был в подвале мальчик, но еще очень маленький, лет шести или даже менее. Этот мальчик проснулся утром в сыром и холодном подвале. Одет он был в какой-то халатик и дрожал. Дыхание его вылетало белым паром, и он, сидя в углу на сундуке, от скуки нарочно пускал этот пар изо рта и забавлялся, смотря, как он вылетает. Но ему очень хотелось кушать. Он несколько раз с утра подходил к нарам, где на тонкой, как блин, подстилке и на каком-то узле под головой вместо подушки лежала больная мать его.'
author_classifier(text, '/content/drive/MyDrive/classifier_author_texts_18')

Read from file /content/drive/MyDrive/classifier_author_texts_18/authors.txt 18 lines


Some weights of the model checkpoint at /content/drive/MyDrive/classifier_author_texts_18 were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  "X does not have valid feature names, but"


('Достоевский',
 {'Александр Коваленко': 0.0,
  'Артемий Лебедев': 0.01,
  'Борис Рожин': 0.02,
  'Верола': 0.01,
  'Дмитрий Пучков': 0.01,
  'Донцова': 0.01,
  'Илья Варламов': 0.0,
  'Лимонов': 0.0,
  'Максим Кац': 0.0,
  'Маринина': 0.0,
  'Пелевин': 0.0,
  'Прилепин': 0.01,
  'Александр Проханов': 0.0,
  'Татьяна Андрющенко': 0.0,
  'Юрий Подоляка': 0.0,
  'Яна Кубаева': 0.02,
  'Яна Франк': 0.0,
  'Достоевский': 0.91})

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
authors = read_file( "/content/drive/MyDrive/classifier_author_texts_18/authors.txt")
tokenizer = BertTokenizer.from_pretrained(model_name+'/tokenizer')
model_bert = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='/content/drive/MyDrive/classifier_author_texts_18', num_labels=len(authors), return_dict = True, output_hidden_states = True)
model_bert.to(device)
input_ids = tokenizer(text, max_length=max_len, add_special_tokens=True, padding="max_length", truncation=True, return_tensors="pt")
outputs = model_bert(input_ids=input_ids['input_ids'].to(model_bert.device))
with torch.no_grad():
    logits = model_bert(**input_ids.to(device)).logits
predicted_class_id = logits.argmax().item()
model_bert.config.id2label[predicted_class_id]

Read from file /content/drive/MyDrive/classifier_author_texts_18/authors.txt 18 lines


'LABEL_11'