Can a transformer model distinguish posts published by clone accounts of the same eva.ru forum user? Yes, it can, and with high quality, but, of course, the more messages, the better. At least 1000 messages are required to get a reasonable F1 score.

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf

In [3]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Anything below 20 GB is reported as a non-high-RAM runtime.
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Dec  2 17:52:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    22W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import os
import pandas as pd
import numpy as np

In [6]:
########## Ensure reproducibility ##########
import random

# Single seed shared by every pseudo-random generator below.
SEED = 42

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value.
#    NOTE: the running interpreter reads this variable at start-up, so setting it
#    here only affects processes launched afterwards, not the current kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Does not work with ktrain
#os.environ['TF_DETERMINISTIC_OPS'] = '1'

# 2. Set `python` built-in pseudo-random generator at a fixed value
#    (previously commented out, which left Python's own RNG unseeded).
random.seed(SEED)

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(SEED)

# 4. Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(SEED)

In [None]:
!pip install ktrain

In [8]:
import ktrain
from ktrain import text

In [9]:
# Folder on the mounted Drive that holds every CSV input used by this notebook.
Data = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Data/'

# Bare file names of the three inputs.
clone_Messages_filename = 'lizon_clone_data_for_test.csv'
lizon_Messages_filename = 'lizon_data_for_finetuning.csv'
Authors_filename = 'Authors.csv'

# Full paths, built from the data folder and the file names above.
clone_Messages_full_filename = os.path.join(Data, clone_Messages_filename)
lizon_Messages_full_filename = os.path.join(Data, lizon_Messages_filename)
Authors_full_filename = os.path.join(Data, Authors_filename)

# Author_Id values used to filter both message tables.
lizon_ids = [
    69715,
    300514,
    100899,
    424711,
    472696,
    475085,
    711697,
    731728,
]

## Data load and/or split

In [10]:
# Read the clone-message CSV; malformed rows are dropped with a warning.
# `on_bad_lines='warn'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
clone_df = pd.read_csv(clone_Messages_full_filename, on_bad_lines='warn', index_col=False)

In [11]:
# Read the fine-tuning message CSV; malformed rows are dropped with a warning.
# `on_bad_lines='warn'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
lizon_df = pd.read_csv(lizon_Messages_full_filename, on_bad_lines='warn', index_col=False)

In [12]:
# Author lookup table: used to get one author name per Author_Id
# (the names can vary between messages).
# `on_bad_lines='warn'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
Authors = pd.read_csv(Authors_full_filename, on_bad_lines='warn', index_col=False)

In [13]:
# Keep only rows written by the listed author ids, and only the id and message columns.
clone_df = clone_df.loc[clone_df['Author_Id'].isin(lizon_ids), ['Author_Id', 'original_message']].copy()

In [14]:
# Keep only rows written by the listed author ids, and only the id and message columns.
lizon_df = lizon_df.loc[lizon_df['Author_Id'].isin(lizon_ids), ['Author_Id', 'original_message']].copy()

In [15]:
# Stack both message tables (original row labels are kept).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent call.
df = pd.concat([lizon_df, clone_df])

In [16]:
# Attach the author name from the lookup table to every message via Author_Id.
# NOTE(review): assumes Authors has one row per Author_Id; duplicates would multiply messages - confirm.
df = Authors[['Author_Id', 'author']].merge(df, on='Author_Id', how='inner')

In [17]:
# Hold out 30% of all messages for testing, then 30% of the remainder for validation.
# NOTE(review): neither split passes `stratify`, so author proportions can differ between splits.
from sklearn import model_selection

df_trainvalid, df_test = model_selection.train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)
df_train, df_valid = model_selection.train_test_split(
    df_trainvalid,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [18]:
# Per-author message counts for the full data set and for each split,
# joined on the author name into one summary table.
count_sources = {
    'cnt_total': df,
    'cnt_train': df_train,
    'cnt_valid': df_valid,
    'cnt_test': df_test,
}
count_frames = [
    frame.groupby(['author']).size().reset_index(name=column)
    for column, frame in count_sources.items()
]

# The inner join keeps only authors that appear in every split.
results = count_frames[0]
for count_frame in count_frames[1:]:
    results = pd.merge(results, count_frame, on='author', how='inner')
results

Unnamed: 0,author,cnt_total,cnt_train,cnt_valid,cnt_test
0,Joconda Mona Liza _,227,113,43,71
1,"Lizon "" **K**",555,263,122,170
2,Lizonn +,79,39,13,27
3,lizlizon *,322,152,70,100
4,lizon **,613,277,119,217
5,newyorck D,1045,514,223,308
6,Кассиапея D,635,318,136,181
7,Элиззи H*,2054,1033,436,585


In [19]:
# Message texts per split, as numpy string arrays.
x_train, x_valid, x_test = (
    split['original_message'].values.astype(str)
    for split in (df_train, df_valid, df_test)
)

# Target labels per split: the author name of each message.
y_train, y_valid, y_test = (
    split['author'].values.astype(str)
    for split in (df_train, df_valid, df_test)
)

## Model Training

In [20]:
# Pretrained checkpoint name and the maximum sequence length handed to the preprocessor.
MODEL_NAME = 'blinoff/roberta-base-russian-v0'
MAX_SEQUENCE_LENGTH = 256

t = text.Transformer(MODEL_NAME, maxlen=MAX_SEQUENCE_LENGTH)

In [21]:
# Run each split through the Transformer preprocessor: the training split goes through
# `preprocess_train`, while the validation and test splits go through `preprocess_test`.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_valid, y_valid)
test = t.preprocess_test(x_test, y_test)

preprocessing train...
language: ru
train sequence lengths:
	mean : 80
	95percentile : 201
	99percentile : 360


Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 88
	95percentile : 229
	99percentile : 411


preprocessing test...
language: ru
test sequence lengths:
	mean : 79
	95percentile : 198
	99percentile : 355


In [22]:
# Get the classifier from the Transformer preprocessor and wrap it in a ktrain learner
# that trains on `trn` and validates on `val` with a batch size of 32.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
# One-cycle policy with a maximum learning rate of 1e-5, for 10 epochs.
learner.fit_onecycle(lr=1e-5, epochs=10)

404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe18d574850>

## Results

In [23]:
# Evaluate on the held-out test split, labelling classes with the preprocessor's class names.
# The returned value is kept and later used as the confusion matrix (rows/columns in class order).
test_confusion_matrix=learner.validate(val_data=test, class_names=t.get_classes())

                     precision    recall  f1-score   support

Joconda Mona Liza _       0.00      0.00      0.00        71
      Lizon " **K**       0.30      0.39      0.34       170
           Lizonn +       0.00      0.00      0.00        27
         lizlizon *       0.16      0.03      0.05       100
           lizon **       0.44      0.38      0.41       217
         newyorck D       0.55      0.59      0.57       308
        Кассиапея D       0.22      0.01      0.02       181
          Элиззи H*       0.50      0.77      0.61       585

           accuracy                           0.47      1659
          macro avg       0.27      0.27      0.25      1659
       weighted avg       0.40      0.47      0.41      1659



  _warn_prf(average, modifier, msg_start, len(result))


### Confusion Matrix

In [24]:
# Label both axes of the confusion matrix with the class names for display.
class_labels = t.get_classes()
test_confusion_matrix_df = pd.DataFrame(
    data=test_confusion_matrix,
    index=class_labels,
    columns=class_labels,
)
test_confusion_matrix_df

Unnamed: 0,Joconda Mona Liza _,"Lizon "" **K**",Lizonn +,lizlizon *,lizon **,newyorck D,Кассиапея D,Элиззи H*
Joconda Mona Liza _,0,16,0,2,14,4,2,33
"Lizon "" **K**",0,66,0,3,38,10,0,53
Lizonn +,0,1,0,0,2,9,1,14
lizlizon *,0,21,0,3,17,4,0,55
lizon **,0,63,0,2,83,20,0,49
newyorck D,0,12,0,1,8,181,2,104
Кассиапея D,0,11,0,1,10,19,2,138
Элиззи H*,0,27,0,7,16,85,2,448


### Precision, Recall and F1

In [25]:
!pip install pycm



In [26]:
import pycm as cm

# Convert the confusion matrix into the nested-dict form passed to pycm:
# {row class: {column class: count}}, following the row/column order of the class names.
# Counts and labels are converted explicitly to built-in `int`/`str`; the previous
# `eval(str(...))` round-trip depended on how numpy scalars happen to be printed.
class_names = [str(name) for name in t.get_classes()]
d_confusion_matrix = {
    row_class: {
        col_class: int(count)
        for col_class, count in zip(class_names, matrix_row)
    }
    for row_class, matrix_row in zip(class_names, test_confusion_matrix)
}
model_cm = cm.ConfusionMatrix(matrix=d_confusion_matrix)

In [30]:
# Add the per-class metrics from pycm to the per-author count table,
# looking each value up by author name.
metric_sources = {
    'Precision': model_cm.PPV,
    'Recall': model_cm.TPR,
    'F1': model_cm.F1,
}
for metric_column, per_class_values in metric_sources.items():
    results[metric_column] = results['author'].map(per_class_values)

# Show authors with the most messages first.
results.sort_values('cnt_total', ascending=False)

Unnamed: 0,author,cnt_total,cnt_train,cnt_valid,cnt_test,Precision,Recall,F1
7,Элиззи H*,2054,1033,436,585,0.501119,0.765812,0.605815
5,newyorck D,1045,514,223,308,0.545181,0.587662,0.565625
6,Кассиапея D,635,318,136,181,0.222222,0.01105,0.021053
4,lizon **,613,277,119,217,0.441489,0.382488,0.409877
1,"Lizon "" **K**",555,263,122,170,0.304147,0.388235,0.341085
3,lizlizon *,322,152,70,100,0.157895,0.03,0.05042
0,Joconda Mona Liza _,227,113,43,71,,0.0,0.0
2,Lizonn +,79,39,13,27,,0.0,0.0


### Weighted F1

In [28]:
# Weighted average of the per-class F1 scores, as computed by pycm.
model_cm.weighted_average('F1')

0.41253428079634663

### Kappa

In [29]:
# Kappa statistic reported by pycm for the test confusion matrix.
model_cm.Kappa

0.28963251898994047