## Only lizon's 2 largest accounts are compared

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# data files stored on the Drive can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf

In [3]:
from psutil import virtual_memory

# Total memory reported for the runtime, expressed in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to tell a high-RAM runtime apart.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Dec  3 02:38:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import os
import pandas as pd
import numpy as np

In [6]:
########## Ensure reproducibility ##########
import random

# Single seed value shared by every pseudo-random generator seeded below.
SEED = 42

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): Python reads this variable at interpreter start-up, so setting
# it here should only affect processes launched from this kernel afterwards,
# not the hashing of the already-running kernel itself.
os.environ['PYTHONHASHSEED'] = str(SEED)

# TF_DETERMINISTIC_OPS is deliberately left unset: it does not work with ktrain.
#os.environ['TF_DETERMINISTIC_OPS'] = '1'

# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(SEED)

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(SEED)

# 4. Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(SEED)

In [None]:
!pip install ktrain

In [8]:
import ktrain
from ktrain import text

In [9]:
# Folder on the mounted Drive that holds all input CSV files.
Data = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Data/'

# Author ids of the two accounts that are compared in this notebook.
lizon_ids = [475085, 731728]

# Names of the input files inside `Data`.
clone_Messages_filename = 'lizon_clone_data_for_test.csv'
lizon_Messages_filename = 'lizon_data_for_finetuning.csv'
Authors_filename = 'Authors.csv'

# Full paths, built once here and used by the loading cells below.
clone_Messages_full_filename = os.path.join(Data, clone_Messages_filename)
lizon_Messages_full_filename = os.path.join(Data, lizon_Messages_filename)
Authors_full_filename = os.path.join(Data, Authors_filename)

## Data load and/or split

In [10]:
# Load the clone-account messages. Malformed CSV rows are skipped with a
# warning instead of aborting the load. `on_bad_lines` replaces the removed
# `error_bad_lines` argument and needs pandas >= 1.3.
clone_df = pd.read_csv(clone_Messages_full_filename, on_bad_lines='warn', index_col=False)

In [11]:
# Load the fine-tuning messages. Malformed CSV rows are skipped with a
# warning instead of aborting the load. `on_bad_lines` replaces the removed
# `error_bad_lines` argument and needs pandas >= 1.3.
lizon_df = pd.read_csv(lizon_Messages_full_filename, on_bad_lines='warn', index_col=False)

In [12]:
# Author lookup table, used to get one author name per Author_Id (the names
# can vary between messages). Malformed CSV rows are skipped with a warning.
# `on_bad_lines` replaces the removed `error_bad_lines` argument and needs
# pandas >= 1.3.
Authors = pd.read_csv(Authors_full_filename, on_bad_lines='warn', index_col=False)

In [13]:
# Keep only the rows written by the accounts in `lizon_ids`, and only the two
# columns used downstream; copy so later edits never touch the loaded frame.
clone_rows_selected = clone_df['Author_Id'].isin(lizon_ids)
clone_df = clone_df.loc[clone_rows_selected, ['Author_Id', 'original_message']].copy()

In [14]:
# Keep only the rows written by the accounts in `lizon_ids`, and only the two
# columns used downstream; copy so later edits never touch the loaded frame.
lizon_rows_selected = lizon_df['Author_Id'].isin(lizon_ids)
lizon_df = lizon_df.loc[lizon_rows_selected, ['Author_Id', 'original_message']].copy()

In [15]:
# Stack the fine-tuning messages and the clone messages into one frame.
# `pd.concat` is used because `DataFrame.append` was removed in pandas 2.0;
# as with `append`, the original row labels of both frames are kept.
df = pd.concat([lizon_df, clone_df])

In [16]:
# Attach the author name from the lookup table to every message.
# `df` is overwritten with a merge of itself, so any `author` column left by
# a previous run of this cell is dropped first; otherwise a re-run would
# produce `author_x` / `author_y` and break the later group-bys on `author`.
# NOTE(review): assumes Author_Id is unique in `Authors`; duplicates there
# would multiply message rows in the inner join. Confirm.
df = pd.merge(
    Authors[['Author_Id', 'author']],
    df.drop(columns=['author'], errors='ignore'),
    on='Author_Id',
    how='inner',
)

In [19]:
# Partition the messages into train / validation / test sets.
from sklearn import model_selection

# Both splits use the same settings: 30% held out, shuffled, fixed seed.
split_options = dict(test_size=0.3, random_state=42, shuffle=True)

# First hold out the test set, then take the validation set out of the remainder.
df_trainvalid, df_test = model_selection.train_test_split(df, **split_options)
df_train, df_valid = model_selection.train_test_split(df_trainvalid, **split_options)

In [20]:
# Messages per author in the full data and in each split, one column per
# partition, combined with successive inner joins on `author`.
frames_by_count_column = {
    'cnt_total': df,
    'cnt_train': df_train,
    'cnt_valid': df_valid,
    'cnt_test': df_test,
}

results = None
for count_column, frame in frames_by_count_column.items():
    author_counts = frame.groupby(['author']).size().reset_index(name=count_column)
    if results is None:
        results = author_counts
    else:
        results = pd.merge(results, author_counts, on='author', how='inner')
results

Unnamed: 0,author,cnt_total,cnt_train,cnt_valid,cnt_test
0,newyorck D,1045,511,210,324
1,Элиззи H*,2054,1007,441,606


In [21]:
# Model inputs: the message text of every split as an array of strings.
x_train, x_valid, x_test = (
    split['original_message'].values.astype(str)
    for split in (df_train, df_valid, df_test)
)

# Targets: the author name of every message, in the same split order.
y_train, y_valid, y_test = (
    split['author'].values.astype(str)
    for split in (df_train, df_valid, df_test)
)

## Model Training

In [22]:
# Name of the pretrained Russian RoBERTa checkpoint passed to ktrain.
MODEL_NAME = 'blinoff/roberta-base-russian-v0'
# ktrain Transformer wrapper used below for preprocessing and model creation.
# NOTE(review): maxlen=256 presumably caps the sequence length; the
# preprocessing output below reports a 95th percentile of 230 and a 99th
# percentile of 408 for the training set, so the longest messages would be cut.
t = text.Transformer(MODEL_NAME, maxlen=256)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [23]:
# Convert the raw text/label arrays into the datasets consumed by the learner.
# Only the training split goes through `preprocess_train`; the validation and
# test splits are both passed through `preprocess_test`.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_valid, y_valid)
test = t.preprocess_test(x_test, y_test)

preprocessing train...
language: ru
train sequence lengths:
	mean : 88
	95percentile : 230
	99percentile : 408


Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 86
	95percentile : 221
	99percentile : 398


preprocessing test...
language: ru
test sequence lengths:
	mean : 93
	95percentile : 226
	99percentile : 415


In [24]:
# Build the classifier from the Transformer wrapper and wrap it in a ktrain
# learner with the training and validation sets (batch size 32).
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
# Train for 5 epochs with the one-cycle policy and a maximum learning rate of 1e-5.
# The returned Keras History object is the cell output.
learner.fit_onecycle(lr=1e-5, epochs=5)

404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5


Downloading:   0%|          | 0.00/500M [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5bdf3c4d10>

## Results

In [25]:
# Evaluate on the held-out test split: prints the per-class classification
# report shown below, and the returned matrix is used as the confusion matrix
# in the following cells.
test_confusion_matrix=learner.validate(val_data=test, class_names=t.get_classes())

              precision    recall  f1-score   support

  newyorck D       0.72      0.58      0.64       324
   Элиззи H*       0.80      0.88      0.83       606

    accuracy                           0.77       930
   macro avg       0.76      0.73      0.74       930
weighted avg       0.77      0.77      0.77       930



### Confusion Matrix

In [26]:
# Label both axes of the confusion matrix with the class names. Going by the
# support counts in the report above, rows are the actual authors and columns
# the predicted ones.
class_labels = t.get_classes()
test_confusion_matrix_df = pd.DataFrame(
    test_confusion_matrix,
    index=class_labels,
    columns=class_labels,
)
test_confusion_matrix_df

Unnamed: 0,newyorck D,Элиззи H*
newyorck D,187,137
Элиззи H*,74,532


### Precision, Recall and F1

In [27]:
!pip install pycm

Collecting pycm
  Downloading pycm-3.3-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.5 MB/s eta 0:00:011
Collecting art>=1.8
  Downloading art-5.3-py2.py3-none-any.whl (574 kB)
[K     |████████████████████████████████| 574 kB 8.7 MB/s 
[?25hInstalling collected packages: art, pycm
Successfully installed art-5.3 pycm-3.3


In [28]:
import pycm as cm

# Build the nested mapping {actual class: {predicted class: count}} that is
# handed to pycm. Each count is converted with int() explicitly; the earlier
# version round-tripped the whole dict through eval(str(...)), apparently to
# turn numpy integers into plain Python ints, which is fragile because it
# depends on how the values happen to be printed.
class_names = t.get_classes()
d_confusion_matrix = {
    actual: {predicted: int(count) for predicted, count in zip(class_names, row)}
    for actual, row in zip(class_names, test_confusion_matrix)
}
model_cm = cm.ConfusionMatrix(matrix=d_confusion_matrix)

In [29]:
# Add one column per per-class metric by looking each author up in the
# corresponding pycm per-class mapping (PPV = precision, TPR = recall).
for metric_column, per_class_values in (
    ('Precision', model_cm.PPV),
    ('Recall', model_cm.TPR),
    ('F1', model_cm.F1),
):
    results[metric_column] = results['author'].map(per_class_values)

# Show the authors with the most messages first.
results.sort_values('cnt_total', ascending=False)

Unnamed: 0,author,cnt_total,cnt_train,cnt_valid,cnt_test,Precision,Recall,F1
1,Элиззи H*,2054,1007,441,606,0.795217,0.877888,0.83451
0,newyorck D,1045,511,210,324,0.716475,0.57716,0.639316


### Weighted F1

In [30]:
# Per-class F1 combined into one score; the value below matches an average
# weighted by each class's number of test messages.
model_cm.weighted_average('F1')

0.766506884639712

### Kappa

In [31]:
# Kappa agreement statistic computed by pycm from the test confusion matrix
# (agreement between predicted and actual authors, corrected for chance).
model_cm.Kappa

0.4766111351161041