Final attempt to build the best model (in practice, a set of models, one per fold of a 10-fold cross-validation) for finding clones among eva.ru forum users.

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the notebook can read and write files under that path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf

In [3]:
# ------------------------------------------------------------------
# GPU setup -- can be ignored when not training on a GPU.
# Noted by the author as important for running CuDNN on the GPU.
# ------------------------------------------------------------------

# Reset the Keras state so the notebook can be re-run from a clean slate.
tf.keras.backend.clear_session()

# Check whether TensorFlow can see a GPU.
tf.config.list_physical_devices('GPU')
#tf.debugging.set_log_device_placement(True)
experimental_config = tf.config.experimental
gpus = experimental_config.list_physical_devices('GPU')
if len(gpus) > 0:
  try:
    # Turn on memory growth for the first GPU, then make it the only visible one.
    experimental_config.set_memory_growth(gpus[0], True)
    experimental_config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = experimental_config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as gpu_error:
    # Raised when the devices are configured after the GPUs were initialized.
    print(gpu_error)
# ------------------------------------------------------------------

1 Physical GPUs, 1 Logical GPU


In [4]:
from psutil import virtual_memory
# Report the RAM of the runtime (read from psutil's `total` field, in units of 1e9 bytes)
# and tell whether it reaches the 20 GB threshold used here for "high-RAM".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [5]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Nov 11 17:02:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    40W / 300W |    471MiB / 16160MiB |      5%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
import os
import pandas as pd
import numpy as np

In [7]:
#experiment duration
import time

In [8]:
#Saving into log (Excel file)
import openpyxl 
def SaveToExperimentLog(Experiments_file, LogEntry, data):
    """Write ``data`` into one sheet of the experiment-log workbook.

    The workbook is loaded from disk first and attached to the pandas
    writer together with its worksheets, so the loaded workbook (with all
    of its sheets) is what gets saved back.

    Parameters
    ----------
    Experiments_file : path of the Excel workbook; it is loaded before
        writing, so it has to exist already.
    LogEntry : name of the target sheet; only its first 29 characters are used.
    data : DataFrame to write (its index is not written).
    """
    # NOTE(review): assigning `writer.book` and calling `writer.save()` are
    # deprecated in newer pandas releases -- confirm the pinned pandas version.
    workbook = openpyxl.load_workbook(Experiments_file)

    excel_writer = pd.ExcelWriter(Experiments_file, engine='openpyxl')
    excel_writer.book = workbook
    # Register every worksheet of the loaded workbook with the writer, keyed by title.
    excel_writer.sheets = {sheet.title: sheet for sheet in workbook.worksheets}

    target_sheet = LogEntry[0:29]
    data.to_excel(excel_writer, target_sheet, index=False)

    excel_writer.save()
    excel_writer.close()

In [9]:
#!pip install pycm

In [10]:
#to get score metrics from the model and save in the experiment log
import pycm as cm
def model_metrics(np_confusion_matrix,class_names):
  """Compute pycm scores from a confusion matrix.

  Parameters
  ----------
  np_confusion_matrix : square array-like of counts; row ``i`` belongs to
      ``class_names[i]`` and its entries are paired with ``class_names`` in order.
      (Presumably rows are the actual classes and columns the predicted ones,
      as pycm's ``matrix`` argument expects -- confirm against the caller.)
  class_names : sequence of class labels, in the same order as the matrix axes.

  Returns
  -------
  tuple
      ``(weighted average F1, Kappa, PPV, TPR, F1)`` as provided by
      ``pycm.ConfusionMatrix``; callers index the last three by class name.
  """
  # Convert the matrix to nested Python lists. `tolist()` turns numpy scalars
  # into native Python numbers, which is what the former `eval(str(...))`
  # round-trip was used for, without evaluating a string as code.
  rows = np.asarray(np_confusion_matrix).tolist()
  d_confusion_matrix = {
      class_name: dict(zip(class_names, rows[i]))
      for i, class_name in enumerate(class_names)
  }
  model_cm=cm.ConfusionMatrix(matrix=d_confusion_matrix)
  return model_cm.weighted_average('F1'), model_cm.Kappa, model_cm.PPV, model_cm.TPR, model_cm.F1

In [11]:
########## Ensure reproducibility ##########
import random

# Single seed shared by every random number generator below.
SEED = 42

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value.
# Python reads this variable when an interpreter starts, so setting it here
# affects processes launched afterwards, not the hash seed of the running kernel.
os.environ['PYTHONHASHSEED']=str(SEED)

#Does not work with ktrain
#os.environ['TF_DETERMINISTIC_OPS'] = '1'

# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(SEED)

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(SEED)

# 4. Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(SEED)

In [12]:
#!pip install ktrain

In [13]:
import ktrain
from ktrain import text

In [14]:
# ----------------------------- Data locations -----------------------------
Data = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Data/'

# Full message set (only referenced by the commented-out split cells).
Messages_filename = 'lizon_data_for_finetuning.csv'
Messages_full_filename = os.path.join(Data, Messages_filename)

# Columns holding the message text and the class label.
text_column = 'original_message'
target_column = 'target'

# Number of cross-validation folds.
kfold = 10

# Alternative input kept for reference:
#known_clon_Messages_filename='lizon_clon_data_for_testg.csv'
#known_clon_Messages_full_filename=os.path.join(Data, known_clon_Messages_filename)

# Folder (inside Data) holding the test file and the per-fold CSV files.
folds_folder = 'lizon_original_folds_testds'

# Test set; alternative: 'lizon_clon_data_for_testg.csv' read via os.path.join(Data, test_Messages_filename)
test_Messages_filename = 'lizon_test.csv'
test_Messages_full_filename = os.path.join(Data, folds_folder, test_Messages_filename)

# File-name templates of the per-fold files; %s is the fold number.
fold_X_train_filename = 'lizon_X_train_%s.csv'
fold_y_train_filename = 'lizon_y_train_%s.csv'
fold_X_valid_filename = 'lizon_X_valid_%s.csv'
fold_y_valid_filename = 'lizon_y_valid_%s.csv'

# Single train/validation files kept for reference:
#train_Messages_filename='lizon_data_for_finetuning_train.csv'
#train_Messages_full_filename=os.path.join(Data, train_Messages_filename)
#valid_Messages_filename='lizon_data_for_finetuning_valid.csv'
#valid_Messages_full_filename=os.path.join(Data, valid_Messages_filename)

# ----------------------------- Model output -------------------------------
Models = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/blinoff256/'

# ----------------------------- Experiment log -----------------------------
# Excel workbook with the experiment configuration and results.
Experiments_file = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/ExperimentLogs/lizon.xlsx'
# Name of the sheet describing this experiment. #Original test_final_folds512
Experiment_name = 'blinoff256'
# False: continue from the configuration rows that have no results yet.
# True: start from scratch, ignoring previous results.
NewExecution = False

## Experiment
The experiment is configured in an experiment log file (in my case, an Excel workbook with a separate tab per experiment).

In [15]:
# Load the configuration/result rows of this experiment from its sheet in the log workbook.
# Passing the path lets pandas open and close the file itself, so no file handle is left open.
Experiment = pd.read_excel(Experiments_file, sheet_name=Experiment_name)
# Keep 'comment' as a generic object column so text notes can be stored in it;
# empty cells stay NaN. (The previous `.apply(str)` result was discarded and had no effect.)
Experiment['comment'] = Experiment['comment'].astype(object)
Experiment.tail()

Unnamed: 0,Model,fold,maxlen,batch_size,epochs,lr,method,weighted_avg_F1,kappa,lizon-precision,lizon-recall,lizon-f1-score,duration,comment,test_weighted_avg_F1,test_kappa,test_lizon-precision,test_lizon-recall,test_lizon-f1-score
5,blinoff/roberta-base-russian-v0,5,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,
6,blinoff/roberta-base-russian-v0,6,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,
7,blinoff/roberta-base-russian-v0,7,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,
8,blinoff/roberta-base-russian-v0,8,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,
9,blinoff/roberta-base-russian-v0,9,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,


## Data load and/or split

### Train/validation folds and Test data

In [16]:
# Train/valid folds were saved as one file per fold, with the fold number at the end of
# the file name. Fold k is later read as element k of each list, so the files are ordered
# by that number; modification time is only a fallback for names without a number.
import glob
import re

folds_folder_full_path = os.path.join(Data, folds_folder)


def _fold_sort_key(path):
  """Sort key for a fold file: trailing fold number first, modification time otherwise."""
  match = re.search(r'_(\d+)\.[^.]+$', os.path.basename(path))
  if match:
    return (0, int(match.group(1)), 0.0)
  # No fold number in the name: place after the numbered files, ordered by mtime.
  return (1, 0, os.path.getmtime(path))


def _list_fold_files(name_part):
  """Return the regular files in the folds folder whose name contains ``name_part``, in fold order."""
  candidates = glob.glob(folds_folder_full_path + '/*%s*' % name_part)
  return sorted(filter(os.path.isfile, candidates), key=_fold_sort_key)


list_of_X_train_files = _list_fold_files('X_train')
list_of_y_train_files = _list_fold_files('y_train')
list_of_X_valid_files = _list_fold_files('X_valid')
list_of_y_valid_files = _list_fold_files('y_valid')

In [17]:
def _read_fold_column(csv_path, column):
  """Read one fold CSV file and return the given column as a plain Python list."""
  frame = pd.read_csv(csv_path, error_bad_lines=False, index_col=False)
  return frame[column].tolist()


# One list entry per fold file, in the order of the corresponding file lists.
X_train_data = [_read_fold_column(path, text_column) for path in list_of_X_train_files]
y_train_data = [_read_fold_column(path, target_column) for path in list_of_y_train_files]

X_valid_data = [_read_fold_column(path, text_column) for path in list_of_X_valid_files]
y_valid_data = [_read_fold_column(path, target_column) for path in list_of_y_valid_files]

In [18]:
# Test set: only the label and text columns are read.
df_test = pd.read_csv(
    test_Messages_full_filename,
    usecols=[target_column, text_column],
    index_col=False,
    error_bad_lines=False,
)

In [19]:
#split data for train/test
#from sklearn import  model_selection
#df=pd.read_csv(Messages_full_filename, error_bad_lines=False, index_col=False, usecols=[target_column, text_column])

#df_trainvalid, df_test = model_selection.train_test_split(df, test_size=0.3, random_state=42,shuffle=True)
#df_test.to_csv(test_Messages_full_filename, header=True, index=False)

In [20]:
#split data for cross validation
#from sklearn.model_selection import StratifiedKFold
#skf = StratifiedKFold(n_splits=kfold, random_state=42, shuffle=True)
#X_train_data=list()
#y_train_data=list()
#X_valid_data=list()
#y_valid_data=list()   
#for i, (train_index, test_index) in enumerate(skf.split(df_trainvalid[text_column], df_trainvalid[target_column])):
#  print(' fold: {}  of  {} : '.format(i+1, kfold))
#  X_train, X_valid = df_trainvalid[text_column].iloc[train_index].values.astype(str), df_trainvalid[text_column].iloc[test_index].values.astype(str)
# y_train, y_valid = df_trainvalid[target_column].iloc[train_index].values.astype(str), df_trainvalid[target_column].iloc[test_index].values.astype(str)

#  X_train_data.append(X_train)  
#  fold_X_train_full_filename=os.path.join(Data, folds_folder, fold_X_train_filename%i)
#  pd.DataFrame(X_train, columns = [text_column]).to_csv(fold_X_train_full_filename, header=True, index=False)

#  X_valid_data.append(X_valid)
#  fold_X_valid_full_filename=os.path.join(Data, folds_folder, fold_X_valid_filename%i)
#  pd.DataFrame(X_valid, columns = [text_column]).to_csv(fold_X_valid_full_filename, header=True, index=False)


#  y_train_data.append(y_train)
#  fold_y_train_full_filename=os.path.join(Data, folds_folder, fold_y_train_filename%i)
#  pd.DataFrame(y_train, columns = [target_column]).to_csv(fold_y_train_full_filename, header=True, index=False)

#  y_valid_data.append(y_valid)
#  fold_y_valid_full_filename=os.path.join(Data, folds_folder, fold_y_valid_filename%i)
#  pd.DataFrame(y_valid, columns = [target_column]).to_csv(fold_y_valid_full_filename, header=True, index=False)


#  print('size of training fold: %s' % (len( X_train)))
#  unique, counts = np.unique(y_train, return_counts=True)
#  print('classes size in training fold:')
#  print(dict(zip(unique, counts)))
#  print('size of validation fold: %s' % (len(X_valid)))
#  unique, counts = np.unique(y_valid, return_counts=True)
#  print('classes size in validation fold:')
#  print(dict(zip(unique, counts)))

## Model training and evaluation

In [21]:
def fit_onecycle(MODEL_NAME, maxlen,batch_size,lr,epochs,x_train, y_train, x_valid, y_valid,ind):
  """Fine-tune one transformer classifier with the one-cycle policy and score it.

  The model is trained on (x_train, y_train) with (x_valid, y_valid) as
  validation data, saved as a ktrain predictor under ``Models`` in
  ``fit_onecycle_<ind>``, and then evaluated on the validation data and on
  the notebook-level test frame ``df_test``.

  Parameters
  ----------
  MODEL_NAME : model name passed to ``text.Transformer``.
  maxlen : ``maxlen`` passed to ``text.Transformer``.
  batch_size : batch size passed to ``ktrain.get_learner``.
  lr, epochs : learning rate and epoch count passed to ``learner.fit_onecycle``.
  x_train, y_train : training texts and labels.
  x_valid, y_valid : validation texts and labels.
  ind : identifier appended to the saved predictor's folder name.

  Returns
  -------
  tuple of 10 values
      (weighted average F1, kappa, 'lizon' PPV, 'lizon' TPR, 'lizon' F1)
      for the validation data, followed by the same five for the test data.
  """
  transformer = text.Transformer(MODEL_NAME, maxlen=maxlen)
  train_set = transformer.preprocess_train(x_train, y_train)
  valid_set = transformer.preprocess_test(x_valid, y_valid)
  test_set = transformer.preprocess_test(df_test[text_column].tolist(), df_test[target_column].tolist())

  classifier = transformer.get_classifier()
  learner = ktrain.get_learner(classifier, train_data=train_set, val_data=valid_set, batch_size=batch_size)
  learner.fit_onecycle(lr=lr, epochs=epochs)

  # Persist the trained model together with its preprocessor.
  predictor = ktrain.get_predictor(learner.model, preproc=transformer)
  predictor.save(os.path.join(Models, 'fit_onecycle_'+str(ind)))

  def evaluate(dataset):
    """Validate the learner on ``dataset`` and turn the confusion matrix into scores."""
    matrix = learner.validate(val_data=dataset, class_names=transformer.get_classes())
    return model_metrics(np_confusion_matrix=matrix, class_names=transformer.get_classes())

  def lizon_only(scores):
    """Keep the overall scores and pick the 'lizon' entry of the per-class ones."""
    weighted_f1, kappa, ppv, tpr, f1 = scores
    return weighted_f1, kappa, ppv['lizon'], tpr['lizon'], f1['lizon']

  # Validation first, then test.
  val_scores = evaluate(valid_set)
  test_scores = evaluate(test_set)

  return lizon_only(val_scores) + lizon_only(test_scores)

### Experiment main loop

In [22]:
# Train and evaluate every configured row of the experiment sheet, writing the scores
# back into `Experiment` and saving the sheet to the log after each row.
for index, row in Experiment.iterrows():
  print('Processing fold %s, model %s started...'%(row['fold'], row['Model']))
  # When continuing a previous execution, a positive duration marks a row that already
  # has results. An empty (NaN) duration compares as False, so the row is processed.
  if not NewExecution and row['duration'] > 0:
    print('%s is already processed. Continue'%(row['Model']))
    continue

  print(row)
  print('---------------------------------------------')

  # The fold number selects the matching entries of the per-fold data lists;
  # cast it so a value read as a float (e.g. 3.0) still works as a list index.
  fold = int(row['fold'])

  # A training error is allowed to propagate and stop the run, so a failed row is
  # never recorded as processed.
  ts_start = time.time()
  (val_weighted_avg_F1, val_kappa, val_PPV, val_TPR, val_F1,
   test_weighted_avg_F1, test_kappa, test_PPV, test_TPR, test_F1) = fit_onecycle(
      row['Model'], row['maxlen'], row['batch_size'], row['lr'], row['epochs'],
      X_train_data[fold], y_train_data[fold],
      X_valid_data[fold], y_valid_data[fold],
      index)
  ts_end = time.time()

  # Duration in minutes, followed by the validation and test scores.
  Experiment.at[index,'duration']=(ts_end - ts_start)/60
  Experiment.at[index,'weighted_avg_F1']=val_weighted_avg_F1
  Experiment.at[index,'kappa']=val_kappa
  Experiment.at[index,'lizon-precision']=val_PPV
  Experiment.at[index,'lizon-recall']=val_TPR
  Experiment.at[index,'lizon-f1-score']=val_F1

  Experiment.at[index,'test_weighted_avg_F1']=test_weighted_avg_F1
  Experiment.at[index,'test_kappa']=test_kappa
  Experiment.at[index,'test_lizon-precision']=test_PPV
  Experiment.at[index,'test_lizon-recall']=test_TPR
  Experiment.at[index,'test_lizon-f1-score']=test_F1

  #---------------------------Save results to the log------
  try:
    SaveToExperimentLog(Experiments_file, Experiment_name, Experiment)
  except Exception as save_error:
    # Continue training even if the log cannot be written, but show why it failed.
    # Only Exception is caught, so a keyboard interrupt still stops the loop.
    print('Error saving to file!')
    print(repr(save_error))

Processing fold 0, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  0
maxlen                                              256
batch_size                                           16
epochs                                                5
lr                                                1e-05
method                                     fit_onecycle
weighted_avg_F1                                     NaN
kappa                                               NaN
lizon-precision                                     NaN
lizon-recall                                        NaN
lizon-f1-score                                      NaN
duration                                            NaN
comment                                             NaN
test_weighted_avg_F1                                NaN
test_kappa                                          NaN
test_lizon-precision                

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

preprocessing train...
language: ru
train sequence lengths:
	mean : 60
	95percentile : 133
	99percentile : 227


Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 123
	99percentile : 194


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5


Downloading:   0%|          | 0.00/500M [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.98      1.00      0.99       634
       lizon       0.91      0.72      0.81        43

    accuracy                           0.98       677
   macro avg       0.95      0.86      0.90       677
weighted avg       0.98      0.98      0.98       677

              precision    recall  f1-score   support

       Other       0.99      0.99      0.99      2719
       lizon       0.88      0.80      0.84       181

    accuracy                           0.98      2900
   macro avg       0.94      0.90      0.92      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 1, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  1
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 68
	95percentile : 142
	99percentile : 234


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.99      0.99      0.99       634
       lizon       0.85      0.81      0.83        43

    accuracy                           0.98       677
   macro avg       0.92      0.90      0.91       677
weighted avg       0.98      0.98      0.98       677

              precision    recall  f1-score   support

       Other       0.99      0.99      0.99      2719
       lizon       0.88      0.80      0.84       181

    accuracy                           0.98      2900
   macro avg       0.94      0.90      0.92      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 2, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  2
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 60
	95percentile : 137
	99percentile : 228


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.99      1.00      0.99       634
       lizon       0.97      0.79      0.87        43

    accuracy                           0.99       677
   macro avg       0.98      0.89      0.93       677
weighted avg       0.99      0.99      0.98       677

              precision    recall  f1-score   support

       Other       0.99      0.99      0.99      2719
       lizon       0.88      0.83      0.85       181

    accuracy                           0.98      2900
   macro avg       0.94      0.91      0.92      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 3, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  3
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 58
	95percentile : 133
	99percentile : 184


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.99      1.00      0.99       633
       lizon       0.92      0.80      0.85        44

    accuracy                           0.98       677
   macro avg       0.95      0.90      0.92       677
weighted avg       0.98      0.98      0.98       677

              precision    recall  f1-score   support

       Other       0.99      0.99      0.99      2719
       lizon       0.89      0.80      0.84       181

    accuracy                           0.98      2900
   macro avg       0.94      0.89      0.91      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 4, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  4
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 134
	99percentile : 205


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.98      1.00      0.99       633
       lizon       0.91      0.73      0.81        44

    accuracy                           0.98       677
   macro avg       0.95      0.86      0.90       677
weighted avg       0.98      0.98      0.98       677

              precision    recall  f1-score   support

       Other       0.99      1.00      0.99      2719
       lizon       0.94      0.80      0.86       181

    accuracy                           0.98      2900
   macro avg       0.96      0.90      0.93      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 5, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  5
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 58
	95percentile : 130
	99percentile : 237


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.99      0.99      0.99       633
       lizon       0.90      0.84      0.87        43

    accuracy                           0.98       676
   macro avg       0.94      0.92      0.93       676
weighted avg       0.98      0.98      0.98       676

              precision    recall  f1-score   support

       Other       0.99      1.00      0.99      2719
       lizon       0.94      0.81      0.87       181

    accuracy                           0.98      2900
   macro avg       0.96      0.90      0.93      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 6, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  6
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 129
	99percentile : 218


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       1.00      0.99      0.99       633
       lizon       0.89      0.93      0.91        43

    accuracy                           0.99       676
   macro avg       0.94      0.96      0.95       676
weighted avg       0.99      0.99      0.99       676

              precision    recall  f1-score   support

       Other       0.99      0.99      0.99      2719
       lizon       0.87      0.81      0.84       181

    accuracy                           0.98      2900
   macro avg       0.93      0.90      0.91      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 7, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  7
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 60
	95percentile : 131
	99percentile : 255


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.99      0.99      0.99       633
       lizon       0.85      0.81      0.83        43

    accuracy                           0.98       676
   macro avg       0.92      0.90      0.91       676
weighted avg       0.98      0.98      0.98       676

              precision    recall  f1-score   support

       Other       0.99      1.00      0.99      2719
       lizon       0.92      0.78      0.84       181

    accuracy                           0.98      2900
   macro avg       0.95      0.89      0.92      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 8, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  8
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 126
	99percentile : 222


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.99      0.99      0.99       633
       lizon       0.89      0.91      0.90        43

    accuracy                           0.99       676
   macro avg       0.94      0.95      0.94       676
weighted avg       0.99      0.99      0.99       676

              precision    recall  f1-score   support

       Other       0.99      0.99      0.99      2719
       lizon       0.88      0.83      0.86       181

    accuracy                           0.98      2900
   macro avg       0.94      0.91      0.92      2900
weighted avg       0.98      0.98      0.98      2900

Processing fold 9, model blinoff/roberta-base-russian-v0 started...
Model                   blinoff/roberta-base-russian-v0
fold                                                  9
maxlen                                              2

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 200


preprocessing test...
language: ru
test sequence lengths:
	mean : 59
	95percentile : 132
	99percentile : 198


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.99      0.99      0.99       633
       lizon       0.85      0.79      0.82        43

    accuracy                           0.98       676
   macro avg       0.92      0.89      0.90       676
weighted avg       0.98      0.98      0.98       676

              precision    recall  f1-score   support

       Other       0.99      0.99      0.99      2719
       lizon       0.91      0.80      0.85       181

    accuracy                           0.98      2900
   macro avg       0.95      0.90      0.92      2900
weighted avg       0.98      0.98      0.98      2900



In [23]:
# Build one row per (model, score) with one column per fold.
folds = [str(i) for i in range(kfold)]
score_names = ['lizon-precision', 'lizon-recall','lizon-f1-score','test_lizon-precision', 'test_lizon-recall','test_lizon-f1-score','weighted_avg_F1','kappa','test_weighted_avg_F1','test_kappa']

# Collect the one-row frames and concatenate them once, instead of growing `result`
# with DataFrame.append inside the loop.
score_rows = []
for model in Experiment['Model'].unique():
  model_rows = Experiment[Experiment['Model']==model]
  for score in score_names:
    # Transposing the single score column gives one row whose columns are the
    # experiment rows of this model; they are relabelled with the fold names below,
    # which requires exactly `kfold` rows per model.
    df = model_rows[[score]].transpose().reset_index()
    df['Model']=model
    df.columns=['Score']+folds+['Model']
    score_rows.append(df[['Model','Score'] + folds])

# pd.concat rejects an empty list, so keep an empty frame when there is nothing to combine.
result = pd.concat(score_rows) if score_rows else pd.DataFrame()

In [24]:
# Renumber the rows from 0 and keep only the identifying columns plus the fold columns.
result = result.reset_index(drop=True)[['Model','Score'] + folds]

In [25]:
# Row-wise summary statistics computed over the fold columns only.
fold_scores = result[folds]
for stat_name in ('mean', 'sem', 'std', 'median'):
  result[stat_name] = getattr(fold_scores, stat_name)(axis=1)

In [26]:
result

Unnamed: 0,Model,Score,0,1,2,3,4,5,6,7,8,9,mean,sem,std,median
0,blinoff/roberta-base-russian-v0,lizon-precision,0.911765,0.853659,0.971429,0.921053,0.914286,0.9,0.888889,0.853659,0.886364,0.85,0.89511,0.011904,0.037643,0.894444
1,blinoff/roberta-base-russian-v0,lizon-recall,0.72093,0.813953,0.790698,0.795455,0.727273,0.837209,0.930233,0.813953,0.906977,0.790698,0.812738,0.021158,0.066909,0.804704
2,blinoff/roberta-base-russian-v0,lizon-f1-score,0.805195,0.833333,0.871795,0.853659,0.810127,0.86747,0.909091,0.833333,0.896552,0.819277,0.849983,0.011328,0.035822,0.843496
3,blinoff/roberta-base-russian-v0,test_lizon-precision,0.884146,0.884146,0.882353,0.888889,0.941176,0.935897,0.874251,0.915584,0.883041,0.90625,0.899574,0.007566,0.023927,0.886518
4,blinoff/roberta-base-russian-v0,test_lizon-recall,0.801105,0.801105,0.828729,0.79558,0.79558,0.80663,0.80663,0.779006,0.834254,0.801105,0.804972,0.00508,0.016065,0.801105
5,blinoff/roberta-base-russian-v0,test_lizon-f1-score,0.84058,0.84058,0.854701,0.83965,0.862275,0.866469,0.83908,0.841791,0.857955,0.85044,0.849352,0.003291,0.010406,0.846115
6,blinoff/roberta-base-russian-v0,weighted_avg_F1,0.976627,0.979091,0.984518,0.981668,0.97666,0.983453,0.988291,0.97906,0.986758,0.977436,0.981356,0.001341,0.004242,0.980379
7,blinoff/roberta-base-russian-v0,kappa,0.793619,0.822316,0.864045,0.844278,0.798524,0.858814,0.902765,0.822299,0.889438,0.807473,0.840357,0.011967,0.037842,0.833297
8,blinoff/roberta-base-russian-v0,test_weighted_avg_F1,0.980597,0.980597,0.982156,0.980542,0.983514,0.983943,0.980326,0.981033,0.98253,0.981906,0.981714,0.000413,0.001307,0.981469
9,blinoff/roberta-base-russian-v0,test_kappa,0.830523,0.830523,0.845351,0.829604,0.853923,0.85828,0.828827,0.83216,0.848785,0.841135,0.839911,0.00351,0.011101,0.836647


In [27]:
import scipy.stats as stats
# Significance level, and the matching confidence level as its complement.
alpha = 0.05
confidence = 1.0 - alpha

In [28]:
# Start both report columns as empty strings.
for report_column in ('Shapiro test', 'Confidence Interval (+/-)'):
  result[report_column] = ''

In [29]:
# For every score row: run the Shapiro-Wilk test on the fold values and, when normality
# is not rejected, store the half-width of the Student-t confidence interval.
for index, row in result.iterrows():
  fold_values = row[folds].values.tolist()
  _, p_value = stats.shapiro(fold_values)
  if not (p_value < alpha):
    result.at[index,'Shapiro test'] = 'The score is normally distributed with alpha=%s'%alpha
    # Standard error of the mean times the two-sided t quantile with kfold-1 degrees of freedom.
    t_quantile = stats.t.ppf((1 + confidence) / 2., kfold-1)
    result.at[index,'Confidence Interval (+/-)'] = row['sem'] * t_quantile
  else:
    result.at[index,'Shapiro test'] = 'The null hypothesis that the score are normally distributed is rejected with alpha=%s'%alpha

In [30]:
result

Unnamed: 0,Model,Score,0,1,2,3,4,5,6,7,8,9,mean,sem,std,median,Shapiro test,Confidence Interval (+/-)
0,blinoff/roberta-base-russian-v0,lizon-precision,0.911765,0.853659,0.971429,0.921053,0.914286,0.9,0.888889,0.853659,0.886364,0.85,0.89511,0.011904,0.037643,0.894444,The score is normally distributed with alpha=0.05,0.0269283
1,blinoff/roberta-base-russian-v0,lizon-recall,0.72093,0.813953,0.790698,0.795455,0.727273,0.837209,0.930233,0.813953,0.906977,0.790698,0.812738,0.021158,0.066909,0.804704,The score is normally distributed with alpha=0.05,0.0478636
2,blinoff/roberta-base-russian-v0,lizon-f1-score,0.805195,0.833333,0.871795,0.853659,0.810127,0.86747,0.909091,0.833333,0.896552,0.819277,0.849983,0.011328,0.035822,0.843496,The score is normally distributed with alpha=0.05,0.0256258
3,blinoff/roberta-base-russian-v0,test_lizon-precision,0.884146,0.884146,0.882353,0.888889,0.941176,0.935897,0.874251,0.915584,0.883041,0.90625,0.899574,0.007566,0.023927,0.886518,The null hypothesis that the score are normall...,
4,blinoff/roberta-base-russian-v0,test_lizon-recall,0.801105,0.801105,0.828729,0.79558,0.79558,0.80663,0.80663,0.779006,0.834254,0.801105,0.804972,0.00508,0.016065,0.801105,The score is normally distributed with alpha=0.05,0.0114925
5,blinoff/roberta-base-russian-v0,test_lizon-f1-score,0.84058,0.84058,0.854701,0.83965,0.862275,0.866469,0.83908,0.841791,0.857955,0.85044,0.849352,0.003291,0.010406,0.846115,The score is normally distributed with alpha=0.05,0.00744378
6,blinoff/roberta-base-russian-v0,weighted_avg_F1,0.976627,0.979091,0.984518,0.981668,0.97666,0.983453,0.988291,0.97906,0.986758,0.977436,0.981356,0.001341,0.004242,0.980379,The score is normally distributed with alpha=0.05,0.00303459
7,blinoff/roberta-base-russian-v0,kappa,0.793619,0.822316,0.864045,0.844278,0.798524,0.858814,0.902765,0.822299,0.889438,0.807473,0.840357,0.011967,0.037842,0.833297,The score is normally distributed with alpha=0.05,0.0270707
8,blinoff/roberta-base-russian-v0,test_weighted_avg_F1,0.980597,0.980597,0.982156,0.980542,0.983514,0.983943,0.980326,0.981033,0.98253,0.981906,0.981714,0.000413,0.001307,0.981469,The score is normally distributed with alpha=0.05,0.000934639
9,blinoff/roberta-base-russian-v0,test_kappa,0.830523,0.830523,0.845351,0.829604,0.853923,0.85828,0.828827,0.83216,0.848785,0.841135,0.839911,0.00351,0.011101,0.836647,The score is normally distributed with alpha=0.05,0.00794086


In [31]:
# Store the aggregated result table in the experiment log, in a sheet named after
# the experiment with ' Result' appended.
SaveToExperimentLog(
    Experiments_file,
    Experiment_name + ' Result',
    result,
)