This notebook trains a model (or rather a set of models, one for each of several data folds) to find clones among eva.ru forum users, based on the user fedulya.

In [None]:
# Mount Google Drive at /content/drive; the paths configured further down all live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf

In [None]:
######## GPU CONFIGS FOR RTX 2070 ###############
## Please ignore if not training on GPU       ##
## this is important for running CuDNN on GPU ##

tf.keras.backend.clear_session() #- for easy reset of notebook state

# check if GPU can be seen by TF (the returned list is not used here)
tf.config.list_physical_devices('GPU')
#tf.debugging.set_log_device_placement(True)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    # Turn on memory growth for the first GPU, then make it the only visible GPU
    tf.config.experimental.set_memory_growth(gpus[0], True)
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)
###############################################

1 Physical GPUs, 1 Logical GPU


In [None]:
from psutil import virtual_memory

# Total memory reported by psutil for this runtime, expressed in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the cut-off used here to tell a high-RAM runtime from a standard one.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Capture the output lines of `nvidia-smi` through the IPython shell escape.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the output is taken as the sign that no GPU is attached.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Jan  3 20:26:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    32W / 250W |    375MiB / 16280MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
#experiment duration
import time

In [None]:
#Saving into log (Excel file)
import openpyxl 
def SaveToExperimentLog(Experiments_file, LogEntry, data):
    """Write ``data`` into one sheet of an existing Excel experiment log.

    The workbook is edited directly with openpyxl. The earlier approach
    (assigning ``writer.book`` / ``writer.sheets`` on a ``pd.ExcelWriter`` and
    calling ``writer.save()``) depends on pandas internals that newer pandas
    releases no longer provide, which would make every save fail.

    Parameters
    ----------
    Experiments_file : str
        Path of an existing ``.xlsx`` workbook; it is loaded, modified and
        saved back to the same path.
    LogEntry : str
        Name of the target sheet. Only the first 29 characters are used.
    data : pandas.DataFrame
        Table to store. The column names go into row 1 and the records follow
        from row 2; the index is not written.

    Notes
    -----
    An existing sheet is overwritten cell by cell starting at A1; cells outside
    the written range and all other sheets are kept. Missing values (NaN/None)
    are stored as empty cells.
    """
    sheet_name = LogEntry[0:29]

    def _cell_value(value):
        # Excel has no NaN: store missing values as empty cells.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        # NumPy scalars expose .item(); hand plain Python numbers to openpyxl.
        if hasattr(value, 'item') and not isinstance(value, (str, bytes)):
            return value.item()
        return value

    book = openpyxl.load_workbook(Experiments_file)
    try:
        if sheet_name in book.sheetnames:
            sheet = book[sheet_name]
        else:
            sheet = book.create_sheet(sheet_name)

        for col_idx, column_name in enumerate(data.columns, start=1):
            sheet.cell(row=1, column=col_idx).value = column_name

        for row_idx, record in enumerate(data.itertuples(index=False, name=None), start=2):
            for col_idx, value in enumerate(record, start=1):
                # Assign through .value so that an empty value also clears an old cell.
                sheet.cell(row=row_idx, column=col_idx).value = _cell_value(value)

        book.save(Experiments_file)
    finally:
        book.close()

In [None]:
!pip install pycm

Collecting pycm
  Downloading pycm-3.3-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 3.5 MB/s eta 0:00:011
Collecting art>=1.8
  Downloading art-5.4-py2.py3-none-any.whl (580 kB)
[K     |████████████████████████████████| 580 kB 15.3 MB/s eta 0:00:01
[?25hInstalling collected packages: art, pycm
Successfully installed art-5.4 pycm-3.3


In [None]:
#to get score metrics from the model and save in the experiment log
import pycm as cm
def model_metrics(np_confusion_matrix,class_names):
  """Compute summary scores from a confusion matrix with pycm.

  Parameters
  ----------
  np_confusion_matrix : array-like
    Square matrix of counts; row ``i`` and column ``j`` are labelled with
    ``class_names[i]`` and ``class_names[j]``.
  class_names : sequence
    Class labels, in the same order as the matrix rows/columns.

  Returns
  -------
  tuple
    ``(weighted average F1, Kappa, PPV, TPR, F1)`` as exposed by
    ``pycm.ConfusionMatrix``; the last three are indexed by class name
    by the callers in this notebook.
  """
  # .tolist() converts NumPy scalars into plain Python numbers. This replaces the
  # former eval(str(...)) round-trip, which depended on how NumPy prints its scalars.
  rows = np.asarray(np_confusion_matrix).tolist()
  d_confusion_matrix = {
      row_label: dict(zip(class_names, row))
      for row_label, row in zip(class_names, rows)
  }
  model_cm=cm.ConfusionMatrix(matrix=d_confusion_matrix)
  return model_cm.weighted_average('F1'), model_cm.Kappa, model_cm.PPV, model_cm.TPR, model_cm.F1

In [None]:
########## Ensure reproducibility ##########
import random

# Single seed shared by every pseudo-random generator seeded below
SEED = 42

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value.
# Hash randomization is configured when the interpreter starts, so this
# assignment can only influence processes launched from here on.
os.environ['PYTHONHASHSEED']=str(SEED)

#Does not work with ktrain
#os.environ['TF_DETERMINISTIC_OPS'] = '1'

# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(SEED)

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(SEED)

# 4. Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(SEED)

In [None]:
!pip install ktrain

In [None]:
import ktrain
from ktrain import text

In [None]:
# ---- Data locations ----
Data = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Data/'

# Folder the trained predictors are saved into (read by fit_onecycle).
# NOTE(review): `Models` was used by the notebook but never assigned in it, so a
# fresh kernel failed with NameError. The path mirrors the layout of `Data` and
# the experiment log - confirm that it is the intended location.
Models = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/'

Messages_filename='fedulya_data_for_finetuning.csv'
Messages_full_filename=os.path.join(Data, Messages_filename)

# ---- Cross-validation and column names ----
kfold=10
text_column='message'
target_column='target'

#known_clon_Messages_filename='fedulya_clon_data_for_testg.csv'
#known_clon_Messages_full_filename=os.path.join(Data, known_clon_Messages_filename)

# ---- Pre-computed folds and the held-out test set ----
folds_folder='fedulya_folds_testds'

# Alternative test set kept for reference: 'fedulya_clon_data_for_testg.csv',
# located directly in Data (os.path.join(Data, test_Messages_filename)).
test_Messages_filename='fedulya_test.csv'
test_Messages_full_filename= os.path.join(Data, folds_folder, test_Messages_filename)

# File-name templates of the fold files; %s is replaced by the fold number
fold_X_train_filename='fedulya_X_train_%s.csv'
fold_X_valid_filename='fedulya_X_valid_%s.csv'
fold_y_train_filename='fedulya_y_train_%s.csv'
fold_y_valid_filename='fedulya_y_valid_%s.csv'

train_Messages_filename='fedulya_data_for_finetuning_train.csv'
train_Messages_full_filename=os.path.join(Data, train_Messages_filename)

#valid_Messages_filename='fedulya_data_for_finetuning_valid.csv'
#valid_Messages_full_filename=os.path.join(Data, valid_Messages_filename)

# ---- Experiment ----
#Experiments log file
Experiments_file='/content/drive/MyDrive/Colab Notebooks/Projects/eva/ExperimentLogs/fedulya.xlsx'
# Name of the sheet (tab) in the log file that holds this experiment's configuration
Experiment_name='message'
#Experiment can be continued from the lines in the configuration tab (Experiment_name) without results (NewExecution=False) or started from scratch ignoring previous results (NewExecution=True)
NewExecution=False

## Experiment
The experiment is configured in an experiment log file (in my case, an Excel file with different tabs).

In [None]:
# Load the experiment table from its sheet of the log workbook. Passing the path
# (instead of an open file object) lets pandas open and close the file itself.
Experiment = pd.read_excel(Experiments_file, sheet_name=Experiment_name)
# Keep the free-text column as strings. The conversion has to be assigned back:
# calling .apply(str) without storing the result leaves the column unchanged.
Experiment['comment'] = Experiment['comment'].fillna('').astype(str)
Experiment.tail()

Unnamed: 0,Model,fold,maxlen,batch_size,epochs,lr,method,weighted_avg_F1,kappa,fedulya-precision,fedulya-recall,fedulya-f1-score,duration,comment,test_weighted_avg_F1,test_kappa,test_fedulya-precision,test_fedulya-recall,test_fedulya-f1-score
5,blinoff/roberta-base-russian-v0,5,256,16,5,1e-05,fit_onecycle,0.964645,0.803923,0.841935,0.805556,0.823344,86.034477,,0.968316,0.825326,0.86717,0.819544,0.842684
6,blinoff/roberta-base-russian-v0,6,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,
7,blinoff/roberta-base-russian-v0,7,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,
8,blinoff/roberta-base-russian-v0,8,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,
9,blinoff/roberta-base-russian-v0,9,256,16,5,1e-05,fit_onecycle,,,,,,,,,,,,


## Data load and/or split

### Train/validation folds and Test data

In [None]:
# The fold files carry their fold number at the end of the file name (see the
# fold_*_filename templates). The lists are ordered by that number rather than by
# modification time, because modification times change when files are copied or
# re-synced, which would silently pair an X file with the wrong y file.
import glob
import re

folds_folder_full_path = os.path.join(Data, folds_folder)

def _fold_number(path):
  """Return the fold number that ends the file name, e.g. ``..._7.csv`` -> 7."""
  match = re.search(r'(\d+)\.csv$', os.path.basename(path))
  if match is None:
    raise ValueError('Cannot read the fold number from file name: %s' % path)
  return int(match.group(1))

def _list_fold_files(pattern):
  """Return the files of the folds folder matching ``pattern``, ordered by fold number."""
  candidates = glob.glob(folds_folder_full_path + '/' + pattern)
  return sorted(filter(os.path.isfile, candidates), key=_fold_number)

list_of_X_train_files = _list_fold_files('*X_train*')
list_of_y_train_files = _list_fold_files('*y_train*')
list_of_X_valid_files = _list_fold_files('*X_valid*')
list_of_y_valid_files = _list_fold_files('*y_valid*')

# The experiment loop picks a fold by list position, so every list must hold
# exactly the folds 0..n-1, in that order.
_expected_folds = list(range(len(list_of_X_train_files)))
for _files in (list_of_X_train_files, list_of_y_train_files, list_of_X_valid_files, list_of_y_valid_files):
  if [_fold_number(_path) for _path in _files] != _expected_folds:
    raise ValueError('Fold files are incomplete or inconsistent in %s' % folds_folder_full_path)

In [None]:
def _read_fold_column(files, column):
  """Read ``column`` from every CSV file in ``files``; return one list of values per file."""
  values_per_file = []
  for path in files:
    try:
      # Malformed lines are skipped with a warning (pandas >= 1.3 spelling)
      frame = pd.read_csv(path, on_bad_lines='warn', index_col=False)
    except TypeError:
      # Older pandas only knows the previous spelling of the same option
      frame = pd.read_csv(path, error_bad_lines=False, index_col=False)
    values_per_file.append(frame[column].tolist())
  return values_per_file

X_train_data = _read_fold_column(list_of_X_train_files, text_column)
y_train_data = _read_fold_column(list_of_y_train_files, target_column)
X_valid_data = _read_fold_column(list_of_X_valid_files, text_column)
y_valid_data = _read_fold_column(list_of_y_valid_files, target_column)

# Texts and labels are stored in separate files: a skipped line or a missing file
# would shift one against the other, so check that they still line up.
for _name, _texts, _labels in (('train', X_train_data, y_train_data), ('valid', X_valid_data, y_valid_data)):
  if len(_texts) != len(_labels):
    raise ValueError('Different number of X and y %s files: %s vs %s' % (_name, len(_texts), len(_labels)))
  for _fold, (_x, _y) in enumerate(zip(_texts, _labels)):
    if len(_x) != len(_y):
      raise ValueError('Fold %s (%s): %s texts but %s labels' % (_fold, _name, len(_x), len(_y)))

In [None]:
# Held-out test set; only the text and target columns are loaded.
try:
  # Malformed lines are skipped with a warning (pandas >= 1.3 spelling)
  df_test=pd.read_csv(test_Messages_full_filename, on_bad_lines='warn', index_col=False, usecols=[target_column, text_column])
except TypeError:
  # Older pandas only knows the previous spelling of the same option
  df_test=pd.read_csv(test_Messages_full_filename, error_bad_lines=False, index_col=False, usecols=[target_column, text_column])

In [None]:
#split data for train/test
#from sklearn import  model_selection
#df=pd.read_csv(Messages_full_filename, error_bad_lines=False, index_col=False, usecols=[target_column, text_column])

#df_trainvalid, df_test = model_selection.train_test_split(df, test_size=0.3, random_state=42,shuffle=True)
#df_test.to_csv(test_Messages_full_filename, header=True, index=False)

In [None]:
#split data for cross validation
#from sklearn.model_selection import StratifiedKFold
#skf = StratifiedKFold(n_splits=kfold, random_state=42, shuffle=True)
#X_train_data=list()
#y_train_data=list()
#X_valid_data=list()
#y_valid_data=list()   
#for i, (train_index, test_index) in enumerate(skf.split(df_trainvalid[text_column], df_trainvalid[target_column])):
#  print(' fold: {}  of  {} : '.format(i+1, kfold))
#  X_train, X_valid = df_trainvalid[text_column].iloc[train_index].values.astype(str), df_trainvalid[text_column].iloc[test_index].values.astype(str)
#  y_train, y_valid = df_trainvalid[target_column].iloc[train_index].values.astype(str), df_trainvalid[target_column].iloc[test_index].values.astype(str)

#  X_train_data.append(X_train)  
#  fold_X_train_full_filename=os.path.join(Data, folds_folder, fold_X_train_filename%i)
#  pd.DataFrame(X_train, columns = [text_column]).to_csv(fold_X_train_full_filename, header=True, index=False)

#  X_valid_data.append(X_valid)
#  fold_X_valid_full_filename=os.path.join(Data, folds_folder, fold_X_valid_filename%i)
#  pd.DataFrame(X_valid, columns = [text_column]).to_csv(fold_X_valid_full_filename, header=True, index=False)


#  y_train_data.append(y_train)
#  fold_y_train_full_filename=os.path.join(Data, folds_folder, fold_y_train_filename%i)
#  pd.DataFrame(y_train, columns = [target_column]).to_csv(fold_y_train_full_filename, header=True, index=False)

#  y_valid_data.append(y_valid)
#  fold_y_valid_full_filename=os.path.join(Data, folds_folder, fold_y_valid_filename%i)
#  pd.DataFrame(y_valid, columns = [target_column]).to_csv(fold_y_valid_full_filename, header=True, index=False)


#  print('size of training fold: %s' % (len( X_train)))
#  unique, counts = np.unique(y_train, return_counts=True)
#  print('classes size in training fold:')
#  print(dict(zip(unique, counts)))
#  print('size of validation fold: %s' % (len(X_valid)))
#  unique, counts = np.unique(y_valid, return_counts=True)
#  print('classes size in validation fold:')
#  print(dict(zip(unique, counts)))

## Model training and evaluation

In [None]:
def fit_onecycle(MODEL_NAME, maxlen,batch_size,lr,epochs,x_train, y_train, x_valid, y_valid,ind):
  """Fine-tune one transformer on a fold, save it, and score it on validation and test data.

  Besides its arguments the function reads these notebook-level names:
  ``df_test``, ``text_column`` and ``target_column`` (test data), ``Models``
  (folder for the saved predictor) and ``model_metrics``.

  Parameters
  ----------
  MODEL_NAME : model name handed to ``text.Transformer``
  maxlen : ``maxlen`` argument of ``text.Transformer``
  batch_size : batch size handed to ``ktrain.get_learner``
  lr, epochs : arguments of ``learner.fit_onecycle``
  x_train, y_train : training texts and labels
  x_valid, y_valid : validation texts and labels
  ind : suffix of the saved predictor folder, ``fit_onecycle_<ind>``

  Returns
  -------
  tuple of ten values: weighted average F1, kappa, then precision (PPV),
  recall (TPR) and F1 of the class 'fedulya' - first for the validation
  data, then the same five for the test data.
  """
  transformer = text.Transformer(MODEL_NAME, maxlen=maxlen)
  train_set = transformer.preprocess_train(x_train, y_train)
  valid_set = transformer.preprocess_test(x_valid, y_valid)
  test_set = transformer.preprocess_test(df_test[text_column].tolist(), df_test[target_column].tolist())

  classifier = transformer.get_classifier()
  learner = ktrain.get_learner(classifier, train_data=train_set, val_data=valid_set, batch_size=batch_size)
  learner.fit_onecycle(lr=lr, epochs=epochs)

  # Persist the fine-tuned model together with its preprocessing
  save_path = os.path.join(Models, 'fit_onecycle_'+str(ind))
  ktrain.get_predictor(learner.model, preproc=transformer).save(save_path)

  def _evaluate(dataset):
    # Confusion matrix of the trained model on `dataset`, turned into scores
    matrix = learner.validate(val_data=dataset, class_names=transformer.get_classes())
    return model_metrics(np_confusion_matrix=matrix, class_names=transformer.get_classes())

  val_wf1, val_kappa, val_ppv, val_tpr, val_f1 = _evaluate(valid_set)
  test_wf1, test_kappa, test_ppv, test_tpr, test_f1 = _evaluate(test_set)

  return (
      val_wf1, val_kappa, val_ppv['fedulya'], val_tpr['fedulya'], val_f1['fedulya'],
      test_wf1, test_kappa, test_ppv['fedulya'], test_tpr['fedulya'], test_f1['fedulya'],
  )

### Experiment main loop

In [None]:
# Columns that receive the ten values returned by fit_onecycle, in the same order
score_columns = [
    'weighted_avg_F1', 'kappa', 'fedulya-precision', 'fedulya-recall', 'fedulya-f1-score',
    'test_weighted_avg_F1', 'test_kappa', 'test_fedulya-precision', 'test_fedulya-recall', 'test_fedulya-f1-score',
]

for index, row in Experiment.iterrows():
  print('Processing fold %s, model %s started...'%(row['fold'], row['Model']))
  # Resume mode: a positive duration marks a row that already has results.
  # Rows without results hold NaN, and NaN > 0 is False, so they are processed.
  if not NewExecution and row['duration'] > 0:
    print('%s is already processed. Continue'%(row['Model']))
    continue

  print(row)
  print('---------------------------------------------')

  # The fold number selects the data by list position, so it must be an integer
  fold = int(row['fold'])

  # A training failure is not caught: it stops the run right here, before anything
  # is recorded for this row, so the row is retried on the next resumed run.
  ts_start = time.time()
  scores = fit_onecycle(row['Model'],row['maxlen'],row['batch_size'],row['lr'],row['epochs'],X_train_data[fold],y_train_data[fold],X_valid_data[fold], y_valid_data[fold],index)
  ts_end = time.time()

  # Duration is stored in minutes
  Experiment.at[index,'duration']=(ts_end - ts_start)/60
  for column, value in zip(score_columns, scores):
    Experiment.at[index,column]=value

  #---------------------------Save results to the log------
  try:
    SaveToExperimentLog(Experiments_file, Experiment_name, Experiment)
  except Exception as save_error:
    #Continue training even if there is an issue, but show what went wrong
    print('Error saving to file!', repr(save_error))

Processing fold 0, model blinoff/roberta-base-russian-v0 started...
blinoff/roberta-base-russian-v0 is already processed. Continue
Processing fold 1, model blinoff/roberta-base-russian-v0 started...
blinoff/roberta-base-russian-v0 is already processed. Continue
Processing fold 2, model blinoff/roberta-base-russian-v0 started...
blinoff/roberta-base-russian-v0 is already processed. Continue
Processing fold 3, model blinoff/roberta-base-russian-v0 started...
blinoff/roberta-base-russian-v0 is already processed. Continue
Processing fold 4, model blinoff/roberta-base-russian-v0 started...
blinoff/roberta-base-russian-v0 is already processed. Continue
Processing fold 5, model blinoff/roberta-base-russian-v0 started...
blinoff/roberta-base-russian-v0 is already processed. Continue
Processing fold 6, model blinoff/roberta-base-russian-v0 started...
Model                     blinoff/roberta-base-russian-v0
fold                                                    6
maxlen                        

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

preprocessing train...
language: ru
train sequence lengths:
	mean : 27
	95percentile : 72
	99percentile : 92


Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 28
	95percentile : 72
	99percentile : 92


preprocessing test...
language: ru
test sequence lengths:
	mean : 27
	95percentile : 73
	99percentile : 93


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5


Downloading:   0%|          | 0.00/500M [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.98      0.98      0.98      2875
     fedulya       0.84      0.82      0.83       324

    accuracy                           0.97      3199
   macro avg       0.91      0.90      0.91      3199
weighted avg       0.97      0.97      0.97      3199

              precision    recall  f1-score   support

       Other       0.98      0.98      0.98     12308
     fedulya       0.84      0.82      0.83      1402

    accuracy                           0.97     13710
   macro avg       0.91      0.90      0.91     13710
weighted avg       0.97      0.97      0.97     13710

Processing fold 7, model blinoff/roberta-base-russian-v0 started...
Model                     blinoff/roberta-base-russian-v0
fold                                                    7
maxlen                                           

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 27
	95percentile : 73
	99percentile : 91


preprocessing test...
language: ru
test sequence lengths:
	mean : 27
	95percentile : 73
	99percentile : 93


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.98      0.98      0.98      2875
     fedulya       0.83      0.86      0.85       323

    accuracy                           0.97      3198
   macro avg       0.91      0.92      0.91      3198
weighted avg       0.97      0.97      0.97      3198

              precision    recall  f1-score   support

       Other       0.98      0.98      0.98     12308
     fedulya       0.83      0.85      0.84      1402

    accuracy                           0.97     13710
   macro avg       0.91      0.92      0.91     13710
weighted avg       0.97      0.97      0.97     13710

Processing fold 8, model blinoff/roberta-base-russian-v0 started...
Model                     blinoff/roberta-base-russian-v0
fold                                                    8
maxlen                                           

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 27
	95percentile : 72
	99percentile : 94


preprocessing test...
language: ru
test sequence lengths:
	mean : 27
	95percentile : 73
	99percentile : 93


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.98      0.99      0.98      2875
     fedulya       0.89      0.82      0.85       323

    accuracy                           0.97      3198
   macro avg       0.93      0.90      0.92      3198
weighted avg       0.97      0.97      0.97      3198

              precision    recall  f1-score   support

       Other       0.98      0.99      0.98     12308
     fedulya       0.86      0.82      0.84      1402

    accuracy                           0.97     13710
   macro avg       0.92      0.90      0.91     13710
weighted avg       0.97      0.97      0.97     13710

Processing fold 9, model blinoff/roberta-base-russian-v0 started...
Model                     blinoff/roberta-base-russian-v0
fold                                                    9
maxlen                                           

Is Multi-Label? False
preprocessing test...
language: ru
test sequence lengths:
	mean : 27
	95percentile : 72
	99percentile : 92


preprocessing test...
language: ru
test sequence lengths:
	mean : 27
	95percentile : 73
	99percentile : 93


404 Client Error: Not Found for url: https://huggingface.co/blinoff/roberta-base-russian-v0/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 1e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

       Other       0.98      0.99      0.98      2875
     fedulya       0.87      0.78      0.82       323

    accuracy                           0.97      3198
   macro avg       0.92      0.88      0.90      3198
weighted avg       0.96      0.97      0.96      3198

              precision    recall  f1-score   support

       Other       0.98      0.99      0.98     12308
     fedulya       0.87      0.80      0.84      1402

    accuracy                           0.97     13710
   macro avg       0.92      0.90      0.91     13710
weighted avg       0.97      0.97      0.97     13710



In [None]:
# Build one row per (model, score) with one column per fold.
folds = [str(i) for i in range(kfold)]
score_names = ['fedulya-precision', 'fedulya-recall','fedulya-f1-score','test_fedulya-precision', 'test_fedulya-recall','test_fedulya-f1-score','weighted_avg_F1','kappa','test_weighted_avg_F1','test_kappa']

# The per-score frames are collected in a list and concatenated once at the end,
# instead of growing `result` with DataFrame.append inside the loop.
score_frames = []
for model in Experiment['Model'].unique():
  # Order the rows by fold so that column '0' really holds fold 0, and so on
  model_rows = Experiment[Experiment['Model']==model].sort_values('fold')
  for score in score_names:
    # One column of scores becomes one row; reset_index turns the score name into a column
    df = model_rows[[score]].transpose().reset_index()
    df['Model']=model
    df.columns=['Score']+folds+['Model']
    df=df[['Model','Score'] + folds]
    score_frames.append(df)

result = pd.concat(score_frames) if score_frames else pd.DataFrame()

In [None]:
# Renumber the rows from 0 and keep only the model, score and per-fold columns.
result = result.reset_index(drop=True)[['Model','Score'] + folds]

In [None]:
# Row-wise summary statistics over the per-fold score columns.
fold_scores = result[folds]
result = result.assign(**{
    'mean': fold_scores.mean(axis=1),
    'sem': fold_scores.sem(axis=1),
    'std': fold_scores.std(axis=1),
    'median': fold_scores.median(axis=1),
})

In [None]:
# Show the per-fold scores together with their mean, sem, std and median
result

Unnamed: 0,Model,Score,0,1,2,3,4,5,6,7,8,9,mean,sem,std,median
0,blinoff/roberta-base-russian-v0,fedulya-precision,0.817073,0.826748,0.87541,0.878289,0.847619,0.841935,0.839117,0.830357,0.889262,0.865517,0.851133,0.007757,0.024529,0.844777
1,blinoff/roberta-base-russian-v0,fedulya-recall,0.829721,0.842105,0.826625,0.826625,0.824074,0.805556,0.820988,0.863777,0.820433,0.77709,0.823699,0.007085,0.022406,0.82535
2,blinoff/roberta-base-russian-v0,fedulya-f1-score,0.823349,0.834356,0.850318,0.851675,0.835681,0.823344,0.829953,0.846737,0.853462,0.818923,0.83678,0.004104,0.012979,0.835018
3,blinoff/roberta-base-russian-v0,test_fedulya-precision,0.839886,0.855876,0.862069,0.867625,0.85767,0.86717,0.841837,0.829285,0.861862,0.869029,0.855231,0.00431,0.013629,0.859766
4,blinoff/roberta-base-russian-v0,test_fedulya-recall,0.838088,0.825963,0.820257,0.818117,0.829529,0.819544,0.823823,0.852354,0.81883,0.804565,0.825107,0.004079,0.012899,0.82204
5,blinoff/roberta-base-russian-v0,test_fedulya-f1-score,0.838986,0.840653,0.840643,0.842144,0.843365,0.842684,0.832733,0.840661,0.839795,0.835556,0.839722,0.00104,0.003287,0.840648
6,blinoff/roberta-base-russian-v0,weighted_avg_F1,0.964174,0.966377,0.970241,0.970536,0.966972,0.964645,0.965762,0.968693,0.971034,0.964456,0.967289,0.000838,0.002651,0.966674
7,blinoff/roberta-base-russian-v0,kappa,0.80334,0.815562,0.834042,0.835576,0.817452,0.803923,0.811022,0.82914,0.837733,0.799791,0.818758,0.004563,0.014428,0.816507
8,blinoff/roberta-base-russian-v0,test_weighted_avg_F1,0.967089,0.967727,0.967847,0.968226,0.968257,0.968316,0.965994,0.967159,0.967689,0.967059,0.967536,0.000229,0.000723,0.967708
9,blinoff/roberta-base-russian-v0,test_kappa,0.820667,0.822861,0.822992,0.824749,0.825853,0.825326,0.813908,0.822233,0.822065,0.817624,0.821828,0.001161,0.003672,0.822547


In [None]:
import scipy.stats as stats
# Significance level: threshold for the Shapiro test p-value used below
alpha=0.05
# Confidence level of the interval computed from the fold scores below
confidence = 1 - alpha

In [None]:
# Start both report columns empty; they are filled row by row in the next cell
result['Shapiro test']=''
result['Confidence Interval (+/-)']=''

In [None]:
# For every (model, score) row: test the fold scores for normality and, when
# normality is not rejected, store the half-width of the t-based confidence interval.
for index, row in result.iterrows():
  # Use only the fold scores that exist. A missing score (NaN) would turn the
  # p-value into NaN, and NaN < alpha is False, which would be reported as normal.
  AnalyzedModelResults=[float(score) for score in row[folds].values.tolist() if not pd.isna(score)]
  # The Shapiro test needs at least three observations
  if len(AnalyzedModelResults) < 3:
    result.at[index,'Shapiro test'] = 'Not enough fold scores for the Shapiro test'
    continue
  shapiro_test = stats.shapiro(AnalyzedModelResults)
  shapiro_test_pvalue = shapiro_test[1]
  if shapiro_test_pvalue < alpha:
    result.at[index,'Shapiro test'] = 'The null hypothesis that the score are normally distributed is rejected with alpha=%s'%alpha
  else:
    # Strictly speaking normality is only "not rejected" here; the interval is
    # computed under that assumption.
    result.at[index,'Shapiro test'] = 'The score is normally distributed with alpha=%s'%alpha
    # Degrees of freedom follow the number of scores actually used (kfold-1 when none is missing)
    result.at[index,'Confidence Interval (+/-)'] = row['sem'] * stats.t.ppf((1 + confidence) / 2., len(AnalyzedModelResults)-1)

In [None]:
# Show the table again, now with the Shapiro test and confidence interval columns
result

Unnamed: 0,Model,Score,0,1,2,3,4,5,6,7,8,9,mean,sem,std,median,Shapiro test,Confidence Interval (+/-)
0,blinoff/roberta-base-russian-v0,fedulya-precision,0.817073,0.826748,0.87541,0.878289,0.847619,0.841935,0.839117,0.830357,0.889262,0.865517,0.851133,0.007757,0.024529,0.844777,The score is normally distributed with alpha=0.05,0.0175473
1,blinoff/roberta-base-russian-v0,fedulya-recall,0.829721,0.842105,0.826625,0.826625,0.824074,0.805556,0.820988,0.863777,0.820433,0.77709,0.823699,0.007085,0.022406,0.82535,The score is normally distributed with alpha=0.05,0.0160282
2,blinoff/roberta-base-russian-v0,fedulya-f1-score,0.823349,0.834356,0.850318,0.851675,0.835681,0.823344,0.829953,0.846737,0.853462,0.818923,0.83678,0.004104,0.012979,0.835018,The score is normally distributed with alpha=0.05,0.00928455
3,blinoff/roberta-base-russian-v0,test_fedulya-precision,0.839886,0.855876,0.862069,0.867625,0.85767,0.86717,0.841837,0.829285,0.861862,0.869029,0.855231,0.00431,0.013629,0.859766,The score is normally distributed with alpha=0.05,0.00974944
4,blinoff/roberta-base-russian-v0,test_fedulya-recall,0.838088,0.825963,0.820257,0.818117,0.829529,0.819544,0.823823,0.852354,0.81883,0.804565,0.825107,0.004079,0.012899,0.82204,The score is normally distributed with alpha=0.05,0.00922769
5,blinoff/roberta-base-russian-v0,test_fedulya-f1-score,0.838986,0.840653,0.840643,0.842144,0.843365,0.842684,0.832733,0.840661,0.839795,0.835556,0.839722,0.00104,0.003287,0.840648,The score is normally distributed with alpha=0.05,0.00235172
6,blinoff/roberta-base-russian-v0,weighted_avg_F1,0.964174,0.966377,0.970241,0.970536,0.966972,0.964645,0.965762,0.968693,0.971034,0.964456,0.967289,0.000838,0.002651,0.966674,The score is normally distributed with alpha=0.05,0.00189621
7,blinoff/roberta-base-russian-v0,kappa,0.80334,0.815562,0.834042,0.835576,0.817452,0.803923,0.811022,0.82914,0.837733,0.799791,0.818758,0.004563,0.014428,0.816507,The score is normally distributed with alpha=0.05,0.0103214
8,blinoff/roberta-base-russian-v0,test_weighted_avg_F1,0.967089,0.967727,0.967847,0.968226,0.968257,0.968316,0.965994,0.967159,0.967689,0.967059,0.967536,0.000229,0.000723,0.967708,The score is normally distributed with alpha=0.05,0.000517453
9,blinoff/roberta-base-russian-v0,test_kappa,0.820667,0.822861,0.822992,0.824749,0.825853,0.825326,0.813908,0.822233,0.822065,0.817624,0.821828,0.001161,0.003672,0.822547,The score is normally distributed with alpha=0.05,0.00262686


In [None]:
# Store the summary table in the experiment log, in the sheet named Experiment_name + ' Result'
SaveToExperimentLog(Experiments_file, Experiment_name+' Result', result)