In [0]:
from google.colab import drive
drive.mount('/gdrive')

In [0]:
import warnings
import pandas as pd
import numpy as np
import datetime
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.preprocessing import text
from keras import models, layers
from imblearn.under_sampling import ClusterCentroids, NearMiss, RandomUnderSampler
from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE

In [0]:
warnings.filterwarnings('ignore')
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets'
fasttext_data         = pd.read_csv('2020_05_09_encoded_Fasttext.csv', index_col=0)
base_data             = pd.read_csv('papildinatie_jautajumi_cleaned.csv', index_col=0)
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded'
bert_big_data         = pd.read_csv('2020_05_10_encoded_BERT_big.csv', index_col=0)
bert_multi_cased_data = pd.read_csv('2020_05_10_encoded_BERT_multi_cased.csv', index_col=0)
bert_small_2_data     = pd.read_csv('2020_05_10_encoded_BERT_small_2.csv', index_col=0)
bert_small_data       = pd.read_csv('2020_05_10_encoded_BERT_small.csv', index_col=0)
print('Papildinato jautājumu kopa: ', base_data.shape, 'FastText iekodētā datu kopa:', fasttext_data.shape)
print('BERT 1 dati:', bert_small_data.shape, 'BERT 2 dati:',  bert_small_2_data.shape, 'BERT 3 dati:', bert_multi_cased_data.shape, 'BERT 4 dati:', bert_big_data.shape)

/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets
/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded
Papildinato jautājumu kopa:  (23391, 2) FastText iekodētā datu kopa: (23391, 301)
BERT 1 dati: (23391, 257) BERT 2 dati: (23391, 257) BERT 3 dati: (23391, 769) BERT 4 dati: (23391, 769)


In [0]:
def transformResults(results):            # funkcija, kas pārvērš saņemtus rezultātus, izvēloties populārākās klases indeksu katrā masīva rindā
    mod_results = []
    for result in results:
        out = np.argmax(result)
        mod_results.append(out)
    mod_results = np.array(mod_results)  
    return mod_results 

In [0]:
indices      = range(3)
indices2     = range(3)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5 
results1 = []
header_row = ['Modelis', 'RandomUS_SMOTE', 'NearMiss_SMOTE', 'Cluster_SMOTE', 'RandomUS_BSMOTE', 'NearMiss_BSMOTE', 'Cluster_BSMOTE', 
              'RandomUS_SVMSMOTE', 'NearMiss_SVMSMOTE', 'Cluster_SVMSMOTE']
results1.append(header_row)

In [0]:
bert_big_data['labels'].value_counts()

1    23025
3      122
2      122
0      122
Name: labels, dtype: int64

<h3>300% OverSampling + UnderSampling uz 122 </h3>

In [0]:
class_size = round(122 / 5 * 4) # klases izmērs pēc sadalīšanas K-Foldos
size = 4*class_size             # katrai klasei veicam 300% jaunu piemēru izveidošanu
thisdict = {
  1: size,
  3: size,
  2: size,
  0: size
}
size

392

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print('Reprezentāciju modelis:',model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for k, undersampler in zip( indices2, (RandomUnderSampler(),
                                         NearMiss(),
                                         ClusterCentroids()                                       
                                          )):
    X_res, y_res = undersampler.fit_resample(features, target)                    # Veicam Unersampling
    us_name = undersampler.__class__.__name__
    print('  Pēc', us_name,' UnderSampling, klašu sadalījums:')
    unique, counts = np.unique(y_res, return_counts=True)
    print ('  ', np.asarray((unique, counts)))
    
    for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                         BorderlineSMOTE(sampling_strategy=thisdict),
                                         SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
      os_name = oversampler.__class__.__name__
      print('    ', os_name) 
      model_f1_scores = []
      fold_counter = 0
      for train_index,test_index in StratifiedKFold(n_split).split(X_res, y_res):     # 5-Fold validācija   
        x_train, x_test  = X_res[train_index], X_res[test_index]                      # Izveidojam jautājumu kopas sadalījumu
        y_train, y_test  = y_res[train_index], y_res[test_index]                      # Izveidojam kategorijas kopas sadalījumu
        
        X_res2, y_res2 = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

        model = models.Sequential()                                                   # Izveidojam Neironu tīkla modeli
        model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(4, activation='softmax'))
        model.compile(loss=['sparse_categorical_crossentropy'],
                      optimizer='adam',
                      metrics=['sparse_categorical_accuracy'])
        history = model.fit(X_res2, y_res2,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                            batch_size=batch_size,epochs=epochs,verbose=0)
        y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
        y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

        model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
        model_f1_scores.append(model_f1_avg)
        fold_counter = fold_counter + 1
        print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
  
      model_f1 = round(np.mean(model_f1_scores), 4)
      result_row.append(model_f1)
      print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all three OverSamplers and all three UnderSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_23_mixed_Under/OverSampling_results_300.csv')
result_data     

Reprezentāciju modelis: BERT small #2
  Pēc RandomUnderSampler  UnderSampling, klašu sadalījums:
   [[  0   1   2   3]
 [122 122 122 122]]
     SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.7731
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.7668
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.7511
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7376
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.6553
	Completed for: BERT small #2  with F1 score: 0.7368
     BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.7747
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.7858
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.7431
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7467
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.6944
	Completed for: BERT small #2  with F1 score: 0.7489
     SVMSMOTE
		Completed for model:  BERT small #2  

FileNotFoundError: ignored

In [0]:
result_data.to_csv('2020_05_23_mixed_Under_OverSampling_results_300.csv')
result_data

Unnamed: 0,Modelis,RandomUS_SMOTE,NearMiss_SMOTE,Cluster_SMOTE,RandomUS_BSMOTE,NearMiss_BSMOTE,Cluster_BSMOTE,RandomUS_SVMSMOTE,NearMiss_SVMSMOTE,Cluster_SVMSMOTE
0,BERT small #2,0.7368,0.7489,0.7198,0.7739,0.7375,0.7487,0.8126,0.7569,0.7772
1,BERT small,0.8319,0.8329,0.8256,0.7928,0.8089,0.8249,0.8935,0.875,0.8896
2,BERT multi cased,0.7896,0.7759,0.8179,0.8722,0.8393,0.8737,0.8853,0.8667,0.8469
3,BERT big,0.8721,0.8635,0.8623,0.8675,0.8641,0.8438,0.9239,0.9298,0.9179
4,FastText,0.8789,0.8758,0.8958,0.9363,0.7361,0.8941,0.9075,0.8601,0.9102


<h3>200% OverSampling + UnderSampling uz 122 </h3>

In [0]:
class_size = round(122 / 5 * 4) # klases izmērs pēc sadalīšanas K-Foldos
size = 3*class_size             # katrai klasei veicam 300% jaunu piemēru izveidošanu
thisdict = {
  1: size,
  3: size,
  2: size,
  0: size
}
size

294

In [0]:
indices      = range(3)
indices2     = range(3)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5 
results1 = []
header_row = ['Modelis', 'RandomUS_SMOTE', 'NearMiss_SMOTE', 'Cluster_SMOTE', 'RandomUS_BSMOTE', 'NearMiss_BSMOTE', 'Cluster_BSMOTE', 
              'RandomUS_SVMSMOTE', 'NearMiss_SVMSMOTE', 'Cluster_SVMSMOTE']
results1.append(header_row)

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print('Reprezentāciju modelis:',model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for k, undersampler in zip( indices2, (RandomUnderSampler(),
                                         NearMiss(),
                                         ClusterCentroids()                                       
                                          )):
    X_res, y_res = undersampler.fit_resample(features, target)                    # Veicam Unersampling
    us_name = undersampler.__class__.__name__
    print('  Pēc', us_name,' UnderSampling, klašu sadalījums:')
    unique, counts = np.unique(y_res, return_counts=True)
    print ('  ', np.asarray((unique, counts)))
    
    for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                         BorderlineSMOTE(sampling_strategy=thisdict),
                                         SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
      os_name = oversampler.__class__.__name__
      print('    ', os_name) 
      model_f1_scores = []
      fold_counter = 0
      for train_index,test_index in StratifiedKFold(n_split).split(X_res, y_res):     # 5-Fold validācija   
        x_train, x_test  = X_res[train_index], X_res[test_index]                      # Izveidojam jautājumu kopas sadalījumu
        y_train, y_test  = y_res[train_index], y_res[test_index]                      # Izveidojam kategorijas kopas sadalījumu
        
        X_res2, y_res2 = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

        model = models.Sequential()                                                   # Izveidojam Neironu tīkla modeli
        model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(4, activation='softmax'))
        model.compile(loss=['sparse_categorical_crossentropy'],
                      optimizer='adam',
                      metrics=['sparse_categorical_accuracy'])
        history = model.fit(X_res2, y_res2,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                            batch_size=batch_size,epochs=epochs,verbose=0)
        y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
        y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

        model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
        model_f1_scores.append(model_f1_avg)
        fold_counter = fold_counter + 1
        print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
  
      model_f1 = round(np.mean(model_f1_scores), 4)
      result_row.append(model_f1)
      print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all three OverSamplers and all three UnderSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_23_mixed_Under_OverSampling_results_200.csv')
result_data     

Reprezentāciju modelis: BERT small #2
  Pēc RandomUnderSampler  UnderSampling, klašu sadalījums:
   [[  0   1   2   3]
 [122 122 122 122]]
     SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.7211
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.7319
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6825
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.6813
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.6692
	Completed for: BERT small #2  with F1 score: 0.6972
     BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.7497
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.7643
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6906
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7554
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.6749
	Completed for: BERT small #2  with F1 score: 0.727
     SVMSMOTE
		Completed for model:  BERT small #2  K

Unnamed: 0,Modelis,RandomUS_SMOTE,NearMiss_SMOTE,Cluster_SMOTE,RandomUS_BSMOTE,NearMiss_BSMOTE,Cluster_BSMOTE,RandomUS_SVMSMOTE,NearMiss_SVMSMOTE,Cluster_SVMSMOTE
0,BERT small #2,0.6972,0.727,0.6952,0.7339,0.7628,0.747,0.7764,0.7288,0.7556
1,BERT small,0.8017,0.813,0.8236,0.813,0.8045,0.8183,0.8759,0.8803,0.8802
2,BERT multi cased,0.811,0.805,0.8068,0.8613,0.8723,0.8582,0.8804,0.8555,0.8628
3,BERT big,0.8711,0.8582,0.88,0.8636,0.8556,0.8836,0.9174,0.9299,0.9213
4,FastText,0.8847,0.8791,0.8777,0.9135,0.622,0.8994,0.8923,0.8328,0.9119


<h3>100% OverSampling + UnderSampling uz 122 </h3>

In [0]:
class_size = round(122 / 5 * 4) # klases izmērs pēc sadalīšanas K-Foldos
size = 2*class_size             # katrai klasei veicam 300% jaunu piemēru izveidošanu
thisdict = {
  1: size,
  3: size,
  2: size,
  0: size
}
size

196

In [0]:
indices      = range(3)
indices2     = range(3)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5 
results1 = []
header_row = ['Modelis', 'RandomUS_SMOTE', 'NearMiss_SMOTE', 'Cluster_SMOTE', 'RandomUS_BSMOTE', 'NearMiss_BSMOTE', 'Cluster_BSMOTE', 
              'RandomUS_SVMSMOTE', 'NearMiss_SVMSMOTE', 'Cluster_SVMSMOTE']
results1.append(header_row)

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print('Reprezentāciju modelis:',model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for k, undersampler in zip( indices2, (RandomUnderSampler(),
                                         NearMiss(),
                                         ClusterCentroids()                                       
                                          )):
    X_res, y_res = undersampler.fit_resample(features, target)                    # Veicam Unersampling
    us_name = undersampler.__class__.__name__
    print('  Pēc', us_name,' UnderSampling, klašu sadalījums:')
    unique, counts = np.unique(y_res, return_counts=True)
    print ('  ', np.asarray((unique, counts)))
    
    for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                         BorderlineSMOTE(sampling_strategy=thisdict),
                                         SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
      os_name = oversampler.__class__.__name__
      print('    ', os_name) 
      model_f1_scores = []
      fold_counter = 0
      for train_index,test_index in StratifiedKFold(n_split).split(X_res, y_res):     # 5-Fold validācija   
        x_train, x_test  = X_res[train_index], X_res[test_index]                      # Izveidojam jautājumu kopas sadalījumu
        y_train, y_test  = y_res[train_index], y_res[test_index]                      # Izveidojam kategorijas kopas sadalījumu
        
        X_res2, y_res2 = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

        model = models.Sequential()                                                   # Izveidojam Neironu tīkla modeli
        model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(4, activation='softmax'))
        model.compile(loss=['sparse_categorical_crossentropy'],
                      optimizer='adam',
                      metrics=['sparse_categorical_accuracy'])
        history = model.fit(X_res2, y_res2,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                            batch_size=batch_size,epochs=epochs,verbose=0)
        y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
        y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

        model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
        model_f1_scores.append(model_f1_avg)
        fold_counter = fold_counter + 1
        print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
  
      model_f1 = round(np.mean(model_f1_scores), 4)
      result_row.append(model_f1)
      print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all three OverSamplers and all three UnderSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_23_mixed_Under_OverSampling_results_100.csv')
result_data     

Reprezentāciju modelis: BERT small #2
  Pēc RandomUnderSampler  UnderSampling, klašu sadalījums:
   [[  0   1   2   3]
 [122 122 122 122]]
     SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.7127
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.7211
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.7778
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7308
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.5649
	Completed for: BERT small #2  with F1 score: 0.7015
     BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.74
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.7944
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.7546
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.6505
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.5848
	Completed for: BERT small #2  with F1 score: 0.7049
     SVMSMOTE
		Completed for model:  BERT small #2  K-

Unnamed: 0,Modelis,RandomUS_SMOTE,NearMiss_SMOTE,Cluster_SMOTE,RandomUS_BSMOTE,NearMiss_BSMOTE,Cluster_BSMOTE,RandomUS_SVMSMOTE,NearMiss_SVMSMOTE,Cluster_SVMSMOTE
0,BERT small #2,0.7015,0.7049,0.7178,0.7346,0.7258,0.7099,0.7379,0.691,0.7281
1,BERT small,0.8002,0.783,0.7826,0.842,0.8306,0.8299,0.8729,0.8448,0.8765
2,BERT multi cased,0.7683,0.791,0.7748,0.851,0.8407,0.8629,0.8461,0.8381,0.8216
3,BERT big,0.8298,0.8593,0.8734,0.8518,0.8692,0.8664,0.919,0.93,0.9317
4,FastText,0.8477,0.8711,0.8696,0.9007,0.6119,0.9096,0.8841,0.8449,0.8355


<h3>400% OverSampling + UnderSampling uz 122 </h3>

In [0]:
class_size = round(122 / 5 * 4) # klases izmērs pēc sadalīšanas K-Foldos
size = 5*class_size             # katrai klasei veicam 300% jaunu piemēru izveidošanu
thisdict = {
  1: size,
  3: size,
  2: size,
  0: size
}
size

490

In [0]:
indices      = range(3)
indices2     = range(3)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5 
results1 = []
header_row = ['Modelis', 'RandomUS_SMOTE', 'NearMiss_SMOTE', 'Cluster_SMOTE', 'RandomUS_BSMOTE', 'NearMiss_BSMOTE', 'Cluster_BSMOTE', 
              'RandomUS_SVMSMOTE', 'NearMiss_SVMSMOTE', 'Cluster_SVMSMOTE']
results1.append(header_row)

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print('Reprezentāciju modelis:',model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for k, undersampler in zip( indices2, (RandomUnderSampler(),
                                         NearMiss(),
                                         ClusterCentroids()                                       
                                          )):
    X_res, y_res = undersampler.fit_resample(features, target)                    # Veicam Unersampling
    us_name = undersampler.__class__.__name__
    print('  Pēc', us_name,' UnderSampling, klašu sadalījums:')
    unique, counts = np.unique(y_res, return_counts=True)
    print ('  ', np.asarray((unique, counts)))
    
    for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                         BorderlineSMOTE(sampling_strategy=thisdict),
                                         SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
      os_name = oversampler.__class__.__name__
      print('    ', os_name) 
      model_f1_scores = []
      fold_counter = 0
      for train_index,test_index in StratifiedKFold(n_split).split(X_res, y_res):     # 5-Fold validācija   
        x_train, x_test  = X_res[train_index], X_res[test_index]                      # Izveidojam jautājumu kopas sadalījumu
        y_train, y_test  = y_res[train_index], y_res[test_index]                      # Izveidojam kategorijas kopas sadalījumu
        
        X_res2, y_res2 = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

        model = models.Sequential()                                                   # Izveidojam Neironu tīkla modeli
        model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(4, activation='softmax'))
        model.compile(loss=['sparse_categorical_crossentropy'],
                      optimizer='adam',
                      metrics=['sparse_categorical_accuracy'])
        history = model.fit(X_res2, y_res2,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                            batch_size=batch_size,epochs=epochs,verbose=0)
        y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
        y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

        model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
        model_f1_scores.append(model_f1_avg)
        fold_counter = fold_counter + 1
        print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
  
      model_f1 = round(np.mean(model_f1_scores), 4)
      result_row.append(model_f1)
      print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all three OverSamplers and all three UnderSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_23_mixed_Under_OverSampling_results_400.csv')
result_data     

Reprezentāciju modelis: BERT small #2
  Pēc RandomUnderSampler  UnderSampling, klašu sadalījums:
   [[  0   1   2   3]
 [122 122 122 122]]
     SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.7629
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.6863
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.8097
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7588
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.7934
	Completed for: BERT small #2  with F1 score: 0.7622
     BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.796
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.6966
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.769
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7251
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.7381
	Completed for: BERT small #2  with F1 score: 0.745
     SVMSMOTE
		Completed for model:  BERT small #2  K-F

Unnamed: 0,Modelis,RandomUS_SMOTE,NearMiss_SMOTE,Cluster_SMOTE,RandomUS_BSMOTE,NearMiss_BSMOTE,Cluster_BSMOTE,RandomUS_SVMSMOTE,NearMiss_SVMSMOTE,Cluster_SVMSMOTE
0,BERT small #2,0.7622,0.745,0.7613,0.7687,0.7637,0.7391,0.8288,0.7374,0.7923
1,BERT small,0.8298,0.8506,0.8542,0.8224,0.8111,0.8477,0.8913,0.8828,0.8783
2,BERT multi cased,0.7963,0.7944,0.7869,0.8523,0.856,0.8682,0.8646,0.8754,0.8373
3,BERT big,0.8825,0.8705,0.8645,0.8615,0.8522,0.8658,0.9243,0.9262,0.9111
4,FastText,0.8893,0.8709,0.8958,0.9388,0.7696,0.8963,0.9088,0.8865,0.9133


<h3>300% OverSampling + UnderSampling uz 1000 </h3>

In [0]:
class_size = round(122 / 5 * 4) # klases izmērs pēc sadalīšanas K-Foldos
big_size   = round(1000 / 5 * 4)
size = 4*class_size             # katrai klasei veicam 300% jaunu piemēru izveidošanu
thisdict = {
  1: big_size,
  3: size,
  2: size,
  0: size
}
size, big_size

(392, 800)

In [0]:
bert_big_data['labels'].value_counts()

1    23025
3      122
2      122
0      122
Name: labels, dtype: int64

In [0]:
thisdict2 = {
  1: 1000,
  3: 122,
  2: 122,
  0: 122
}

In [0]:
indices      = range(3)
indices2     = range(3)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5 
results1 = []
header_row = ['Modelis', 'RandomUS_SMOTE', 'NearMiss_SMOTE', 'Cluster_SMOTE', 'RandomUS_BSMOTE', 'NearMiss_BSMOTE', 'Cluster_BSMOTE', 
              'RandomUS_SVMSMOTE', 'NearMiss_SVMSMOTE', 'Cluster_SVMSMOTE']
results1.append(header_row)

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print('Reprezentāciju modelis:',model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for k, undersampler in zip( indices2, (RandomUnderSampler(sampling_strategy=thisdict2),
                                         NearMiss(sampling_strategy=thisdict2),
                                         ClusterCentroids(sampling_strategy=thisdict2)                                       
                                          )):
    X_res, y_res = undersampler.fit_resample(features, target)                    # Veicam Unersampling
    us_name = undersampler.__class__.__name__
    print('  Pēc', us_name,' UnderSampling, klašu sadalījums:')
    unique, counts = np.unique(y_res, return_counts=True)
    print ('  ', np.asarray((unique, counts)))
    
    for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                         BorderlineSMOTE(sampling_strategy=thisdict),
                                         SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
      os_name = oversampler.__class__.__name__
      print('    ', os_name) 
      model_f1_scores = []
      fold_counter = 0
      for train_index,test_index in StratifiedKFold(n_split).split(X_res, y_res):     # 5-Fold validācija   
        x_train, x_test  = X_res[train_index], X_res[test_index]                      # Izveidojam jautājumu kopas sadalījumu
        y_train, y_test  = y_res[train_index], y_res[test_index]                      # Izveidojam kategorijas kopas sadalījumu
        
        X_res2, y_res2 = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

        model = models.Sequential()                                                   # Izveidojam Neironu tīkla modeli
        model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(4, activation='softmax'))
        model.compile(loss=['sparse_categorical_crossentropy'],
                      optimizer='adam',
                      metrics=['sparse_categorical_accuracy'])
        history = model.fit(X_res2, y_res2,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                            batch_size=batch_size,epochs=epochs,verbose=0)
        y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
        y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

        model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
        model_f1_scores.append(model_f1_avg)
        fold_counter = fold_counter + 1
        print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
  
      model_f1 = round(np.mean(model_f1_scores), 4)
      result_row.append(model_f1)
      print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all three OverSamplers and all three UnderSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_23_mixed_Under_OverSampling_results_300_5000.csv')
result_data     

Reprezentāciju modelis: BERT small #2
  Pēc RandomUnderSampler  UnderSampling, klašu sadalījums:
   [[   0    1    2    3]
 [ 122 1000  122  122]]
     SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.769
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.731
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.7836
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7702
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.7868
	Completed for: BERT small #2  with F1 score: 0.7681
     BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.7946
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.693
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.8238
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7439
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.7883
	Completed for: BERT small #2  with F1 score: 0.7687
     SVMSMOTE
		Completed for model:  BERT small