In [0]:
from google.colab import drive
drive.mount('/gdrive')

In [0]:
import warnings
import pandas as pd
import numpy as np
import datetime
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.preprocessing import text
from keras import models, layers
from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE, ADASYN

In [0]:
warnings.filterwarnings('ignore')
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets'
fasttext_data         = pd.read_csv('2020_05_09_encoded_Fasttext.csv', index_col=0)
base_data             = pd.read_csv('papildinatie_jautajumi_cleaned.csv', index_col=0)
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded'
bert_big_data         = pd.read_csv('2020_05_10_encoded_BERT_big.csv', index_col=0)
bert_multi_cased_data = pd.read_csv('2020_05_10_encoded_BERT_multi_cased.csv', index_col=0)
bert_small_2_data     = pd.read_csv('2020_05_10_encoded_BERT_small_2.csv', index_col=0)
bert_small_data       = pd.read_csv('2020_05_10_encoded_BERT_small.csv', index_col=0)
print('Papildinato jautājumu kopa: ', base_data.shape, 'FastText iekodētā datu kopa:', fasttext_data.shape)
print('BERT 1 dati:', bert_small_data.shape, 'BERT 2 dati:',  bert_small_2_data.shape, 'BERT 3 dati:', bert_multi_cased_data.shape, 'BERT 4 dati:', bert_big_data.shape)

/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets
/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded
Papildinato jautājumu kopa:  (23391, 2) FastText iekodētā datu kopa: (23391, 301)
BERT 1 dati: (23391, 257) BERT 2 dati: (23391, 257) BERT 3 dati: (23391, 769) BERT 4 dati: (23391, 769)


In [0]:
def transformResults(results):            # funkcija, kas pārvērš saņemtus rezultātus, izvēloties populārākās klases indeksu katrā masīva rindā
    mod_results = []
    for result in results:
        out = np.argmax(result)
        mod_results.append(out)
    mod_results = np.array(mod_results)  
    return mod_results 

<h3>100% OverSampling</h3>

In [0]:
indices       = range(4)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'SMOTE', 'SVMSMOTE', 'BorderlineSMOTE', 'ADASYN']
results1.append(header_row)

In [0]:
class_size = round(122 / 5 * 4)
size = 2*class_size
thisdict = {
  1: 18420,
  3: size,
  2: size,
  0: size
}
size

196

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print(model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                       BorderlineSMOTE(sampling_strategy=thisdict),
                                       SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
    os_name = oversampler.__class__.__name__
    print('  ', os_name)
    model_f1_scores = []
    fold_counter = 0
    for train_index,test_index in StratifiedKFold(n_split).split(features, target): # 5-Fold validācija 
      x_train, x_test  = features[train_index], features[test_index]                 # Izveidojam jautājumu kopas sadalījumu
      y_train, y_test  = target[train_index], target[test_index]                     # Izveidojam kategorijas kopas sadalījumu
        
      X_res, y_res = oversampler.fit_resample(x_train, y_train)                     # Veicam Oversampling
      
      model = models.Sequential()                                                 # Izveidojam Neironu tīkla modeli
      model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
      model.add(layers.Dense(64, activation='relu'))
      model.add(layers.Dense(4, activation='softmax'))
      model.compile(loss=['sparse_categorical_crossentropy'],
                    optimizer='adam',
                    metrics=['sparse_categorical_accuracy'])
      history = model.fit(X_res, y_res,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                          batch_size=batch_size,epochs=epochs,verbose=0)
      y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
      y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

      model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
      model_f1_scores.append(model_f1_avg)
      fold_counter = fold_counter + 1
      print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
    # After K-Fold validation
    model_f1 = round(np.mean(model_f1_scores), 4)
    result_row.append(model_f1)
    print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all four OverSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_20_OverSampling_results_1.csv')
result_data     

BERT small #2
   SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.4984
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4475
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6605
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7027
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.5317
	Completed for: BERT small #2  with F1 score: 0.5682
   BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.494
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4482
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6602
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.5879
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.4756
	Completed for: BERT small #2  with F1 score: 0.5332
   SVMSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.372
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4152
		Completed for model:  BERT small #2 

ValueError: ignored

In [0]:
header_row = ['Modelis', 'SMOTE', 'SVMSMOTE', 'BorderlineSMOTE']
result_data = pd.DataFrame(results1[1:], columns=header_row)
result_data.to_csv('2020_05_20_OverSampling_results_1.csv')
result_data 

Unnamed: 0,Modelis,SMOTE,SVMSMOTE,BorderlineSMOTE
0,BERT small #2,0.5682,0.5332,0.5227
1,BERT small,0.6254,0.6265,0.5799
2,BERT multi cased,0.603,0.5629,0.6001
3,BERT big,0.6447,0.6674,0.672
4,FastText,0.6787,0.6934,0.6689


<h3>200% OverSampling</h3>

In [0]:
indices       = range(4)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'SMOTE', 'SVMSMOTE', 'BorderlineSMOTE']
results1.append(header_row)

In [0]:
class_size = round(122 / 5 * 4)
size = 3*class_size
thisdict = {
  1: 18420,
  3: size,
  2: size,
  0: size
}
size

294

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print(model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                       BorderlineSMOTE(sampling_strategy=thisdict),
                                       SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
    os_name = oversampler.__class__.__name__
    print('  ', os_name)
    model_f1_scores = []
    fold_counter = 0
    for train_index,test_index in StratifiedKFold(n_split).split(features, target): # 5-Fold validācija 
      x_train, x_test  = features[train_index], features[test_index]                 # Izveidojam jautājumu kopas sadalījumu
      y_train, y_test  = target[train_index], target[test_index]                     # Izveidojam kategorijas kopas sadalījumu
        
      X_res, y_res = oversampler.fit_resample(x_train, y_train)                     # Veicam Oversampling
      
      model = models.Sequential()                                                 # Izveidojam Neironu tīkla modeli
      model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
      model.add(layers.Dense(64, activation='relu'))
      model.add(layers.Dense(4, activation='softmax'))
      model.compile(loss=['sparse_categorical_crossentropy'],
                    optimizer='adam',
                    metrics=['sparse_categorical_accuracy'])
      history = model.fit(X_res, y_res,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                          batch_size=batch_size,epochs=epochs,verbose=0)
      y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
      y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

      model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
      model_f1_scores.append(model_f1_avg)
      fold_counter = fold_counter + 1
      print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
    # After K-Fold validation
    model_f1 = round(np.mean(model_f1_scores), 4)
    result_row.append(model_f1)
    print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all four OverSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_20_OverSampling_results_200.csv')
result_data     

BERT small #2
   SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.5071
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4975
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6211
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.6907
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.4552
	Completed for: BERT small #2  with F1 score: 0.5543
   BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.475
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4852
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6822
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.6683
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.4496
	Completed for: BERT small #2  with F1 score: 0.5521
   SVMSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.4951
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4799
		Completed for model:  BERT small #2

Unnamed: 0,Modelis,SMOTE,SVMSMOTE,BorderlineSMOTE
0,BERT small #2,0.5543,0.5521,0.5631
1,BERT small,0.6142,0.6284,0.6054
2,BERT multi cased,0.6106,0.5903,0.6358
3,BERT big,0.6696,0.6517,0.6401
4,FastText,0.6566,0.6794,0.695


<h3>300% OverSampling</h3>

In [0]:
indices       = range(4)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'SMOTE', 'SVMSMOTE', 'BorderlineSMOTE']
results1.append(header_row)

In [0]:
class_size = round(122 / 5 * 4)
size = 4*class_size
thisdict = {
  1: 18420,
  3: size,
  2: size,
  0: size
}
size

392

In [0]:
exec_start = datetime.datetime.now()
for dataset in dataset_list:
  print(model_names[counter])
  result_row = []
  result_row.append(model_names[counter])
  col_count = len(dataset.columns)                                                # Veicam datu kopas dimensionalitātes saglabāšanu
  features = dataset.iloc[:,:col_count-1]                                         # Jautājumu reprezentācijas
  target = dataset.iloc[:,col_count-1:]                                           # Jautājumu kategorijas
  features = features.to_numpy()                                                  # Tipu maiņa ērtākai tālākai darbībai
  target = target.to_numpy()

  for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                       BorderlineSMOTE(sampling_strategy=thisdict),
                                       SVMSMOTE(sampling_strategy=thisdict)                                     
                                        )):
    os_name = oversampler.__class__.__name__
    print('  ', os_name)
    model_f1_scores = []
    fold_counter = 0
    for train_index,test_index in StratifiedKFold(n_split).split(features, target): # 5-Fold validācija 
      x_train, x_test  = features[train_index], features[test_index]                 # Izveidojam jautājumu kopas sadalījumu
      y_train, y_test  = target[train_index], target[test_index]                     # Izveidojam kategorijas kopas sadalījumu
        
      X_res, y_res = oversampler.fit_resample(x_train, y_train)                     # Veicam Oversampling
      
      model = models.Sequential()                                                 # Izveidojam Neironu tīkla modeli
      model.add(layers.Dense(256, activation='relu', input_shape=(col_count-1,)))
      model.add(layers.Dense(64, activation='relu'))
      model.add(layers.Dense(4, activation='softmax'))
      model.compile(loss=['sparse_categorical_crossentropy'],
                    optimizer='adam',
                    metrics=['sparse_categorical_accuracy'])
      history = model.fit(X_res, y_res,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                          batch_size=batch_size,epochs=epochs,verbose=0)
      y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
      y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

      model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
      model_f1_scores.append(model_f1_avg)
      fold_counter = fold_counter + 1
      print('\t\tCompleted for model: ',model_names[counter],' K-Fold #', fold_counter, 'F1 score:', model_f1_avg)
    # After K-Fold validation
    model_f1 = round(np.mean(model_f1_scores), 4)
    result_row.append(model_f1)
    print('\tCompleted for:', model_names[counter],' with F1 score:',model_f1)  
  #After all four OverSamplers
  results1.append(result_row)
  print('Completed for ', model_names[counter], ' with results=', result_row)
  counter = counter + 1

#In the very end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_20_OverSampling_results_200.csv')
result_data     

BERT small #2
   SMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.4196
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4777
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6282
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.7165
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.5443
	Completed for: BERT small #2  with F1 score: 0.5573
   BorderlineSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.4659
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.4894
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6894
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.6551
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.4392
	Completed for: BERT small #2  with F1 score: 0.5478
   SVMSMOTE
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.5173
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.5354
		Completed for model:  BERT small #

Unnamed: 0,Modelis,SMOTE,SVMSMOTE,BorderlineSMOTE
0,BERT small #2,0.5573,0.5478,0.5949
1,BERT small,0.609,0.6076,0.6214
2,BERT multi cased,0.6223,0.612,0.6208
3,BERT big,0.6504,0.6686,0.6429
4,FastText,0.6783,0.6887,0.6994


<h3>Bāzlīnija 100% 

In [0]:
col_count = len(base_data.columns)
features = base_data['Question'].to_numpy().astype('str')
target = base_data['labels']

In [0]:
max_words = 2000
# Text pre-processing
tokenize = text.Tokenizer(num_words=max_words, char_level=False)                # izveidojam Tokenizer ar noteikto vārdnīcas izmēru
tokenize.fit_on_texts(features)                                                 # apmācām Tokenizer uz mūsu datiem
X = tokenize.texts_to_matrix(features)                                          # pārveidojam treniņdatus par tekstvienību vektoriem
print('Tokenizer done:', max_words, X.shape)

Tokenizer done: 2000 (23391, 2000)


In [0]:
indices       = range(2)
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'SMOTE', 'BorderlineSMOTE']
results1.append(header_row)

In [0]:
class_size = round(122 / 5 * 4)
size = 2*class_size
thisdict = {
  1: 18420,
  3: size,
  2: size,
  0: size
}
size

196

In [0]:
exec_start = datetime.datetime.now()
keras_row = ['Keras Baseline']
naive_row = ['Naive Bayes']
for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                       BorderlineSMOTE(sampling_strategy=thisdict)                                    
                                        )):
  os_name = oversampler.__class__.__name__
  print(os_name)
  f1_scores_naive = []                                                          # Creating an empty array of f1 scores 
  f1_scores_keras = []
  fold_counter = 0
  for train_index,test_index in StratifiedKFold(n_split).split(X, target):      # 5-Fold validācija 
    x_train, x_test  = X[train_index], X[test_index]                            # Izveidojam jautājumu kopas sadalījumu
    y_train, y_test  = target[train_index], target[test_index]                  # Izveidojam kategorijas kopas sadalījumu
        
    X_res, y_res = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

    ### Naive Bayes ###
    naive_bayes = MultinomialNB()                                               # Izveidojam Naive Bayes modeli
    naive_bayes.fit(X_res, y_res)                                               # Trenējam modeli uz treniņdatiem
    naive_y_pred = naive_bayes.predict(x_test)
    naive_f1_avg = round(f1_score(y_test, naive_y_pred, average='macro'),4)     # Fiksējam Naive Bayes f1 vidējo rezultātu
    f1_scores_naive.append(naive_f1_avg)

    ### Keras Sequential Baseline ###  
    model = models.Sequential()                                                 # Izveidojam Neironu tīkla modeli
    model.add(layers.Dense(256, activation='relu', input_shape=(max_words,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(loss=['sparse_categorical_crossentropy'],
                  optimizer='adam',
                  metrics=['sparse_categorical_accuracy'])
    history = model.fit(X_res, y_res,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                        batch_size=batch_size,epochs=epochs,verbose=0)
    y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
    y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

    model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
    f1_scores_keras.append(model_f1_avg)
    fold_counter = fold_counter + 1
    print('\tCompleted for:', os_name,' K-Fold #',counter, ' Naive Bayes score:',naive_f1_avg, ' Keras score:',model_f1_avg)
  # After K-Fold validation
  f1_avg_naive = round(np.mean(f1_scores_naive), 4)
  f1_avg_keras = round(np.mean(f1_scores_keras),4)
  keras_row.append(f1_avg_keras)
  naive_row.append(f1_avg_naive)

#After all two OverSamplers
results1.append(keras_row)
results1.append(naive_row)
#In the end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_22_OverSampling_base_results_100.csv')
result_data     

SMOTE
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5596  Keras score: 0.614
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5086  Keras score: 0.5088
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.6199  Keras score: 0.7129
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5911  Keras score: 0.7674
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5527  Keras score: 0.6567
BorderlineSMOTE
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5442  Keras score: 0.5686
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5028  Keras score: 0.5396
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.623  Keras score: 0.7305
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.6228  Keras score: 0.7791
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5431  Keras score: 0.6688
Total time elapsed: 0:05:36.934052


Unnamed: 0,Modelis,SMOTE,BorderlineSMOTE
0,Keras Baseline,0.652,0.6573
1,Naive Bayes,0.5664,0.5672


<h3>Bāzlīnijas 200% </h3>

In [0]:
indices       = range(2)
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'SMOTE', 'BorderlineSMOTE']
results1.append(header_row)

In [0]:
class_size = round(122 / 5 * 4)
size = 3*class_size
thisdict = {
  1: 18420,
  3: size,
  2: size,
  0: size
}
size

294

In [0]:
exec_start = datetime.datetime.now()
keras_row = ['Keras Baseline']
naive_row = ['Naive Bayes']
for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                       BorderlineSMOTE(sampling_strategy=thisdict)                                    
                                        )):
  os_name = oversampler.__class__.__name__
  print(os_name)
  f1_scores_naive = []                                                          # Creating an empty array of f1 scores 
  f1_scores_keras = []
  fold_counter = 0
  for train_index,test_index in StratifiedKFold(n_split).split(X, target):      # 5-Fold validācija 
    x_train, x_test  = X[train_index], X[test_index]                            # Izveidojam jautājumu kopas sadalījumu
    y_train, y_test  = target[train_index], target[test_index]                  # Izveidojam kategorijas kopas sadalījumu
        
    X_res, y_res = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

    ### Naive Bayes ###
    naive_bayes = MultinomialNB()                                               # Izveidojam Naive Bayes modeli
    naive_bayes.fit(X_res, y_res)                                               # Trenējam modeli uz treniņdatiem
    naive_y_pred = naive_bayes.predict(x_test)
    naive_f1_avg = round(f1_score(y_test, naive_y_pred, average='macro'),4)     # Fiksējam Naive Bayes f1 vidējo rezultātu
    f1_scores_naive.append(naive_f1_avg)

    ### Keras Sequential Baseline ###  
    model = models.Sequential()                                                 # Izveidojam Neironu tīkla modeli
    model.add(layers.Dense(256, activation='relu', input_shape=(max_words,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(loss=['sparse_categorical_crossentropy'],
                  optimizer='adam',
                  metrics=['sparse_categorical_accuracy'])
    history = model.fit(X_res, y_res,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                        batch_size=batch_size,epochs=epochs,verbose=0)
    y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
    y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

    model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
    f1_scores_keras.append(model_f1_avg)
    fold_counter = fold_counter + 1
    print('\tCompleted for:', os_name,' K-Fold #',counter, ' Naive Bayes score:',naive_f1_avg, ' Keras score:',model_f1_avg)
  # After K-Fold validation
  f1_avg_naive = round(np.mean(f1_scores_naive), 4)
  f1_avg_keras = round(np.mean(f1_scores_keras),4)
  keras_row.append(f1_avg_keras)
  naive_row.append(f1_avg_naive)

#After all two OverSamplers
results1.append(keras_row)
results1.append(naive_row)
#In the end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_22_OverSampling_base_results_200.csv')
result_data     

SMOTE
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5284  Keras score: 0.5727
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5326  Keras score: 0.4895
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.6275  Keras score: 0.6877
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.6014  Keras score: 0.7772
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5367  Keras score: 0.6739
BorderlineSMOTE
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5486  Keras score: 0.5849
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5247  Keras score: 0.5048
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.6244  Keras score: 0.7208
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.626  Keras score: 0.7633
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5778  Keras score: 0.696
Total time elapsed: 0:05:41.311675


Unnamed: 0,Modelis,SMOTE,BorderlineSMOTE
0,Keras Baseline,0.6402,0.654
1,Naive Bayes,0.5653,0.5803


<h3>Bāzlīnijas 300% </h3>

In [0]:
indices       = range(2)
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'SMOTE', 'BorderlineSMOTE']
results1.append(header_row)

In [0]:
class_size = round(122 / 5 * 4)
size = 4*class_size
thisdict = {
  1: 18420,
  3: size,
  2: size,
  0: size
}
size

392

In [0]:
exec_start = datetime.datetime.now()
keras_row = ['Keras Baseline']
naive_row = ['Naive Bayes']
for i, oversampler in zip( indices, (SMOTE(sampling_strategy=thisdict),
                                       BorderlineSMOTE(sampling_strategy=thisdict)                                    
                                        )):
  os_name = oversampler.__class__.__name__
  print(os_name)
  f1_scores_naive = []                                                          # Creating an empty array of f1 scores 
  f1_scores_keras = []
  fold_counter = 0
  for train_index,test_index in StratifiedKFold(n_split).split(X, target):      # 5-Fold validācija 
    x_train, x_test  = X[train_index], X[test_index]                            # Izveidojam jautājumu kopas sadalījumu
    y_train, y_test  = target[train_index], target[test_index]                  # Izveidojam kategorijas kopas sadalījumu
        
    X_res, y_res = oversampler.fit_resample(x_train, y_train)                   # Veicam Oversampling

    ### Naive Bayes ###
    naive_bayes = MultinomialNB()                                               # Izveidojam Naive Bayes modeli
    naive_bayes.fit(X_res, y_res)                                               # Trenējam modeli uz treniņdatiem
    naive_y_pred = naive_bayes.predict(x_test)
    naive_f1_avg = round(f1_score(y_test, naive_y_pred, average='macro'),4)     # Fiksējam Naive Bayes f1 vidējo rezultātu
    f1_scores_naive.append(naive_f1_avg)

    ### Keras Sequential Baseline ###  
    model = models.Sequential()                                                 # Izveidojam Neironu tīkla modeli
    model.add(layers.Dense(256, activation='relu', input_shape=(max_words,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(loss=['sparse_categorical_crossentropy'],
                  optimizer='adam',
                  metrics=['sparse_categorical_accuracy'])
    history = model.fit(X_res, y_res,                                         # Veicam modeļa trenēšanu, izmantojot kombinēto datu kopu
                        batch_size=batch_size,epochs=epochs,verbose=0)
    y_pred = model.predict(x_test)                                              # Prasam modelim paredzēt testa datiem kategorijas
    y_pred2 = transformResults(y_pred)                                          # Pārveidojam rezultātus

    model_f1_avg = round(f1_score(y_test, y_pred2, average='macro'),4)          # Fiksējam f1 vidējo rezultātu kārtējā K-Foldā
    f1_scores_keras.append(model_f1_avg)
    fold_counter = fold_counter + 1
    print('\tCompleted for:', os_name,' K-Fold #',counter, ' Naive Bayes score:',naive_f1_avg, ' Keras score:',model_f1_avg)
  # After K-Fold validation
  f1_avg_naive = round(np.mean(f1_scores_naive), 4)
  f1_avg_keras = round(np.mean(f1_scores_keras),4)
  keras_row.append(f1_avg_keras)
  naive_row.append(f1_avg_naive)

#After all two OverSamplers
results1.append(keras_row)
results1.append(naive_row)
#In the end - fix the execution time
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Saglabājam visus rezultātus
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_22_OverSampling_base_results_300.csv')
result_data     

SMOTE
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5443  Keras score: 0.6145
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5449  Keras score: 0.5032
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.6316  Keras score: 0.7491
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.596  Keras score: 0.7438
	Completed for: SMOTE  K-Fold # 0  Naive Bayes score: 0.5416  Keras score: 0.6869
BorderlineSMOTE
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5377  Keras score: 0.5822
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.529  Keras score: 0.5241
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.6146  Keras score: 0.7251
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.6008  Keras score: 0.7687
	Completed for: BorderlineSMOTE  K-Fold # 0  Naive Bayes score: 0.5692  Keras score: 0.6838
Total time elapsed: 0:05:45.111141


Unnamed: 0,Modelis,SMOTE,BorderlineSMOTE
0,Keras Baseline,0.6595,0.6568
1,Naive Bayes,0.5717,0.5703
