In [0]:
from google.colab import drive
# Mount Google Drive at /gdrive; the later cells %cd into '/gdrive/My Drive/...' to read data and write results.
drive.mount('/gdrive')

In [0]:
import warnings
import pandas as pd
import numpy as np
import datetime
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.preprocessing import text
from keras import models, layers

<h2>Nepapildinātie dati</h2>

In [0]:
warnings.filterwarnings('ignore')
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets'
fasttext_data         = pd.read_csv('2020_05_19_encoded_Fasttext_sakotnejie.csv', index_col=0)
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded'
bert_big_data         = pd.read_csv('Encoded_labeled_BERT_big.csv', index_col=0)
bert_multi_cased_data = pd.read_csv('Encoded_labeled_BERT_multi_cased.csv', index_col=0)
bert_small_2_data     = pd.read_csv('Encoded_labeled_BERT_small_other.csv', index_col=0)
bert_small_data       = pd.read_csv('Encoded_labeled_BERT_small.csv', index_col=0)
print('FastText iekodētā datu kopa:', fasttext_data.shape)
print('BERT 1 dati:', bert_small_data.shape, 'BERT 2 dati:',  bert_small_2_data.shape, 'BERT 3 dati:', bert_multi_cased_data.shape, 'BERT 4 dati:', bert_big_data.shape)

/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets
/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded
FastText iekodētā datu kopa: (23217, 301)
BERT 1 dati: (23217, 257) BERT 2 dati: (23217, 257) BERT 3 dati: (23217, 769) BERT 4 dati: (23217, 769)


In [0]:
def transformResults(results):
    """Convert per-row score vectors into class indices.

    For every element of ``results`` the position of its largest value
    (``np.argmax``) is selected.

    Args:
        results: iterable of score rows (e.g. the output of ``model.predict``).

    Returns:
        NumPy array holding one selected index per element of ``results``.
    """
    return np.array([np.argmax(row) for row in results])

In [0]:
indices       = range(4)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'Vidējais F1']
results1.append(header_row)
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Rezultāti'

/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Rezultāti


In [0]:
# Cross-validate a small dense classifier on every encoded dataset and record
# the mean macro-F1 per encoder.
exec_start = datetime.datetime.now()

# Drop rows left over from a previous run of this cell (the header row stays),
# so re-running the cell does not duplicate results.
del results1[1:]

# Pair each dataset with its name directly instead of indexing through the shared
# `counter` variable: re-running this cell then no longer raises IndexError
# when the configuration cell was not re-executed first.
for model_name, dataset in zip(model_names, dataset_list):
  print(model_name)
  col_count = len(dataset.columns)                        # total column count; the last column holds the label
  features = dataset.iloc[:, :col_count-1].to_numpy()     # question representations
  # Take the label column as a 1-D array (not an (n, 1) column) - the shape
  # scikit-learn expects for `y` in StratifiedKFold.split and f1_score.
  # Assumes labels are integers 0..3 to match the 4-unit softmax - TODO confirm.
  target = dataset.iloc[:, col_count-1].to_numpy()        # question categories

  model_f1_scores = []
  # NOTE(review): StratifiedKFold is used without shuffling, so folds follow the
  # row order of the CSV - confirm this is intended.
  folds = StratifiedKFold(n_split).split(features, target)
  for fold_counter, (train_index, test_index) in enumerate(folds, start=1):
    x_train, x_test = features[train_index], features[test_index]   # split of the question representations
    y_train, y_test = target[train_index], target[test_index]       # split of the categories

    # A fresh network is built for every fold so no weights leak between folds.
    model = models.Sequential()
    model.add(layers.Dense(256, activation='relu', input_shape=(features.shape[1],)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(loss=['sparse_categorical_crossentropy'],
                    optimizer='adam',
                    metrics=['sparse_categorical_accuracy'])
    history = model.fit(x_train, y_train,                  # train on the training part of the fold
                        batch_size=batch_size,epochs=epochs,verbose=0)
    y_pred = model.predict(x_test)                         # class probabilities for the held-out part
    y_pred2 = transformResults(y_pred)                     # probabilities -> predicted class indices

    # Keep the unrounded fold score for averaging; round only for display so
    # rounding error does not accumulate in the mean.
    model_f1_avg = f1_score(y_test, y_pred2, average='macro')
    model_f1_scores.append(model_f1_avg)
    print('\t\tCompleted for model: ',model_name,' K-Fold #', fold_counter, 'F1 score:', round(model_f1_avg, 4))

  # After K-Fold validation: mean macro-F1 over all folds of this dataset.
  model_f1 = round(np.mean(model_f1_scores), 4)
  print('\tCompleted for:', model_name,' with F1 score:',model_f1)
  results1.append([model_name, model_f1])

# At the very end - record the execution time.
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Save all results (written relative to the current working directory).
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_23_BERT_FastText_pamatdati.csv')
result_data

BERT small #2
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.4483
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.286
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.2875
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.269
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.39
	Completed for: BERT small #2  with F1 score: 0.3362
BERT small
		Completed for model:  BERT small  K-Fold # 1 F1 score: 0.4384
		Completed for model:  BERT small  K-Fold # 2 F1 score: 0.2893
		Completed for model:  BERT small  K-Fold # 3 F1 score: 0.3347
		Completed for model:  BERT small  K-Fold # 4 F1 score: 0.4604
		Completed for model:  BERT small  K-Fold # 5 F1 score: 0.3741
	Completed for: BERT small  with F1 score: 0.3794
BERT multi cased
		Completed for model:  BERT multi cased  K-Fold # 1 F1 score: 0.3156
		Completed for model:  BERT multi cased  K-Fold # 2 F1 score: 0.3204
		Completed for model:  BERT multi cased  K-Fold # 3 F1 score: 0

Unnamed: 0,Modelis,Vidējais F1
0,BERT small #2,0.3362
1,BERT small,0.3794
2,BERT multi cased,0.3664
3,BERT big,0.4267
4,FastText,0.399


<h2>Papildinātie dati</h2>

In [0]:
warnings.filterwarnings('ignore')
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets'
fasttext_data         = pd.read_csv('2020_05_09_encoded_Fasttext.csv', index_col=0)
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded'
bert_big_data         = pd.read_csv('2020_05_10_encoded_BERT_big.csv', index_col=0)
bert_multi_cased_data = pd.read_csv('2020_05_10_encoded_BERT_multi_cased.csv', index_col=0)
bert_small_2_data     = pd.read_csv('2020_05_10_encoded_BERT_small_2.csv', index_col=0)
bert_small_data       = pd.read_csv('2020_05_10_encoded_BERT_small.csv', index_col=0)
print('FastText iekodētā datu kopa:', fasttext_data.shape)
print('BERT 1 dati:', bert_small_data.shape, 'BERT 2 dati:',  bert_small_2_data.shape, 'BERT 3 dati:', bert_multi_cased_data.shape, 'BERT 4 dati:', bert_big_data.shape)

/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets
/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Datasets/BERT_encoded
FastText iekodētā datu kopa: (23391, 301)
BERT 1 dati: (23391, 257) BERT 2 dati: (23391, 257) BERT 3 dati: (23391, 769) BERT 4 dati: (23391, 769)


In [0]:
indices       = range(4)
dataset_list = [bert_small_2_data, bert_small_data, bert_multi_cased_data, bert_big_data, fasttext_data]
model_names  = ['BERT small #2', 'BERT small', 'BERT multi cased', 'BERT big', 'FastText']
n_split = 5
counter = 0
batch_size = 64
epochs = 5  
results1 = []
header_row = ['Modelis', 'Vidējais F1']
results1.append(header_row)
%cd '/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Rezultāti'

/gdrive/My Drive/LU/LU DF/8. semestris/Bakalaura Darbs/Rezultāti


In [0]:
# Cross-validate a small dense classifier on every encoded (augmented) dataset
# and record the mean macro-F1 per encoder.
exec_start = datetime.datetime.now()

# Drop rows left over from a previous run of this cell (the header row stays),
# so re-running the cell does not duplicate results.
del results1[1:]

# Pair each dataset with its name directly instead of indexing through the shared
# `counter` variable: re-running this cell then no longer raises IndexError
# when the configuration cell was not re-executed first.
for model_name, dataset in zip(model_names, dataset_list):
  print(model_name)
  col_count = len(dataset.columns)                        # total column count; the last column holds the label
  features = dataset.iloc[:, :col_count-1].to_numpy()     # question representations
  # Take the label column as a 1-D array (not an (n, 1) column) - the shape
  # scikit-learn expects for `y` in StratifiedKFold.split and f1_score.
  # Assumes labels are integers 0..3 to match the 4-unit softmax - TODO confirm.
  target = dataset.iloc[:, col_count-1].to_numpy()        # question categories

  model_f1_scores = []
  # NOTE(review): StratifiedKFold is used without shuffling, so folds follow the
  # row order of the CSV - confirm this is intended.
  folds = StratifiedKFold(n_split).split(features, target)
  for fold_counter, (train_index, test_index) in enumerate(folds, start=1):
    x_train, x_test = features[train_index], features[test_index]   # split of the question representations
    y_train, y_test = target[train_index], target[test_index]       # split of the categories

    # A fresh network is built for every fold so no weights leak between folds.
    model = models.Sequential()
    model.add(layers.Dense(256, activation='relu', input_shape=(features.shape[1],)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(loss=['sparse_categorical_crossentropy'],
                    optimizer='adam',
                    metrics=['sparse_categorical_accuracy'])
    history = model.fit(x_train, y_train,                  # train on the training part of the fold
                        batch_size=batch_size,epochs=epochs,verbose=0)
    y_pred = model.predict(x_test)                         # class probabilities for the held-out part
    y_pred2 = transformResults(y_pred)                     # probabilities -> predicted class indices

    # Keep the unrounded fold score for averaging; round only for display so
    # rounding error does not accumulate in the mean.
    model_f1_avg = f1_score(y_test, y_pred2, average='macro')
    model_f1_scores.append(model_f1_avg)
    print('\t\tCompleted for model: ',model_name,' K-Fold #', fold_counter, 'F1 score:', round(model_f1_avg, 4))

  # After K-Fold validation: mean macro-F1 over all folds of this dataset.
  model_f1 = round(np.mean(model_f1_scores), 4)
  print('\tCompleted for:', model_name,' with F1 score:',model_f1)
  results1.append([model_name, model_f1])

# At the very end - record the execution time.
exec_end = datetime.datetime.now()
exec_elapsed = exec_end - exec_start
print('Total time elapsed:', exec_elapsed)

# Save all results (written relative to the current working directory).
result_data = pd.DataFrame(results1[1:], columns=results1[0])
result_data.to_csv('2020_05_23_BERT_FastText_papildinatie.csv')
result_data

BERT small #2
		Completed for model:  BERT small #2  K-Fold # 1 F1 score: 0.4159
		Completed for model:  BERT small #2  K-Fold # 2 F1 score: 0.3381
		Completed for model:  BERT small #2  K-Fold # 3 F1 score: 0.6139
		Completed for model:  BERT small #2  K-Fold # 4 F1 score: 0.5601
		Completed for model:  BERT small #2  K-Fold # 5 F1 score: 0.4198
	Completed for: BERT small #2  with F1 score: 0.4696
BERT small
		Completed for model:  BERT small  K-Fold # 1 F1 score: 0.4739
		Completed for model:  BERT small  K-Fold # 2 F1 score: 0.5492
		Completed for model:  BERT small  K-Fold # 3 F1 score: 0.6524
		Completed for model:  BERT small  K-Fold # 4 F1 score: 0.7861
		Completed for model:  BERT small  K-Fold # 5 F1 score: 0.6273
	Completed for: BERT small  with F1 score: 0.6178
BERT multi cased
		Completed for model:  BERT multi cased  K-Fold # 1 F1 score: 0.5337
		Completed for model:  BERT multi cased  K-Fold # 2 F1 score: 0.4634
		Completed for model:  BERT multi cased  K-Fold # 3 F1 scor

Unnamed: 0,Modelis,Vidējais F1
0,BERT small #2,0.4696
1,BERT small,0.6178
2,BERT multi cased,0.6053
3,BERT big,0.6579
4,FastText,0.6789
