In [13]:
import os
from copy import deepcopy

import pandas as pd
import torch
from skorch.dataset import ValidSplit
from sklearn.base import clone
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

from src import Utils, LSTM

In [14]:
# Root folder under which this notebook writes checkpoints, saved models, and CSV results.
MODEL_FOLDER = 'model_lstm'

In [15]:
# Reproducibility setup for CUDA runs.
#
# With CUDA >= 10.2, PyTorch raises a RuntimeError from cuBLAS-backed operations in
# deterministic mode unless CUBLAS_WORKSPACE_CONFIG is ':4096:8' or ':16:8'.
# The variable is set first, with plain Python rather than the `%env` magic, so the
# cell also works outside IPython.
#
# NOTE: PyTorch inspects the variable only once per process. If CUDA work already ran
# in this kernel before the variable was set, restart the kernel and run all cells
# from the top; re-running this cell alone is not enough.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
torch.use_deterministic_algorithms(True)

In [16]:
# Load the full dataset through the project helper, then display it.
dataset = Utils.read_csv_file('datasets/datasetall.csv')
dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [17]:
# Seed the random number generators (project helper) before the data is split.
Utils.seed_random_number_generators()

Random number generators seeded.


In [18]:
# Test-set size handed to the split helper
# (presumably a fraction of the dataset — confirm in Utils.get_train_test_split).
TEST_SIZE = 0.1

# Split the dataset into train/test features and labels.
X_train, X_test, y_train, y_test = Utils.get_train_test_split(dataset, TEST_SIZE)

In [19]:
# Inspect the training texts.
X_train

0        pag hindi nanalo si Norberto Gonzales pwede ba...
1        Ngayon lang ako super proud sa PRESIDENTE na i...
2        JUST SAW SOMEONE CALL BBM BLENGBLONG HAHAHAHAH...
3        Rep. Binay on her leadership style: I am very ...
4        Liwanag o dilim? May oras pa. Kakampink Leni L...
                               ...                        
25611    "Kala ko wala andito pala si Marcos."*pertaini...
25612    cathy [USERNAME] Dec [USERNAME] parang tanga i...
25613                             Nognog+pandak= BINAY ftw
25614    BINAY:Did your enormous wealth all come from y...
25615                                Uunlad tayo kay Binay
Name: text, Length: 25616, dtype: object

In [20]:
# Inspect the training labels.
y_train

0        1
1        0
2        1
3        0
4        0
        ..
25611    0
25612    1
25613    1
25614    1
25615    0
Name: label, Length: 25616, dtype: int64

In [21]:
# Inspect the held-out test texts.
X_test

0       PRESIDENTE DUTERTE I'm sure in last debateitao...
1       CHANGE IS BADLY NEEDED No To Mar Roxas2016 Dut...
2                                One Pink March Leni Kiko
3                               see youuu later Leni Kiko
4       [USERNAME] Nangyari na yan eh pero kahit anong...
                              ...                        
2840    kaya siguro umabot ng milyon yung boto kay MAR...
2841    Dedicating my 21km run for my chosen Presand V...
2842    Bakit si Mar? Because DuterteGrace Poe and VP ...
2843    patalo po ung patalastas ni Mar Roxas....malas...
2844    Kapihan with Sen. Bongbong Marcos startshe say...
Name: text, Length: 2845, dtype: object

In [22]:
# Inspect the held-out test labels.
y_test

0       0
1       0
2       0
3       0
4       0
       ..
2840    1
2841    0
2842    1
2843    1
2844    0
Name: label, Length: 2845, dtype: int64

In [23]:
# Candidate hyperparameter values.
# NOTE(review): of these lists, only `hidden_size` is read by the training loop visible
# in this part of the notebook; that loop hard-codes its learning rate, epoch count and
# batch size. Confirm whether the other lists are used elsewhere.
epochs = [100, 200, 300]
learning_rate = [0.02, 0.03, 0.04, 0.05]
batch_size = [16, 32, 64, 128]
hidden_size = [250]
num_layers = [1, 2, 3]


In [24]:
# Train, evaluate, and persist one LSTM pipeline per configured hidden size.
for i, size in enumerate(hidden_size):
  print(f"Hidden Size: {size}")

  # Every artefact of this run is written below its own numbered folder.
  run_dir = f'{MODEL_FOLDER}/{i}'

  # Start from an unfitted copy of the pipeline template and apply this run's settings.
  train_lstm = clone(LSTM.LstmPipeline)
  train_lstm.set_params(**{
    'lstm__train_split': None,
    'lstm__module__hidden_size': size,
    'lstm__optimizer__lr': 0.015,
    'lstm__max_epochs': 30,
    'lstm__batch_size': 32,
    'lstm__optimizer': Adam,
    'lstm__criterion': CrossEntropyLoss,
  })

  # Point the first callback at this run's folder and have it monitor the training loss
  # (presumably a checkpoint callback — confirm in LSTM.LstmPipeline).
  first_callback = train_lstm['lstm'].callbacks[0]
  first_callback.dirname = f'{run_dir}/train_lstm'
  first_callback.monitor = 'train_loss_best'

  train_lstm.fit(X_train, y_train)

  # Score the fitted pipeline on the held-out test split.
  accuracy, recall, precision, f1 = Utils.get_prediction_results(
    X_test, y_test, train_lstm
  )

  Utils.save_trained_model(train_lstm, f"{run_dir}/LSTM")

  # Persist the per-epoch training history, indexed by epoch.
  history_data_frame = pd.DataFrame(train_lstm['lstm'].history).set_index('epoch')
  history_data_frame.to_csv(f'{run_dir}/lstm_history.csv')

  # Persist the test metrics as a single-row table.
  metrics_data_frame = pd.DataFrame([
    dict(accuracy=accuracy, recall=recall, precision=precision, f1=f1)
  ])
  metrics_data_frame.to_csv(f'{run_dir}/lstm_metrics.csv')

  # Re-seed the generators after each run.
  Utils.seed_random_number_generators()

Hidden Size: 250


  0%|          | 0/801 [00:00<?, ?it/s]

RuntimeError: Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or `at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this case, you must set an environment variable before running your PyTorch application: CUBLAS_WORKSPACE_CONFIG=:4096:8 or CUBLAS_WORKSPACE_CONFIG=:16:8. For more information, go to https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility

In [None]:
# Completion marker printed after the cells above.
print("Finished")

Finished
