In [1]:
import pandas as pd
import torch
from src import Utils, LSTM
from skorch.dataset import ValidSplit
from copy import deepcopy
from sklearn.base import clone
from torch import optim, nn
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
MODEL_FOLDER = 'model_lstm/dropout-1'

In [3]:
# torch.use_deterministic_algorithms(True)
# %env CUBLAS_WORKSPACE_CONFIG=:4096:8

In [4]:
dataset = Utils.read_csv_file('datasets/datasetall.csv')
dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [5]:
Utils.seed_random_number_generators()

Random number generators seeded.


In [6]:
TEST_SIZE = 0.2

X_train, X_test, y_train, y_test = Utils.get_train_test_split(dataset, TEST_SIZE)

In [7]:
X_train

0         [USERNAME] Palangga ka man sang mga taga Baco...
1                      Who dafuq is Jose Montemayor Jr.???
2        Di na nakakatuwa yung mukha ni Mar Roxas sa TV...
3                      national elections. | via[USERNAME]
4        Binay will be staring in a movie called "The D...
                               ...                        
22764    "Kala ko wala andito pala si Marcos."*pertaini...
22765    sie ~ [USERNAME]Marcos Magnanakaw Marcos Dikta...
22766                    If Mar is BatMarBinay is Bane-ay.
22767    to my moots im sorry in not sorry for flooding...
22768                                Uunlad tayo kay Binay
Name: text, Length: 22769, dtype: object

In [8]:
y_train

0        0
1        0
2        1
3        0
4        1
        ..
22764    0
22765    1
22766    1
22767    1
22768    0
Name: label, Length: 22769, dtype: int64

In [9]:
X_test

0                          Bakit trending ang Only Binay?
1       Mare @ Cebu [USERNAME][USERNAME] Marcos Never ...
2       Kahit anong gawin ko bakit di ko ma appreciate...
3       Oras na para tayo'y bumoto ng taong mag tataas...
4       VP[USERNAME]is currently in Zamboanga Sibugay ...
                              ...                        
5687      [USERNAME] Laban LeniAngat Buhay LahatLeni Kiko
5688    Nagconcede ka man Maimarwala ka prinnagdala ka...
5689    Did You Know that former Philippine secretary ...
5690           Bakit nakakairita commercial ni Mar Roxas?
5691    To Doc Willie Ong I'd like to believe you are ...
Name: text, Length: 5692, dtype: object

In [10]:
y_test

0       0
1       1
2       1
3       0
4       0
       ..
5687    0
5688    1
5689    0
5690    1
5691    0
Name: label, Length: 5692, dtype: int64

In [11]:
epochs = [100, 200, 300]
learning_rate = [0.1, 0.01, 0.015, 0.02, 0.001, 0.0001, 0.00001]
# learning_rate = [0.02, 0.03, 0.04, 0.05]
batch_size = [16, 32, 64, 128]
hidden_size = [250]
num_layers = [1, 3]
dropout=[0.1, 0.2, 0.5]


In [12]:
for i in range(len(dropout)):
  print(f"Dropout: {dropout[i]}")
  train_lstm = clone(LSTM.LstmPipeline)

  train_lstm.set_params(
    lstm__train_split=None,
    lstm__module__hidden_size=250,
    lstm__module__num_layers=1,
    lstm__module__output_size=2,
    lstm__module__dropout=dropout[i],
    lstm__optimizer__lr=0.015,
    lstm__max_epochs=30,
    lstm__batch_size=32,
    # lstm__optimizer=optim.SGD,
    # lstm__criterion=nn.BCEWithLogitsLoss,
  )

  train_lstm['lstm'].callbacks[0].dirname = f'{MODEL_FOLDER}/{i}/train_lstm'
  train_lstm['lstm'].callbacks[0].monitor = 'train_loss_best'

  train_lstm.fit(X_train, y_train)

  accuracy, recall, precision, f1 = Utils.get_prediction_results(
    X_test,
    y_test,
    train_lstm,
  )

  Utils.save_trained_model(train_lstm, f"{MODEL_FOLDER}/{i}/LSTM")

  history_data_frame = pd.DataFrame(
      train_lstm['lstm'].history
  ).set_index('epoch')
  history_data_frame.to_csv(f'{MODEL_FOLDER}/{i}/lstm_history.csv')

  metrics_data_frame = pd.DataFrame([{
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1': f1,
  }])
  metrics_data_frame.to_csv(f'{MODEL_FOLDER}/{i}/lstm_metrics.csv')

  Utils.seed_random_number_generators()

Dropout: 0.1




  0%|          | 0/712 [00:00<?, ?it/s]

  epoch    train_loss    cp     dur
-------  ------------  ----  ------
      1        [36m0.4813[0m     +  5.1726


  0%|          | 0/712 [00:00<?, ?it/s]

      2        [36m0.4358[0m     +  4.7454


  0%|          | 0/712 [00:00<?, ?it/s]

      3        [36m0.4256[0m     +  4.7216


  0%|          | 0/712 [00:00<?, ?it/s]

      4        [36m0.4197[0m     +  4.7611


  0%|          | 0/712 [00:00<?, ?it/s]

      5        [36m0.4106[0m     +  4.7291


  0%|          | 0/712 [00:00<?, ?it/s]

      6        [36m0.4042[0m     +  5.1040


  0%|          | 0/712 [00:00<?, ?it/s]

      7        [36m0.3949[0m     +  4.9637


  0%|          | 0/712 [00:00<?, ?it/s]

      8        [36m0.3877[0m     +  4.6998


  0%|          | 0/712 [00:00<?, ?it/s]

      9        [36m0.3823[0m     +  4.7169


  0%|          | 0/712 [00:00<?, ?it/s]

     10        [36m0.3759[0m     +  4.7451


  0%|          | 0/712 [00:00<?, ?it/s]

     11        [36m0.3694[0m     +  5.6578


  0%|          | 0/712 [00:00<?, ?it/s]

     12        [36m0.3614[0m     +  5.1312


  0%|          | 0/712 [00:00<?, ?it/s]

     13        [36m0.3574[0m     +  4.8666


  0%|          | 0/712 [00:00<?, ?it/s]

     14        [36m0.3508[0m     +  4.8123


  0%|          | 0/712 [00:00<?, ?it/s]

     15        [36m0.3470[0m     +  4.7839


  0%|          | 0/712 [00:00<?, ?it/s]

     16        [36m0.3412[0m     +  4.6939


  0%|          | 0/712 [00:00<?, ?it/s]

     17        [36m0.3343[0m     +  4.7817


  0%|          | 0/712 [00:00<?, ?it/s]

     18        [36m0.3297[0m     +  4.7299


  0%|          | 0/712 [00:00<?, ?it/s]

     19        [36m0.3265[0m     +  4.7866


  0%|          | 0/712 [00:00<?, ?it/s]

     20        [36m0.3180[0m     +  4.7969


  0%|          | 0/712 [00:00<?, ?it/s]

     21        0.3206        4.8450


  0%|          | 0/712 [00:00<?, ?it/s]

     22        [36m0.3165[0m     +  4.7531


  0%|          | 0/712 [00:00<?, ?it/s]

     23        [36m0.3113[0m     +  4.7212


  0%|          | 0/712 [00:00<?, ?it/s]

     24        [36m0.3007[0m     +  4.7916


  0%|          | 0/712 [00:00<?, ?it/s]

     25        [36m0.2997[0m     +  4.7691


  0%|          | 0/712 [00:00<?, ?it/s]

     26        [36m0.2926[0m     +  4.8270


  0%|          | 0/712 [00:00<?, ?it/s]

     27        [36m0.2914[0m     +  4.8500


  0%|          | 0/712 [00:00<?, ?it/s]

     28        [36m0.2885[0m     +  4.8751


  0%|          | 0/712 [00:00<?, ?it/s]

     29        [36m0.2843[0m     +  4.7155


  0%|          | 0/712 [00:00<?, ?it/s]

     30        [36m0.2804[0m     +  4.8307
Accuracy: 0.7853127196064652
Recall: 0.7894736842105263
Precision: 0.785639958376691
F1-score: 0.7875521557719055
Ensemble model saved to Pipeline(steps=[('tokenizer', CalamancyTokenizer()),
                ('lstm',
                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=LstmModel(
    (lstm): LSTM(200, 250, batch_first=True, dropout=0.1)
    (fc): Linear(in_features=250, out_features=2, bias=True)
  ),
))]).pkl
Random number generators seeded.
Dropout: 0.2




  0%|          | 0/712 [00:00<?, ?it/s]

  epoch    train_loss    cp     dur
-------  ------------  ----  ------
      1        [36m0.4813[0m     +  5.0355


  0%|          | 0/712 [00:00<?, ?it/s]

      2        [36m0.4358[0m     +  4.3679


  0%|          | 0/712 [00:00<?, ?it/s]

      3        [36m0.4256[0m     +  4.3475


  0%|          | 0/712 [00:00<?, ?it/s]

      4        [36m0.4197[0m     +  4.2406


  0%|          | 0/712 [00:00<?, ?it/s]

      5        [36m0.4106[0m     +  4.2507


  0%|          | 0/712 [00:00<?, ?it/s]

      6        [36m0.4042[0m     +  4.2814


  0%|          | 0/712 [00:00<?, ?it/s]

      7        [36m0.3949[0m     +  4.7160


  0%|          | 0/712 [00:00<?, ?it/s]

      8        [36m0.3877[0m     +  4.5997


  0%|          | 0/712 [00:00<?, ?it/s]

      9        [36m0.3823[0m     +  5.5499


  0%|          | 0/712 [00:00<?, ?it/s]

     10        [36m0.3759[0m     +  6.5223


  0%|          | 0/712 [00:00<?, ?it/s]

     11        [36m0.3694[0m     +  8.3340


  0%|          | 0/712 [00:00<?, ?it/s]

     12        [36m0.3614[0m     +  5.5859


  0%|          | 0/712 [00:00<?, ?it/s]

     13        [36m0.3574[0m     +  5.9603


  0%|          | 0/712 [00:00<?, ?it/s]

     14        [36m0.3508[0m     +  5.3875


  0%|          | 0/712 [00:00<?, ?it/s]

     15        [36m0.3470[0m     +  5.5356


  0%|          | 0/712 [00:00<?, ?it/s]

     16        [36m0.3412[0m     +  7.0213


  0%|          | 0/712 [00:00<?, ?it/s]

     17        [36m0.3343[0m     +  4.9197


  0%|          | 0/712 [00:00<?, ?it/s]

     18        [36m0.3297[0m     +  4.3816


  0%|          | 0/712 [00:00<?, ?it/s]

     19        [36m0.3265[0m     +  5.1421


  0%|          | 0/712 [00:00<?, ?it/s]

     20        [36m0.3180[0m     +  4.9128


  0%|          | 0/712 [00:00<?, ?it/s]

     21        0.3206        5.3528


  0%|          | 0/712 [00:00<?, ?it/s]

     22        [36m0.3165[0m     +  4.8668


  0%|          | 0/712 [00:00<?, ?it/s]

     23        [36m0.3113[0m     +  6.1099


  0%|          | 0/712 [00:00<?, ?it/s]

     24        [36m0.3007[0m     +  6.1235


  0%|          | 0/712 [00:00<?, ?it/s]

     25        [36m0.2997[0m     +  5.2813


  0%|          | 0/712 [00:00<?, ?it/s]

     26        [36m0.2926[0m     +  9.6498


  0%|          | 0/712 [00:00<?, ?it/s]

     27        [36m0.2914[0m     +  7.2749


  0%|          | 0/712 [00:00<?, ?it/s]

     28        [36m0.2885[0m     +  7.4444


  0%|          | 0/712 [00:00<?, ?it/s]

     29        [36m0.2843[0m     +  6.3203


  0%|          | 0/712 [00:00<?, ?it/s]

     30        [36m0.2804[0m     +  7.9182
Accuracy: 0.7853127196064652
Recall: 0.7894736842105263
Precision: 0.785639958376691
F1-score: 0.7875521557719055
Ensemble model saved to Pipeline(steps=[('tokenizer', CalamancyTokenizer()),
                ('lstm',
                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=LstmModel(
    (lstm): LSTM(200, 250, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=250, out_features=2, bias=True)
  ),
))]).pkl
Random number generators seeded.
Dropout: 0.5


KeyboardInterrupt: 

In [None]:
print("Finished")

Finished
