In [1]:
import pandas as pd
import torch
from src import Utils, LSTM
from skorch.dataset import ValidSplit
from copy import deepcopy
from sklearn.base import clone
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Root directory for every artifact this notebook writes
# (checkpoints, saved pipelines, history/metrics CSVs).
MODEL_FOLDER = "model_lstm"

In [3]:
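# Optional: uncomment both lines to make PyTorch use deterministic algorithms
# (usually slower). The cuBLAS workspace setting is what PyTorch's
# reproducibility notes require for deterministic cuBLAS operations on CUDA.
# torch.use_deterministic_algorithms(True)
# %env CUBLAS_WORKSPACE_CONFIG=:4096:8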

In [4]:
# Load the labelled text corpus (columns shown in the output: text, label)
# and display it as the cell result.
DATASET_PATH = 'datasets/datasetall.csv'
dataset = Utils.read_csv_file(DATASET_PATH)
dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [5]:
# Seed the random number generators (helper lives in src.Utils and prints a
# confirmation) before the train/test split in the next cell.
# NOTE(review): which libraries get seeded, and with what value, is decided
# inside Utils — confirm there.
Utils.seed_random_number_generators()

Random number generators seeded.


In [6]:
# Fraction of the dataset reserved for the final test evaluation.
TEST_SIZE = 0.1

# The helper returns the four parts in the order (X_train, X_test, y_train, y_test).
split_parts = Utils.get_train_test_split(dataset, TEST_SIZE)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Preview the training texts (pandas truncates the display).
X_train

0        pag hindi nanalo si Norberto Gonzales pwede ba...
1        Ngayon lang ako super proud sa PRESIDENTE na i...
2        JUST SAW SOMEONE CALL BBM BLENGBLONG HAHAHAHAH...
3        Rep. Binay on her leadership style: I am very ...
4        Liwanag o dilim? May oras pa. Kakampink Leni L...
                               ...                        
25611    "Kala ko wala andito pala si Marcos."*pertaini...
25612    cathy [USERNAME] Dec [USERNAME] parang tanga i...
25613                             Nognog+pandak= BINAY ftw
25614    BINAY:Did your enormous wealth all come from y...
25615                                Uunlad tayo kay Binay
Name: text, Length: 25616, dtype: object

In [8]:
# Preview the training labels (integer class ids, aligned with X_train by index).
y_train

0        1
1        0
2        1
3        0
4        0
        ..
25611    0
25612    1
25613    1
25614    1
25615    0
Name: label, Length: 25616, dtype: int64

In [9]:
# Preview the held-out test texts (pandas truncates the display).
X_test

0       PRESIDENTE DUTERTE I'm sure in last debateitao...
1       CHANGE IS BADLY NEEDED No To Mar Roxas2016 Dut...
2                                One Pink March Leni Kiko
3                               see youuu later Leni Kiko
4       [USERNAME] Nangyari na yan eh pero kahit anong...
                              ...                        
2840    kaya siguro umabot ng milyon yung boto kay MAR...
2841    Dedicating my 21km run for my chosen Presand V...
2842    Bakit si Mar? Because DuterteGrace Poe and VP ...
2843    patalo po ung patalastas ni Mar Roxas....malas...
2844    Kapihan with Sen. Bongbong Marcos startshe say...
Name: text, Length: 2845, dtype: object

In [10]:
# Preview the held-out test labels (integer class ids, aligned with X_test by index).
y_test

0       0
1       0
2       0
3       0
4       0
       ..
2840    1
2841    0
2842    1
2843    1
2844    0
Name: label, Length: 2845, dtype: int64

In [11]:
# Candidate hyperparameter values for the LSTM experiments.
# NOTE(review): in the visible cells only `num_layers` is iterated over by the
# training cell; confirm whether the other grids are still needed.
num_layers = [1, 3]                        # stacked LSTM layers
hidden_size = [250]                        # LSTM hidden state width
batch_size = [16, 32, 64, 128]             # mini-batch sizes
learning_rate = [0.02, 0.03, 0.04, 0.05]   # optimizer learning rates
epochs = [100, 200, 300]                   # training epoch counts


In [12]:
# Train and evaluate one LSTM pipeline per entry in `num_layers`; every other
# hyperparameter is held fixed. Artifacts for run `i` (checkpoints, pickled
# pipeline, history CSV, metrics CSV) are written under f'{MODEL_FOLDER}/{i}/'.
for i, layer_count in enumerate(num_layers):
  # Seed *before* each run so every configuration starts from the same RNG
  # state, independent of what ran earlier in the kernel (the split, a
  # previous or interrupted execution of this cell, ...).
  Utils.seed_random_number_generators()

  print(f"Num layers: {layer_count}")
  run_dir = f'{MODEL_FOLDER}/{i}'

  # Clone the template so each run trains a fresh, unfitted pipeline.
  train_lstm = clone(LSTM.LstmPipeline)

  train_lstm.set_params(
    lstm__train_split=None,  # no internal validation split: fit on all of X_train
    lstm__module__hidden_size=250,
    lstm__module__num_layers=layer_count,
    lstm__optimizer__lr=0.015,
    lstm__max_epochs=30,
    lstm__batch_size=32,
    lstm__optimizer=Adam,
    lstm__criterion=CrossEntropyLoss,
  )

  # NOTE(review): assumes the first callback of the 'lstm' step is the
  # checkpoint callback — confirm against LSTM.LstmPipeline.
  checkpoint = train_lstm['lstm'].callbacks[0]
  checkpoint.dirname = f'{run_dir}/train_lstm'
  # With train_split=None there is no validation loss, so checkpoint on the
  # best training loss instead.
  checkpoint.monitor = 'train_loss_best'

  train_lstm.fit(X_train, y_train)

  # Evaluate on the held-out test set (the helper also prints the metrics).
  accuracy, recall, precision, f1 = Utils.get_prediction_results(
    X_test,
    y_test,
    train_lstm,
  )

  Utils.save_trained_model(train_lstm, f"{run_dir}/LSTM")

  # Per-epoch training history, indexed by epoch number.
  history_data_frame = pd.DataFrame(
      train_lstm['lstm'].history
  ).set_index('epoch')
  history_data_frame.to_csv(f'{run_dir}/lstm_history.csv')

  # Single-row summary of the test-set metrics for this configuration.
  metrics_data_frame = pd.DataFrame([{
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1': f1,
  }])
  metrics_data_frame.to_csv(f'{run_dir}/lstm_metrics.csv')

# Leave the RNGs freshly seeded for any later cells, as the original loop did
# after its final iteration.
Utils.seed_random_number_generators()

Num layers: 1


  0%|          | 0/801 [00:00<?, ?it/s]

  epoch    train_loss    cp     dur
-------  ------------  ----  ------
      1        [36m0.4758[0m     +  6.1341


  0%|          | 0/801 [00:00<?, ?it/s]

      2        [36m0.4375[0m     +  5.5734


  0%|          | 0/801 [00:00<?, ?it/s]

      3        [36m0.4194[0m     +  5.6795


  0%|          | 0/801 [00:00<?, ?it/s]

      4        [36m0.4102[0m     +  5.4557


  0%|          | 0/801 [00:00<?, ?it/s]

      5        [36m0.4071[0m     +  5.5032


  0%|          | 0/801 [00:00<?, ?it/s]

      6        [36m0.4022[0m     +  5.5029


  0%|          | 0/801 [00:00<?, ?it/s]

      7        [36m0.3955[0m     +  5.4130


  0%|          | 0/801 [00:00<?, ?it/s]

      8        [36m0.3878[0m     +  5.4341


  0%|          | 0/801 [00:00<?, ?it/s]

      9        [36m0.3846[0m     +  5.5967


  0%|          | 0/801 [00:00<?, ?it/s]

     10        [36m0.3839[0m     +  5.4400


  0%|          | 0/801 [00:00<?, ?it/s]

     11        0.3843        5.3300


  0%|          | 0/801 [00:00<?, ?it/s]

     12        0.3888        5.4742


  0%|          | 0/801 [00:00<?, ?it/s]

     13        0.4018        5.5497


  0%|          | 0/801 [00:00<?, ?it/s]

     14        0.3909        5.5292


  0%|          | 0/801 [00:00<?, ?it/s]

     15        0.3874        5.6129


  0%|          | 0/801 [00:00<?, ?it/s]

     16        [36m0.3810[0m     +  5.8871


  0%|          | 0/801 [00:00<?, ?it/s]

     17        0.3820        5.5210


  0%|          | 0/801 [00:00<?, ?it/s]

     18        [36m0.3809[0m     +  5.4290


  0%|          | 0/801 [00:00<?, ?it/s]

     19        [36m0.3807[0m     +  5.5480


  0%|          | 0/801 [00:00<?, ?it/s]

     20        [36m0.3805[0m     +  5.4257


  0%|          | 0/801 [00:00<?, ?it/s]

     21        [36m0.3791[0m     +  5.4225


  0%|          | 0/801 [00:00<?, ?it/s]

     22        0.3798        5.4564


  0%|          | 0/801 [00:00<?, ?it/s]

     23        [36m0.3782[0m     +  5.5559


  0%|          | 0/801 [00:00<?, ?it/s]

     24        [36m0.3776[0m     +  5.6265


  0%|          | 0/801 [00:00<?, ?it/s]

     25        [36m0.3774[0m     +  5.4699


  0%|          | 0/801 [00:00<?, ?it/s]

     26        [36m0.3746[0m     +  5.7780


  0%|          | 0/801 [00:00<?, ?it/s]

     27        [36m0.3731[0m     +  5.7848


  0%|          | 0/801 [00:00<?, ?it/s]

     28        [36m0.3709[0m     +  5.6670


  0%|          | 0/801 [00:00<?, ?it/s]

     29        0.3739        5.6545


  0%|          | 0/801 [00:00<?, ?it/s]

     30        0.3709        5.5522
Accuracy: 0.8056239015817224
Recall: 0.896094839609484
Precision: 0.7608052101835405
F1-score: 0.8229266730707653
Ensemble model saved to Pipeline(steps=[('tokenizer', CalamancyTokenizer()),
                ('lstm',
                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=LstmModel(
    (lstm): LSTM(200, 250, batch_first=True)
    (fc): Linear(in_features=250, out_features=2, bias=True)
  ),
))]).pkl
Random number generators seeded.
Num layers: 3


  0%|          | 0/801 [00:00<?, ?it/s]

  epoch    train_loss    cp      dur
-------  ------------  ----  -------
      1        [36m0.7167[0m     +  13.1293


  0%|          | 0/801 [00:00<?, ?it/s]

      2        [36m0.7086[0m     +  11.3033


  0%|          | 0/801 [00:00<?, ?it/s]

      3        0.7125        11.6649


  0%|          | 0/801 [00:00<?, ?it/s]

      4        [36m0.7055[0m     +  10.9419


  0%|          | 0/801 [00:00<?, ?it/s]

      5        [36m0.7047[0m     +  10.2751


  0%|          | 0/801 [00:00<?, ?it/s]

      6        0.7049        9.9633


  0%|          | 0/801 [00:00<?, ?it/s]

      7        0.7502        10.0843


  0%|          | 0/801 [00:00<?, ?it/s]

      8        0.7102        10.0494


  0%|          | 0/801 [00:00<?, ?it/s]

      9        0.7357        9.9624


  0%|          | 0/801 [00:00<?, ?it/s]

     10        0.7160        11.2030


  0%|          | 0/801 [00:00<?, ?it/s]

     11        0.7125        10.2382


  0%|          | 0/801 [00:00<?, ?it/s]

     12        0.7121        10.0565


  0%|          | 0/801 [00:00<?, ?it/s]

     13        0.7098        9.8438


  0%|          | 0/801 [00:00<?, ?it/s]

     14        0.7097        9.8223


  0%|          | 0/801 [00:00<?, ?it/s]

     15        0.7087        9.9309


  0%|          | 0/801 [00:00<?, ?it/s]

     16        0.7091        9.9195


  0%|          | 0/801 [00:00<?, ?it/s]

     17        0.7132        9.8342


  0%|          | 0/801 [00:00<?, ?it/s]

     18        0.7124        10.0016


  0%|          | 0/801 [00:00<?, ?it/s]

     19        0.7113        9.8829


  0%|          | 0/801 [00:00<?, ?it/s]

     20        0.7121        9.9816


  0%|          | 0/801 [00:00<?, ?it/s]

     21        [36m0.7038[0m     +  10.0041


  0%|          | 0/801 [00:00<?, ?it/s]

     22        [36m0.7033[0m     +  9.9988


  0%|          | 0/801 [00:00<?, ?it/s]

     23        0.7070        9.9326


  0%|          | 0/801 [00:00<?, ?it/s]

     24        0.7601        10.0419


  0%|          | 0/801 [00:00<?, ?it/s]

     25        0.7124        9.9630


  0%|          | 0/801 [00:00<?, ?it/s]

     26        0.7252        10.0558


  0%|          | 0/801 [00:00<?, ?it/s]

     27        0.7163        9.8811


  0%|          | 0/801 [00:00<?, ?it/s]

     28        0.7121        9.8112


  0%|          | 0/801 [00:00<?, ?it/s]

     29        [36m0.6968[0m     +  9.8179


  0%|          | 0/801 [00:00<?, ?it/s]

     30        [36m0.5860[0m     +  9.8858
Accuracy: 0.7184534270650264
Recall: 0.9386331938633193
Precision: 0.6537153958232151
F1-score: 0.7706842255940451
Ensemble model saved to Pipeline(steps=[('tokenizer', CalamancyTokenizer()),
                ('lstm',
                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=LstmModel(
    (lstm): LSTM(200, 250, num_layers=3, batch_first=True)
    (fc): Linear(in_features=250, out_features=2, bias=True)
  ),
))]).pkl
Random number generators seeded.


In [13]:
# Completion marker: reaching this cell means every cell above ran without raising.
print("Finished")

Finished
