In [25]:
import pandas as pd
import numpy as np
import math
from torch import nn, optim, device, cuda, Tensor, LongTensor
from src import Utils, LSTM

import calamancy

In [2]:
TEST_SIZE = 0.2

INPUT_SIZE = 200
NUM_OF_HIDDEN_NODES = 50
OUTPUT_SIZE = 2

LEARNING_RATE = 0.01

SAVE_MODEL = False
MODEL_FOLDER = 'models/model_lstm'
DATASET = 'datasets/datasetall.csv'

In [3]:
if SAVE_MODEL:
  os.makedirs(
    MODEL_FOLDER,
    exist_ok=True, # Create folder if it doesn't exist, else do nothing
  )

  print(f"Saving model at '{MODEL_FOLDER}'")

In [4]:
def read_csv_file(filename: str) -> pd.DataFrame:
    try:
        data = pd.read_csv(filename, lineterminator='\n', usecols=range(2))
        print("CSV file read successfully!")
        return data
    except FileNotFoundError:
        print("ERROR: File not found")
        exit(1)

dataset = read_csv_file(DATASET)
dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [5]:
dataset['label'].value_counts(ascending=True)

label
0    14115
1    14346
Name: count, dtype: int64

In [6]:
def shuffle_data_frame(data_frame):
    text = list(data_frame['text'])
    label = list(data_frame['label'])

    assert(len(text) == len(label))

    indices = list(range(len(label)))

    # Make a random number generator that will shuffle list of indices
    # It is seeded to be reproducible
    random_number_generator = np.random.default_rng(seed=0)
    random_number_generator.shuffle(indices)

    shuffled_text = []
    shuffled_labels = []

    # Iterate through the list of indices and add the original data
    # from those shuffled indices
    for index in indices:
        shuffled_text.append(text[index])
        shuffled_labels.append(label[index])

    return pd.DataFrame({
        'text': shuffled_text,
        'label': shuffled_labels,
    })


def get_train_test_split(data_frame: pd.DataFrame, test_size: float):
    """
    Makes a stratified train test split.
    This aims to preserve the distribution between classes.
    """
    if not (1 > test_size > 0):
        print('ERROR: test_size must be between 0 and 1')
        return

    data_frame = shuffle_data_frame(data_frame)

    data_frame_length = len(data_frame)
    train_size = 1 - test_size

    nonhate_rows = data_frame[data_frame['label'] == 0] 
    nonhate_row_length = len(nonhate_rows)

    nonhate_row_train_size = math.ceil(nonhate_row_length * train_size)

    nonhate_row_train = nonhate_rows[0:nonhate_row_train_size]
    nonhate_row_test = nonhate_rows[nonhate_row_train_size:nonhate_row_length]

    assert(len(nonhate_row_train) + len(nonhate_row_test) == nonhate_row_length)

    hate_rows = data_frame[data_frame['label'] == 1] 
    hate_row_length = len(hate_rows)

    hate_row_train_size = math.ceil(hate_row_length * train_size)

    hate_row_train = hate_rows[0:hate_row_train_size]
    hate_row_test = hate_rows[hate_row_train_size:hate_row_length]

    assert(len(hate_row_train) + len(hate_row_test) == hate_row_length)

    combined_train = pd.concat([nonhate_row_train, hate_row_train])
    combined_test = pd.concat([nonhate_row_test, hate_row_test])

    shuffled_train = shuffle_data_frame(combined_train)
    shuffled_test = shuffle_data_frame(combined_test)

    return (
        shuffled_train['text'],
        shuffled_test['text'],
        shuffled_train['label'],
        shuffled_test['label'],
    )

X_train, X_test, y_train, y_test = get_train_test_split(dataset, TEST_SIZE)

In [7]:
pd.DataFrame(X_train)

Unnamed: 0,text
0,[USERNAME] Palangga ka man sang mga taga Baco...
1,Who dafuq is Jose Montemayor Jr.???
2,Di na nakakatuwa yung mukha ni Mar Roxas sa TV...
3,national elections. | via[USERNAME]
4,"Binay will be staring in a movie called ""The D..."
...,...
22764,"""Kala ko wala andito pala si Marcos.""*pertaini..."
22765,sie ~ [USERNAME]Marcos Magnanakaw Marcos Dikta...
22766,If Mar is BatMarBinay is Bane-ay.
22767,to my moots im sorry in not sorry for flooding...


In [8]:
y_train_dataframe = pd.DataFrame(y_train, columns=['label'])
y_train_dataframe

Unnamed: 0,label
0,0
1,0
2,1
3,0
4,1
...,...
22764,0
22765,1
22766,1
22767,1


In [9]:
y_train_dataframe.value_counts(ascending=True)

label
0        11292
1        11477
Name: count, dtype: int64

In [10]:
pd.DataFrame(X_test)

Unnamed: 0,text
0,Bakit trending ang Only Binay?
1,Mare @ Cebu [USERNAME][USERNAME] Marcos Never ...
2,Kahit anong gawin ko bakit di ko ma appreciate...
3,Oras na para tayo'y bumoto ng taong mag tataas...
4,VP[USERNAME]is currently in Zamboanga Sibugay ...
...,...
5687,[USERNAME] Laban LeniAngat Buhay LahatLeni Kiko
5688,Nagconcede ka man Maimarwala ka prinnagdala ka...
5689,Did You Know that former Philippine secretary ...
5690,Bakit nakakairita commercial ni Mar Roxas?


In [12]:
y_test_dataframe = pd.DataFrame(y_test, columns=['label'])
y_test_dataframe

Unnamed: 0,label
0,0
1,1
2,1
3,0
4,0
...,...
5687,0
5688,1
5689,0
5690,1


In [13]:
y_test_dataframe.value_counts(ascending=True)

label
0        2823
1        2869
Name: count, dtype: int64

## CalamanCy

In [7]:
Calamancy = calamancy.load("tl_calamancy_md-0.1.0")

Calamancy



<spacy.lang.tl.Tagalog at 0x7f1fd821d1c0>

In [16]:
import tqdm

def get_calamancy_tokens(data):
  # Allows it to work with both dataframes and
  # simple lists of strings
  if isinstance(data, pd.Series):
    data = data.values

  samples = []

  progress_bar = tqdm.tqdm(total=len(data))

  for sample in Calamancy.pipe(data):
    progress_bar.update(1)
    tokens = [
      token
      for token 
      in sample
      if not token.is_punct
    ]

    samples.append(tokens)

  progress_bar.close()

  return samples

def get_token_vectors(tokens):
  vectors = []

  progress_bar = tqdm.tqdm(total=len(tokens))

  for sample in tokens:
    progress_bar.update(1)
    vector = np.mean(
      [token.vector for token in sample],
      axis=0,
    )

    vectors.append(vector)

  progress_bar.close()

  return vectors

In [9]:
X_train_tokens = get_calamancy_tokens(X_train)

  0%|          | 0/22769 [00:00<?, ?it/s]

100%|██████████| 22769/22769 [04:03<00:00, 93.68it/s] 


In [10]:
pd.DataFrame(X_train_tokens[:20])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,,USERNAME,Palangga,ka,man,sang,mga,taga,Bacolod,vp,...,,,,,,,,,,
1,Who,dafuq,is,Jose,Montemayor,Jr,,,,,...,,,,,,,,,,
2,Di,na,nakakatuwa,yung,mukha,ni,Mar,Roxas,sa,TV,...,,,,,,,,,,
3,national,elections,|,via[USERNAME,,,,,,,...,,,,,,,,,,
4,Binay,will,be,staring,in,a,movie,called,The,Dark,...,,,,,,,,,,
5,The,fact,that,we,are,even,discussing,the,probability,of,...,,,,,,,,,,
6,USERNAME,Lito,Atienza,na,goi,,,,,,...,,,,,,,,,,
7,,Yung,mga,yaman,ng,Marcos,na,ninakaw,nila,bawiin,...,,,,,,,,,,
8,Ay,dyuskoJudgemental,peepsI,KENNAT,Election,PH,,,,,...,,,,,,,,,,
9,Mas,pipiliin,kong,si,Mar,Roxas,ang,manalo,kesa,kay,...,,,,,,,,,,


In [17]:
X_train_vectors = get_token_vectors(X_train_tokens)

100%|██████████| 22769/22769 [00:27<00:00, 825.02it/s]


In [18]:
pd.DataFrame(X_train_vectors[:20])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.201896,-0.098679,0.202629,0.100888,-0.126542,-0.019928,0.195995,-0.176912,0.011754,-0.472818,...,0.15869,0.126213,-0.123086,0.151638,0.052458,-0.100496,0.082392,-0.050556,-0.207549,-0.252894
1,0.13055,0.24025,0.170797,-0.086086,0.064775,-0.117744,0.029172,-0.458066,-0.038224,-0.418824,...,0.064635,0.240563,-0.013853,0.014041,-0.270232,0.398339,0.098376,0.318264,-0.115856,0.006477
2,-0.230588,0.169705,-0.032717,-0.149275,-0.04721,-0.109674,0.324299,-0.196976,-0.140072,-0.080164,...,0.300772,0.157875,-0.046591,0.11093,0.263457,-0.009701,-0.261329,0.037063,0.029568,-0.392125
3,0.333349,-0.079338,0.032911,-0.077019,0.202654,0.148299,0.492151,-0.280707,-0.086112,-0.468649,...,0.127474,-0.129019,0.184873,-0.077739,-0.236017,-0.223829,0.269438,0.377585,0.085561,-0.081632
4,-0.062128,0.015108,0.218961,0.229353,0.389644,-0.221156,0.192981,-0.354057,0.099793,-0.149075,...,0.044785,0.260572,-0.173939,0.032819,-0.163195,-0.011637,-0.080519,0.113606,-0.112847,-0.259416
5,0.095129,0.216448,0.192956,0.193494,0.638228,-0.008904,0.426423,-0.339432,-0.02016,-0.276543,...,0.260881,0.316046,-0.127748,-0.046058,-0.252485,-0.115437,-0.1721,0.187988,-0.063901,-0.133963
6,0.085855,0.180508,0.135688,-0.089693,0.047697,-0.057927,0.794444,-0.20725,0.254828,-0.207767,...,-0.084057,0.039359,0.24906,0.099819,0.150081,0.160108,0.08668,0.10333,-0.189651,-0.152889
7,-0.043678,0.192529,0.218011,-0.023359,-0.119977,-0.281286,0.021059,-0.118914,-0.234168,-0.316062,...,0.154782,-0.226008,-0.265473,-0.135462,0.221872,0.102385,0.115639,0.044977,-0.155364,-0.116667
8,0.108935,0.206184,0.279004,-0.149535,0.168271,0.330873,0.307882,-0.543837,0.040067,-0.102028,...,0.000555,0.18354,-0.039099,-0.016287,-0.141202,-0.270593,0.144077,0.142775,0.040365,0.013068
9,-0.184766,0.107415,-0.015957,-0.186998,-0.03119,-0.19238,0.221244,-0.306518,-0.095913,-0.217499,...,0.156743,-0.007496,-0.185448,0.04522,0.141532,-0.055748,0.085846,0.035124,-0.09983,-0.169312


## LSTM

In [49]:
class LstmModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.lstm = nn.LSTM(
      INPUT_SIZE,
      NUM_OF_HIDDEN_NODES,
      batch_first=True,
    )
    self.linear = nn.Linear(NUM_OF_HIDDEN_NODES, OUTPUT_SIZE)

    self.lstm_output = None

  def forward(self, input):
    self.lstm_output = self.lstm(input)

    linear_output = self.linear(self.lstm_output[0])

    return linear_output

Lstm = LstmModel()

Lstm

LstmModel(
  (lstm): LSTM(200, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=2, bias=True)
)

In [27]:
X_train_tensor = Tensor(np.array(X_train_vectors))

In [53]:
Lstm(X_train_tensor)

tensor([[0.0286, 0.1445],
        [0.0361, 0.1697],
        [0.0463, 0.1906],
        ...,
        [0.0490, 0.2238],
        [0.0334, 0.2209],
        [0.0063, 0.2158]], grad_fn=<AddmmBackward0>)

In [54]:
Lstm.lstm_output

(tensor([[-0.0720, -0.0221, -0.0835,  ...,  0.1020,  0.0046, -0.0420],
         [-0.1184,  0.0031, -0.0662,  ...,  0.0703, -0.0842, -0.0680],
         [-0.0985,  0.0417, -0.1145,  ...,  0.1113, -0.0627, -0.0116],
         ...,
         [-0.1297, -0.0269, -0.1396,  ...,  0.1114, -0.0551, -0.0808],
         [-0.1763, -0.0243, -0.1842,  ...,  0.0198, -0.1671, -0.1007],
         [-0.0751,  0.0468, -0.0923,  ...,  0.1281, -0.1036, -0.0899]],
        grad_fn=<SqueezeBackward1>),
 (tensor([[-0.0751,  0.0468, -0.0923, -0.0378,  0.0694, -0.1539, -0.0977,  0.1297,
            0.0302,  0.1738,  0.0197, -0.0415, -0.1067, -0.0837, -0.0131,  0.1425,
           -0.0613,  0.0867, -0.0099,  0.0662, -0.0316, -0.0215, -0.1154, -0.1064,
            0.0482, -0.0573,  0.1912, -0.0152, -0.0005, -0.1083,  0.0515,  0.0391,
            0.0032,  0.0424, -0.0266,  0.2285, -0.1485,  0.1319,  0.1029, -0.0299,
           -0.1471,  0.0591, -0.2040, -0.0525,  0.0521,  0.0066, -0.1493,  0.1281,
           -0.1036, -0.0

In [11]:
train_lstm = LSTM.LstmPipeline

In [12]:
train_lstm

In [13]:
train_lstm.fit(X_train, y_train)

  0%|          | 0/1 [00:00<?, ?it/s]

  epoch    train_loss    cp     dur
-------  ------------  ----  ------
      1        [36m0.6943[0m     +  2.8459


  0%|          | 0/1 [00:00<?, ?it/s]

      2        1.2824        2.4968


  0%|          | 0/1 [00:00<?, ?it/s]

      3        0.7074        2.6408


  0%|          | 0/1 [00:00<?, ?it/s]

      4        1.2360        2.5040


  0%|          | 0/1 [00:00<?, ?it/s]

      5        1.1368        2.4908


  0%|          | 0/1 [00:00<?, ?it/s]

      6        0.7182        2.6382


  0%|          | 0/1 [00:00<?, ?it/s]

      7        0.7412        2.4682


  0%|          | 0/1 [00:00<?, ?it/s]

      8        0.7419        2.6623


  0%|          | 0/1 [00:00<?, ?it/s]

      9        [36m0.6898[0m     +  2.4815


  0%|          | 0/1 [00:00<?, ?it/s]

     10        0.7377        2.5466


  0%|          | 0/1 [00:00<?, ?it/s]

     11        0.7142        2.6251


  0%|          | 0/1 [00:00<?, ?it/s]

     12        0.6908        2.5134


  0%|          | 0/1 [00:00<?, ?it/s]

     13        0.7220        2.6289


  0%|          | 0/1 [00:00<?, ?it/s]

     14        0.7053        2.4770


  0%|          | 0/1 [00:00<?, ?it/s]

     15        [36m0.6887[0m     +  2.5650


  0%|          | 0/1 [00:00<?, ?it/s]

     16        0.7102        2.6166


  0%|          | 0/1 [00:00<?, ?it/s]

     17        0.6992        2.4737


  0%|          | 0/1 [00:00<?, ?it/s]

     18        [36m0.6843[0m     +  2.6276


  0%|          | 0/1 [00:00<?, ?it/s]

     19        0.6986        2.5151


  0%|          | 0/1 [00:00<?, ?it/s]

     20        0.6903        2.4738


  0%|          | 0/1 [00:00<?, ?it/s]

     21        [36m0.6767[0m     +  2.6333


  0%|          | 0/1 [00:00<?, ?it/s]

     22        0.6860        2.4801


  0%|          | 0/1 [00:00<?, ?it/s]

     23        0.6776        2.6459


  0%|          | 0/1 [00:00<?, ?it/s]

     24        [36m0.6658[0m     +  2.4777


  0%|          | 0/1 [00:00<?, ?it/s]

     25        0.6738        2.4960


  0%|          | 0/1 [00:00<?, ?it/s]

     26        [36m0.6628[0m     +  2.6443


  0%|          | 0/1 [00:00<?, ?it/s]

     27        [36m0.6561[0m     +  2.5308


  0%|          | 0/1 [00:00<?, ?it/s]

     28        0.6579        2.4827


  0%|          | 0/1 [00:00<?, ?it/s]

     29        [36m0.6439[0m     +  2.6539


  0%|          | 0/1 [00:00<?, ?it/s]

     30        [36m0.6429[0m     +  2.4789


In [14]:
accuracy, recall, precision, f1 = Utils.get_prediction_results(
  X_test,
  y_test,
  train_lstm,
)

Accuracy: 0.671293042867182
Recall: 0.5238759149529453
Precision: 0.7485059760956175
F1-score: 0.6163625179413574


In [15]:
Utils.save_trained_model(train_lstm, f"{MODEL_FOLDER}/LSTM")

Ensemble model saved to Pipeline(steps=[('tokenizer', CalamancyTokenizer()),
                ('lstm',
                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=LstmModel(
    (lstm): LSTM(200, 300, batch_first=True)
    (fc): Linear(in_features=300, out_features=2, bias=True)
  ),
))]).pkl


In [16]:
history_data_frame = pd.DataFrame(
    train_lstm['lstm'].history
).set_index('epoch')
history_data_frame.to_csv(f'{MODEL_FOLDER}/lstm_history.csv')

In [17]:
history_data_frame

Unnamed: 0_level_0,batches,train_batch_count,dur,train_loss,train_loss_best,event_cp
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"[{'train_loss': 0.6942919492721558, 'train_bat...",1,2.845877,0.694292,True,True
2,"[{'train_loss': 1.2823944091796875, 'train_bat...",1,2.496768,1.282394,False,False
3,"[{'train_loss': 0.7073792815208435, 'train_bat...",1,2.640779,0.707379,False,False
4,"[{'train_loss': 1.2360115051269531, 'train_bat...",1,2.503999,1.236012,False,False
5,"[{'train_loss': 1.136793851852417, 'train_batc...",1,2.490753,1.136794,False,False
6,"[{'train_loss': 0.7182198166847229, 'train_bat...",1,2.638225,0.71822,False,False
7,"[{'train_loss': 0.741176426410675, 'train_batc...",1,2.46818,0.741176,False,False
8,"[{'train_loss': 0.7419273257255554, 'train_bat...",1,2.662333,0.741927,False,False
9,"[{'train_loss': 0.689807653427124, 'train_batc...",1,2.481492,0.689808,True,True
10,"[{'train_loss': 0.7376768589019775, 'train_bat...",1,2.546618,0.737677,False,False


In [18]:
metrics_data_frame = pd.DataFrame([{
  'accuracy': accuracy,
  'recall': recall,
  'precision': precision,
  'f1': f1,
}])
metrics_data_frame.to_csv(f'{MODEL_FOLDER}/lstm_metrics.csv')