In [20]:
from src import Bayes, LSTM, BERT
import pandas as pd
import numpy as np
from torchinfo import summary
import calamancy
from torch import nn, optim
# pd.set_option("display.max_rows", None, "display.max_columns", None)

## Loading Dataset

In [2]:
def read_csv_file(filename: str) -> pd.DataFrame:
    try:
        data = pd.read_csv(filename, lineterminator='\n', usecols=range(2))
        print("CSV file read successfully!")
        return data
    except FileNotFoundError:
        print("ERROR: File not found")
        exit(1)

# Demonstrate reading a CSV file (use a sample or mock filename)
dataset = read_csv_file('datasets/datasetall.csv')
dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


## Bernoulli Naive Bayes

In [3]:
Bayes.BayesPipeline

In [4]:
X_transformed = Bayes.Vectorizer.fit_transform(dataset['text'])
X_transformed.shape

(28461, 45495)

In [5]:
print(X_transformed)

  (0, 43379)	0.13178689068661975
  (0, 31443)	0.5388134203695595
  (0, 23610)	0.5185653725531368
  (0, 18121)	0.44697921344060404
  (0, 32276)	0.4496542080347992
  (0, 4462)	0.14636578770413514
  (1, 28512)	0.21288095794996578
  (1, 31550)	0.2769724439397788
  (1, 20759)	0.29877148732767217
  (1, 19296)	0.16419572324698758
  (1, 20909)	0.23668192928561657
  (1, 21115)	0.29677534636353303
  (1, 5655)	0.22354992040181504
  (1, 36545)	0.21890024270302047
  (1, 19926)	0.20591348758555314
  (1, 3314)	0.47616350586093165
  (1, 44376)	0.34343504175037554
  (1, 40788)	0.2461361603227233
  (1, 13180)	0.24969865122310905
  (1, 43379)	0.09871878450695029
  (2, 6005)	0.2665342489104476
  (2, 29363)	0.1351851916353869
  (2, 12428)	0.20527267264361812
  (2, 42381)	0.2686854388647266
  (2, 8459)	0.22228736405145857
  :	:
  (28459, 20259)	0.18278553642286766
  (28459, 18150)	0.22172511696327096
  (28459, 10001)	0.20122631198962063
  (28459, 943)	0.1977341288289347
  (28459, 38994)	0.1520373289623922
 

In [6]:
pd.DataFrame(
    zip(Bayes.Vectorizer.get_feature_names_out(), Bayes.Vectorizer.idf_),
    columns=['feature', 'IDF'],
)

Unnamed: 0,feature,IDF
0,00,8.953740
1,000,9.310415
2,000php,10.563178
3,002,10.563178
4,004,10.563178
...,...,...
45490,zyx,10.563178
45491,zzaj,10.563178
45492,zzz,9.870031
45493,zzzz,10.563178


In [7]:
Bayes.BayesModel.fit(X_transformed, dataset['label'])

## LSTM

In [8]:
LSTM.LstmPipeline

In [9]:
summary(LSTM.LstmModel(), input_size=(1000, 200))

Layer (type:depth-idx)                   Output Shape              Param #
LstmModel                                [1000, 2]                 --
├─LSTM: 1-1                              [1000, 400]               2,246,400
├─Linear: 1-2                            [1000, 2]                 802
Total params: 2,247,202
Trainable params: 2,247,202
Non-trainable params: 0
Total mult-adds (G): 898.56
Input size (MB): 0.80
Forward/backward pass size (MB): 3.22
Params size (MB): 8.99
Estimated Total Size (MB): 13.00

### CalamanCy for tokenization and vectorization

In [10]:
Calamancy = calamancy.load("tl_calamancy_md-0.1.0")
Calamancy



<spacy.lang.tl.Tagalog at 0x7fbfab2ba520>

In [17]:
tokenized_text = [
    text for text in
    Calamancy.pipe(dataset['text'][:20])
]

pd.DataFrame(tokenized_text)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,Binay,:,Patuloy,ang,kahirapan,dahil,sa,maling,pamamahala,[,...,,,,,,,,,,
1,SA,GOBYERNONG,TAPAT,WELCOME,SA,BAGUO,ANG,LAHAT,!,Kulay,...,,,,,,,,,,
2,wait,so,ur,telling,me,Let,Leni,Lead,mo,pero,...,,,,,,,,,,
3,,[,USERNAME]wish,this,is,just,a,nightmare,that,could,...,,,,,,,,,,
4,doc,willie,ong,and,isko,sabunutan,po,,,,...,,,,,,,,,,
5,Jeonghan,said,Let,Leni,Lead,!,!,Lenihae,ahahhdjfj,,...,,,,,,,,,,
6,[,USERNAME,],[,USERNAME]and[USERNAME,],Hahahhahaha,SMNIpresidentialdebateLeni,Mas,Duwag,...,,,,,,,,,,
7,[,USERNAME,],Attendees,of,the,CNNPHVPDebate,are,Walden,Bello,...,,,,,,,,,,
8,Abscbn,News,Hoaxnews,ng,abias,cbend,baklas,poster,ng,mmda,...,,,,,,,,,,
9,NEWS,UPDATE,:,The,camp,of,VP,Leni,Robredo,says,...,is,unnecessary,.,'[USERNAME,],,,,,


In [19]:
calamancy_result = [
    text.vector.reshape(1, -1)
    for text 
    in tokenized_text
]

# Concatenate all of them to form tensors of the right
# output shape.
calamancy_result = np.concatenate(calamancy_result)

pd.DataFrame(calamancy_result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.079959,0.030824,0.023937,0.077056,-0.016278,-0.213464,-0.033583,-0.129466,-0.295509,-0.336769,...,0.063488,0.263103,0.012206,-0.029511,0.423898,-0.299389,-0.295374,-0.069985,0.036437,-0.282058
1,-0.0984,-0.002249,0.276727,0.05135,-0.031479,0.004027,0.136961,-0.325066,0.048407,-0.353043,...,0.059415,0.34863,-0.128219,0.041348,0.014189,-0.230255,0.006952,-0.169405,0.15369,-0.381741
2,0.036431,-0.158402,0.274495,0.142066,0.137893,-0.024042,0.365713,-0.213699,0.030511,-0.351944,...,0.057017,0.135505,-0.285796,0.095381,0.037737,-0.077047,-0.194688,-0.001906,0.014809,-0.055193
3,-0.020916,-0.041349,0.302016,0.177881,0.241347,-0.235771,0.204203,-0.336757,-0.018283,-0.492876,...,0.119083,0.039149,-0.086413,-0.021435,-0.049193,-0.058399,-0.15771,0.141402,-0.201925,-0.075786
4,-0.119516,0.196659,-0.198787,-0.077372,0.622887,-0.081074,0.503828,-0.049475,0.07605,-0.174211,...,0.419373,0.268442,-0.197731,0.094939,0.392143,-0.017123,-0.237679,0.04952,-0.020215,-0.104863
5,-0.029591,-0.24493,0.09766,0.02309,-0.122789,-0.055445,-0.089991,-0.373714,0.079334,-0.225347,...,0.234361,0.270526,-0.221429,0.279023,0.466654,0.074242,-0.063258,0.185144,-0.041592,-0.270013
6,-0.290399,-0.052767,0.052822,0.07329,0.064098,-0.028226,-0.213334,-0.432583,0.00668,-0.681629,...,-0.039767,0.452986,0.135093,0.158477,0.552461,-0.567329,-0.951275,-0.06395,0.257712,-0.375186
7,0.014622,0.166466,0.225856,-0.080627,0.253168,-0.072141,0.085718,-0.387084,-0.023231,-0.415185,...,-0.031626,0.320854,0.04147,-0.10703,-0.05927,-0.073103,-0.036401,0.20645,-0.137248,-0.066766
8,-0.086514,0.060733,0.041636,-0.081284,0.082741,-0.125557,0.385665,-0.090126,-0.218261,-0.278023,...,0.174407,0.053065,-0.171381,-0.046194,0.228741,0.066286,-0.055473,0.08317,-0.100637,-0.069718
9,0.176639,-0.097223,0.090801,-0.032142,0.196946,-0.002278,0.253257,-0.208191,-0.029428,-0.372567,...,0.121507,0.138231,-0.321459,0.004063,-0.005993,-0.124676,-0.146378,0.18169,-0.175482,-0.205904


### LSTM Layer

In [22]:
LSTM_INPUT_SIZE = 200
LSTM_HIDDEN_SIZE = 400
LSTM_OUTPUT_SIZE = 2
LSTM_NUM_LAYERS = 2

lstm_layer = nn.LSTM(
    LSTM_INPUT_SIZE,
    LSTM_HIDDEN_SIZE,
    LSTM_NUM_LAYERS,
    batch_first=True,
)

lstm_layer

LSTM(200, 400, num_layers=2, batch_first=True)

In [24]:
lstm_output, lstm_hidden = lstm_layer(calamancy_result)

AttributeError: 'numpy.ndarray' object has no attribute 'dim'

### Linear Layer

In [23]:
linear_layer = nn.Linear(LSTM_HIDDEN_SIZE, LSTM_OUTPUT_SIZE)

linear_layer

Linear(in_features=400, out_features=2, bias=True)