In [1]:
import joblib
from src import Utils, EnsembleFunctions
import pandas as pd
import numpy as np

In [2]:
import torch
torch.cuda.empty_cache()
torch.cuda.is_available = lambda: False

In [3]:
Utils.seed_random_number_generators()

Random number generators seeded.


In [4]:
def load_model(model_name: str):
  try:
    return joblib.load(model_name)
  except FileNotFoundError:
    print("ERROR: Model not found")
    return None

learner_bayes = load_model('model_bayes/Bayes.pkl')
learner_lstm = load_model('model_lstm/LSTM.pkl')
learner_bert = load_model('model_bert/mBERT.pkl')

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
learner_bayes

In [6]:
learner_lstm

In [7]:
learner_bert

In [8]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression()

logistic_regression

In [10]:
dataset = Utils.read_csv_file('datasets/datasetall.csv')

CSV file read successfully!


In [11]:
TEST_SIZE = 0.1

X_train, X_test, y_train, y_test = Utils.get_train_test_split(dataset, TEST_SIZE)

In [12]:
X_train

0        pag hindi nanalo si Norberto Gonzales pwede ba...
1        Ngayon lang ako super proud sa PRESIDENTE na i...
2        JUST SAW SOMEONE CALL BBM BLENGBLONG HAHAHAHAH...
3        Rep. Binay on her leadership style: I am very ...
4        Liwanag o dilim? May oras pa. Kakampink Leni L...
                               ...                        
25611    "Kala ko wala andito pala si Marcos."*pertaini...
25612    cathy [USERNAME] Dec [USERNAME] parang tanga i...
25613                             Nognog+pandak= BINAY ftw
25614    BINAY:Did your enormous wealth all come from y...
25615                                Uunlad tayo kay Binay
Name: text, Length: 25616, dtype: object

In [None]:
y_train

In [None]:
from src import EnsembleFunctions

predictions = EnsembleFunctions.concatenate_predict_proba(
  learner_bayes,
  learner_lstm,
  learner_bert,
  X_train,
)

predictions

In [35]:
from sklearn.ensemble import StackingClassifier

ensemble = StackingClassifier(
    estimators=[
        ('bayes', learner_bayes),
        ('lstm', learner_lstm),
        ('bert', learner_bert),
    ],
    final_estimator=logistic_regression,
    cv='prefit',
)

ensemble

In [9]:
dataset = Utils.read_csv_file('datasets/datasetall.csv')

dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [None]:
ensemble.fit(X_train, y_train)

In [29]:
soft_voting_results_discrete = np.argmax(soft_voting_results, axis=1)

soft_voting_results_discrete

array([1, 1, 0, ..., 1, 1, 0])

In [27]:
pd.DataFrame([{
  'accuracy': accuracy_score(soft_voting_results_discrete, y_test),
  'recall': recall_score(soft_voting_results_discrete, y_test),
  'precision': precision_score(soft_voting_results_discrete, y_test),
  'f1': f1_score(soft_voting_results_discrete, y_test),
}])

Unnamed: 0,accuracy,recall,precision,f1
0,0.846397,0.821821,0.887727,0.853503


In [31]:
stacking_results = EnsembleFunctions.stacking(
  learner_bayes,
  learner_lstm,
  learner_bert,
  learner_lr,
  X_test
)

stacking_results

array([[0.0935605 , 0.9064395 ],
       [0.07668313, 0.92331687],
       [0.98238279, 0.01761721],
       ...,
       [0.05485927, 0.94514073],
       [0.05103099, 0.94896901],
       [0.96259484, 0.03740516]])

In [32]:
stacking_results_discrete = np.argmax(stacking_results, axis=1)

stacking_results_discrete

array([1, 1, 0, ..., 1, 1, 0])

In [33]:
pd.DataFrame([{
  'accuracy': accuracy_score(stacking_results_discrete, y_test),
  'recall': recall_score(stacking_results_discrete, y_test),
  'precision': precision_score(stacking_results_discrete, y_test),
  'f1': f1_score(stacking_results_discrete, y_test),
}])

Unnamed: 0,accuracy,recall,precision,f1
0,0.849912,0.846524,0.857741,0.852096
