##                  Ensemble with Transformers

**Connect to Google Drive**

In [2]:
#Connect to Drive
from google.colab import drive
import sys

# Mount Google Drive at /content/drive; the CSV paths read later in this notebook
# are built under '/content/drive/My Drive/'.
# NOTE(review): `sys` is not used in the visible cells - confirm before removing.
drive.mount('/content/drive')

Mounted at /content/drive


**Load Data**

In [5]:
#Open Dataset
import pandas as pd

# Folder on the mounted Drive that holds one CSV per task. Each CSV is later passed to
# LR/RFC/LinReg, which expect a 'target' column and treat every other column as a feature.
abspath_curr = '/content/drive/My Drive/NLP_Final_Exam/'

# NOTE(review): the CoLA file name has no 'NLP - Ensemble Data - ' prefix, unlike the
# other tasks - confirm this matches the file on Drive.
cola = pd.read_csv(abspath_curr + 'combined_cola.csv',header=0)
sst2 = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_sst2.csv',header=0)
# WNLI used to be read twice from the same file; a single read is enough.
wnli = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_wnli.csv',header=0)
rte = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_rte.csv',header=0)
qnli = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_qnli.csv',header=0)
mrpc = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_mrpc.csv',header=0)
stsb = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_stsb.csv',header=0)
qqp = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_qqp.csv',header=0)
mnli = pd.read_csv(abspath_curr + 'NLP - Ensemble Data - combined_mnli.csv',header=0)

# Quick sanity check of the column layout (per-model outputs followed by 'target').
print(cola.head())



   electra-0  electra-1   xlnet-0   xlnet-1    bert-0    bert-1  target
0  -3.015364   2.818316 -2.792709  3.361839 -2.782302  3.292792       1
1  -3.272816   3.013487 -3.019989  3.219829 -2.536592  3.144536       1
2  -2.484975   2.380282 -1.700235  1.687799 -2.348618  2.650339       1
3  -3.138503   2.965038 -2.994277  3.023010 -2.814234  3.324904       1
4  -2.365048   2.197786 -2.657328  2.292092  2.783062 -3.252907       0


**Ensemble Functions (Logistic Regression, RandomForestClassifier, and Linear Regression)**

In [6]:
#Import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression

#Logistic regression function to train and make predictions
def LR(data, test_size=0.20, split_seed=2000):
  """Fit a logistic-regression ensemble on a train split and predict the held-out split.

  Parameters
  ----------
  data : DataFrame
      Must contain a 'target' column; every other column is used as a feature.
  test_size : float, default 0.20
      Fraction of rows held out for evaluation (passed to train_test_split).
  split_seed : int, default 2000
      random_state for train_test_split. The defaults reproduce the split that was
      previously hard-coded in this function.

  Returns
  -------
  (pred, y_test)
      Predicted labels for the held-out rows and the matching true labels.
  """
  features = data.drop('target', axis=1)
  labels = data['target']
  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=test_size, random_state=split_seed)

  # The model seed stays fixed so repeated runs give the same fit.
  model = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)
  pred = model.predict(X_test)
  return pred, y_test

#RandomForestClassifier function to train and make predictions
def RFC(data, test_size=0.20, split_seed=2000):
  """Fit a random-forest ensemble on a train split and predict the held-out split.

  Parameters
  ----------
  data : DataFrame
      Must contain a 'target' column; every other column is used as a feature.
  test_size : float, default 0.20
      Fraction of rows held out for evaluation (passed to train_test_split).
  split_seed : int, default 2000
      random_state for train_test_split. The defaults reproduce the split that was
      previously hard-coded in this function.

  Returns
  -------
  (pred_clf, y_test)
      Predicted labels for the held-out rows and the matching true labels.
  """
  features = data.drop('target', axis=1)
  labels = data['target']
  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=test_size, random_state=split_seed)

  # Shallow trees (max_depth=2) and a fixed forest seed keep the fit reproducible.
  clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
  pred_clf = clf.predict(X_test)
  return pred_clf, y_test


#Linear regression function to train and make predictions (used for the STSB similarity scores)
def LinReg(data, test_size=0.20, split_seed=2000):
  """Fit a linear-regression ensemble on a train split and predict the held-out split.

  Parameters
  ----------
  data : DataFrame
      Must contain a 'target' column; every other column is used as a feature.
  test_size : float, default 0.20
      Fraction of rows held out for evaluation (passed to train_test_split).
  split_seed : int, default 2000
      random_state for train_test_split. The defaults reproduce the split that was
      previously hard-coded in this function.

  Returns
  -------
  (pred, y_test)
      Predicted values for the held-out rows and the matching true targets.
  """
  features = data.drop('target', axis=1)
  labels = data['target']
  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=test_size, random_state=split_seed)

  model = LinearRegression().fit(X_train, y_train)
  pred = model.predict(X_test)
  return pred, y_test


**Ensemble for CoLA (Corpus of Linguistic Acceptability) using:**

Metric - Matthews Correlation Coefficient

*   ELECTRA (google/electra discriminator) - Metric: 0.67738
*   XLNet (xlnet-base-cased) - Metric: 0.4676
*   BERT (bert-base-cased) - Metric: 0.5955

*Ensemble Metric:* 0.7404







In [7]:
# CoLA: train the Logistic Regression ensemble, then show the per-class report
lr_cola_pred, lr_cola_y_test = LR(cola)
lr_cola_report = classification_report(y_true=lr_cola_y_test, y_pred=lr_cola_pred)
print(lr_cola_report)

              precision    recall  f1-score   support

           0       0.82      0.78      0.80        60
           1       0.91      0.93      0.92       149

    accuracy                           0.89       209
   macro avg       0.87      0.86      0.86       209
weighted avg       0.89      0.89      0.89       209



In [8]:
#CoLA Matthews Correlation Coefficient using Logistic Regression Predictions
# Arguments follow scikit-learn's (y_true, y_pred) order, like the accuracy/F1 calls in
# this notebook. MCC is symmetric in its two arguments, so the value is unchanged.
lr_cola_mcc = matthews_corrcoef(lr_cola_y_test, lr_cola_pred)
print(f"Using Logistic Regression Ensemble: \n\nMatthew's Correlation Coefficient: {lr_cola_mcc}")

Using Logistic Regression Ensemble: 

Matthew's Correlation Coefficient: 0.7275377298784709


In [9]:
# CoLA: train the RandomForestClassifier ensemble, then show the per-class report
rfc_cola_pred, rfc_cola_y_test = RFC(cola)
rfc_cola_report = classification_report(y_true=rfc_cola_y_test, y_pred=rfc_cola_pred)
print(rfc_cola_report)

              precision    recall  f1-score   support

           0       0.83      0.80      0.81        60
           1       0.92      0.93      0.93       149

    accuracy                           0.89       209
   macro avg       0.87      0.87      0.87       209
weighted avg       0.89      0.89      0.89       209



In [10]:
#CoLA Matthews Correlation Coefficient using RandomForestClassifier Predictions
# Arguments follow scikit-learn's (y_true, y_pred) order, like the accuracy/F1 calls in
# this notebook. MCC is symmetric in its two arguments, so the value is unchanged.
rfc_cola_mcc = matthews_corrcoef(rfc_cola_y_test, rfc_cola_pred)
print(f"Using RandomForestClassifer Ensemble: \n\nMatthew's Correlation Coefficient: {rfc_cola_mcc}")

Using RandomForestClassifer Ensemble: 

Matthew's Correlation Coefficient: 0.7404618008434444


**Ensemble for SST-2 (Stanford Sentiment Treebank) using:**

Metric - Accuracy Score

*   ELECTRA (google/electra discriminator) - Metric: 0.9495
*   XLNet (xlnet-base-cased) - Metric: 0.9380
*   BERT (bert-base-cased) - Metric: 0.9220

*Ensemble Score:* 0.9485


In [11]:
# SST-2: train the Logistic Regression ensemble, then show the per-class report
lr_sst2_pred, lr_sst2_y_test = LR(sst2)
lr_sst2_report = classification_report(y_true=lr_sst2_y_test, y_pred=lr_sst2_pred)
print(lr_sst2_report)

              precision    recall  f1-score   support

           0       0.95      0.94      0.95        88
           1       0.94      0.95      0.95        87

    accuracy                           0.95       175
   macro avg       0.95      0.95      0.95       175
weighted avg       0.95      0.95      0.95       175



In [12]:
# SST-2: accuracy of the Logistic Regression ensemble on the held-out rows
lr_sst2_acc = accuracy_score(lr_sst2_y_test, lr_sst2_pred)
print(f"Using Logistic Regression Ensemble: \n\nAccuracy Score: {lr_sst2_acc}")

Using Logistic Regression Ensemble: 

Accuracy Score: 0.9485714285714286


In [13]:
# SST-2: train the RandomForestClassifier ensemble, then show the per-class report
rfc_sst2_pred, rfc_sst2_y_test = RFC(sst2)
rfc_sst2_report = classification_report(y_true=rfc_sst2_y_test, y_pred=rfc_sst2_pred)
print(rfc_sst2_report)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        88
           1       0.93      0.95      0.94        87

    accuracy                           0.94       175
   macro avg       0.94      0.94      0.94       175
weighted avg       0.94      0.94      0.94       175



In [14]:
# SST-2: accuracy of the RandomForestClassifier ensemble on the held-out rows
rfc_sst2_acc = accuracy_score(rfc_sst2_y_test, rfc_sst2_pred)
print(f"Using RandomForestClassifier Ensemble: \n\nAccuracy Score: {rfc_sst2_acc}")

Using RandomForestClassifier Ensemble: 

Accuracy Score: 0.9428571428571428


**Ensemble for MRPC (Microsoft Research Paraphrase Corpus) using:**

Metric - F1/Accuracy

*   ELECTRA (google/electra discriminator) - Metric: 0.9145/0.8823
*   XLNet (xlnet-base-cased) - Metric: 0.9135/0.8799
*   BERT (bert-base-cased) - Metric: 0.9015/0.8602

Ensemble Metric: 0.9491/0.9268

In [15]:
# MRPC: train the Logistic Regression ensemble, then show the per-class report
lr_mrpc_pred, lr_mrpc_y_test = LR(mrpc)
lr_mrpc_report = classification_report(y_true=lr_mrpc_y_test, y_pred=lr_mrpc_pred)
print(lr_mrpc_report)

              precision    recall  f1-score   support

           0       0.95      0.80      0.87        25
           1       0.92      0.98      0.95        57

    accuracy                           0.93        82
   macro avg       0.94      0.89      0.91        82
weighted avg       0.93      0.93      0.92        82



In [16]:
# MRPC: accuracy and F1 of the Logistic Regression ensemble on the held-out rows
lr_mrpc_acc = accuracy_score(y_true=lr_mrpc_y_test, y_pred=lr_mrpc_pred)
lr_mrpc_f1 = f1_score(y_true=lr_mrpc_y_test, y_pred=lr_mrpc_pred)
print(
    f"Using Logistic Regression Predictions: \n\nAccuracy Score: {lr_mrpc_acc} \nF1 Score: {lr_mrpc_f1}"
)

Using Logistic Regression Predictions: 

Accuracy Score: 0.926829268292683 
F1 Score: 0.9491525423728813


In [36]:
# MRPC: train the RandomForestClassifier ensemble, then show the per-class report
rfc_mrpc_pred, rfc_mrpc_y_test = RFC(mrpc)
rfc_mrpc_report = classification_report(y_true=rfc_mrpc_y_test, y_pred=rfc_mrpc_pred)
print(rfc_mrpc_report)

              precision    recall  f1-score   support

           0       0.95      0.80      0.87        25
           1       0.92      0.98      0.95        57

    accuracy                           0.93        82
   macro avg       0.94      0.89      0.91        82
weighted avg       0.93      0.93      0.92        82



In [18]:
# MRPC: accuracy and F1 of the RandomForestClassifier ensemble on the held-out rows
rfc_mrpc_acc = accuracy_score(y_true=rfc_mrpc_y_test, y_pred=rfc_mrpc_pred)
rfc_mrpc_f1_score = f1_score(y_true=rfc_mrpc_y_test, y_pred=rfc_mrpc_pred)
print(
    f"Using RandomForestClassifier Predictions: \n\nAccuracy Score: {rfc_mrpc_acc} \nF1 Score: {rfc_mrpc_f1_score}"
)

Using RandomForestClassifier Predictions: 

Accuracy Score: 0.926829268292683 
F1 Score: 0.9491525423728813


**Ensemble for STSB (The Semantic Textual Similarity Benchmark) using:**

Metric - Pearson/Spearman Correlation Coefficient

*   ELECTRA (google/electra discriminator) - Metric: 0.9074/0.9057
*   XLNet (xlnet-base-cased) - Metric: 0.8855/0.8834
*   BERT (bert-base-cased) - Metric: 0.8519/0.8513

Ensemble Metric: 0.9094/0.9057

In [19]:
from scipy import stats

#STSB Spearman rank correlation between the Linear Regression ensemble predictions
#and the held-out targets (the Pearson correlation is computed in a separate cell).
linreg_stsb_pred, linreg_stsb_y_test = LinReg(stsb)
stats.spearmanr(linreg_stsb_pred, linreg_stsb_y_test)

SpearmanrResult(correlation=0.9057440226841488, pvalue=4.0624901046367654e-113)

In [37]:
#STSB Pearson Correlation Coefficient using Linear Regression
# Reuses linreg_stsb_pred / linreg_stsb_y_test from the Spearman cell above: LinReg uses a
# fixed split seed, so refitting here would only repeat the same computation.
stats.pearsonr(linreg_stsb_pred, linreg_stsb_y_test)

(0.9094435738640109, 1.3850936154113772e-115)

**Ensemble for QQP (Quora Question Pairs) using:**

Metric - Accuracy/F1

*   ELECTRA (google/electra discriminator) - Metric: 0.9179/0.8906
*   XLNet (xlnet-base-cased) - Metric: 0.9099/0.8793
*   BERT (bert-base-cased) - Metric: 0.908/0.8758

Ensemble Metric: 0.9241/0.9011

In [20]:
# QQP: train the Logistic Regression ensemble, then show the per-class report
lr_qqp_pred, lr_qqp_y_test = LR(qqp)
lr_qqp_report = classification_report(y_true=lr_qqp_y_test, y_pred=lr_qqp_pred)
print(lr_qqp_report)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      5013
           1       0.89      0.91      0.90      3073

    accuracy                           0.92      8086
   macro avg       0.92      0.92      0.92      8086
weighted avg       0.92      0.92      0.92      8086



In [21]:
# QQP: accuracy and F1 of the Logistic Regression ensemble on the held-out rows
lr_qqp_acc = accuracy_score(y_true=lr_qqp_y_test, y_pred=lr_qqp_pred)
lr_qqp_f1 = f1_score(y_true=lr_qqp_y_test, y_pred=lr_qqp_pred)
print(
    f"Using Logistic Regression Predictions: \n\nAccuracy Score: {lr_qqp_acc} \nF1 Score: {lr_qqp_f1}"
)

Using Logistic Regression Predictions: 

Accuracy Score: 0.9241899579520159 
F1 Score: 0.9011449766166747


In [38]:
# QQP: train the RandomForestClassifier ensemble, then show the per-class report
rfc_qqp_pred, rfc_qqp_y_test = RFC(qqp)
rfc_qqp_report = classification_report(y_true=rfc_qqp_y_test, y_pred=rfc_qqp_pred)
print(rfc_qqp_report)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      5013
           1       0.89      0.91      0.90      3073

    accuracy                           0.92      8086
   macro avg       0.92      0.92      0.92      8086
weighted avg       0.92      0.92      0.92      8086



In [39]:
# QQP: accuracy and F1 of the RandomForestClassifier ensemble on the held-out rows
rfc_qqp_acc = accuracy_score(y_true=rfc_qqp_y_test, y_pred=rfc_qqp_pred)
rfc_qqp_f1_score = f1_score(y_true=rfc_qqp_y_test, y_pred=rfc_qqp_pred)
print(
    f"Using RandomForestClassifier Predictions: \n\nAccuracy Score: {rfc_qqp_acc} \nF1 Score: {rfc_qqp_f1_score}"
)

Using RandomForestClassifier Predictions: 

Accuracy Score: 0.9222112292851843 
F1 Score: 0.8989882768588405


**Ensemble for MNLI (The Multi-Genre Natural Language Inference Corpus) using:**

Metric - Accuracy Score

*   ELECTRA (google/electra discriminator) - Metric: 0.8867
*   XLNet (xlnet-base-cased) - Metric: 0.8646
*   BERT (bert-base-cased) - Metric: 0.8436

Ensemble Metric: 0.8922

In [31]:
# MNLI: train the Logistic Regression ensemble, then show the per-class report
lr_mnli_pred, lr_mnli_y_test = LR(mnli)
lr_mnli_report = classification_report(y_true=lr_mnli_y_test, y_pred=lr_mnli_pred)
print(lr_mnli_report)

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       715
           1       0.85      0.83      0.84       624
           2       0.91      0.92      0.91       628

    accuracy                           0.89      1967
   macro avg       0.89      0.89      0.89      1967
weighted avg       0.89      0.89      0.89      1967



In [32]:
# MNLI: accuracy of the Logistic Regression ensemble on the held-out rows
lr_mnli_acc = accuracy_score(y_true=lr_mnli_y_test, y_pred=lr_mnli_pred)
print(
    f"Logistic Regression Ensemble: \n\nAccuracy Score: {lr_mnli_acc}"
)

Logistic Regression Ensemble: 

Accuracy Score: 0.8861209964412812


In [33]:
# MNLI: train the RandomForestClassifier ensemble, then show the per-class report
rfc_mnli_pred, rfc_mnli_y_test = RFC(mnli)
rfc_mnli_report = classification_report(y_true=rfc_mnli_y_test, y_pred=rfc_mnli_pred)
print(rfc_mnli_report)

              precision    recall  f1-score   support

           0       0.91      0.90      0.90       715
           1       0.85      0.86      0.85       624
           2       0.92      0.92      0.92       628

    accuracy                           0.89      1967
   macro avg       0.89      0.89      0.89      1967
weighted avg       0.89      0.89      0.89      1967



In [34]:
# MNLI: accuracy of the RandomForestClassifier ensemble on the held-out rows
rfc_mnli_acc = accuracy_score(y_true=rfc_mnli_y_test, y_pred=rfc_mnli_pred)
print(
    f"Random Forest Classification Ensemble: \n\nAccuracy Score: {rfc_mnli_acc}"
)

Random Forest Classification Ensemble: 

Accuracy Score: 0.8922216573462125


**Ensemble for QNLI (Question-answering NLI, derived from the Stanford Question Answering Dataset) using:**

Metric - Accuracy Score

*   ELECTRA (google/electra discriminator) - Score: 0.9303
*   XLNet (xlnet-base-cased) - Score: 0.9202
*   BERT (bert-base-cased) - Score: 0.9044

Ensemble Metric: 0.9396

In [26]:
# QNLI: train the Logistic Regression ensemble, then show the per-class report
lr_qnli_pred, lr_qnli_y_test = LR(qnli)
lr_qnli_report = classification_report(y_true=lr_qnli_y_test, y_pred=lr_qnli_pred)
print(lr_qnli_report)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       529
           1       0.95      0.93      0.94       564

    accuracy                           0.94      1093
   macro avg       0.94      0.94      0.94      1093
weighted avg       0.94      0.94      0.94      1093



In [27]:
# QNLI: accuracy of the Logistic Regression ensemble on the held-out rows
lr_qnli_acc = accuracy_score(y_true=lr_qnli_y_test, y_pred=lr_qnli_pred)
print(
    f"Logistic Regression Ensemble: \n\nAccuracy Score: {lr_qnli_acc}"
)

Logistic Regression Ensemble: 

Accuracy Score: 0.938700823421775


In [28]:
# QNLI: train the RandomForestClassifier ensemble, then show the per-class report
rfc_qnli_pred, rfc_qnli_y_test = RFC(qnli)
rfc_qnli_report = classification_report(y_true=rfc_qnli_y_test, y_pred=rfc_qnli_pred)
print(rfc_qnli_report)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       529
           1       0.95      0.93      0.94       564

    accuracy                           0.94      1093
   macro avg       0.94      0.94      0.94      1093
weighted avg       0.94      0.94      0.94      1093



In [29]:
# QNLI: accuracy of the RandomForestClassifier ensemble on the held-out rows
rfc_qnli_acc = accuracy_score(y_true=rfc_qnli_y_test, y_pred=rfc_qnli_pred)
print(
    f"Random Forest Classification Ensemble: \n\nAccuracy Score: {rfc_qnli_acc}"
)

Random Forest Classification Ensemble: 

Accuracy Score: 0.939615736505032


**Ensemble for RTE (Recognizing Textual Entailment) using:**

Metric - Accuracy Score

*   ELECTRA (google/electra discriminator) - Metric: 0.8122
*   XLNet (xlnet-base-cased) - Metric: 0.6642
*   BERT (bert-base-cased) - Metric: 0.5667

Ensemble Metric: 0.8392

In [None]:
# RTE: train the Logistic Regression ensemble, then show the per-class report
lr_rte_pred, lr_rte_y_test = LR(rte)
lr_rte_report = classification_report(y_true=lr_rte_y_test, y_pred=lr_rte_pred)
print(lr_rte_report)

              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       0.84      0.87      0.86        31

    accuracy                           0.84        56
   macro avg       0.84      0.84      0.84        56
weighted avg       0.84      0.84      0.84        56



In [None]:
# RTE: accuracy of the Logistic Regression ensemble on the held-out rows
lr_rte_acc = accuracy_score(lr_rte_y_test, lr_rte_pred)
print(f"Logistic Regression Ensemble: \n\nAccuracy Score: {lr_rte_acc} ")

Logistic Regression Ensemble: 

Accuracy Score: 0.8392857142857143 


In [None]:
# RTE: train the RandomForestClassifier ensemble, then show the per-class report
rfc_rte_pred, rfc_rte_y_test = RFC(rte)
rfc_rte_report = classification_report(y_true=rfc_rte_y_test, y_pred=rfc_rte_pred)
print(rfc_rte_report)

              precision    recall  f1-score   support

           0       0.81      0.84      0.82        25
           1       0.87      0.84      0.85        31

    accuracy                           0.84        56
   macro avg       0.84      0.84      0.84        56
weighted avg       0.84      0.84      0.84        56



In [None]:
# RTE: accuracy of the RandomForestClassifier ensemble on the held-out rows
rfc_rte_acc = accuracy_score(rfc_rte_y_test, rfc_rte_pred)
print(f"Random Forest Classifier Ensemble: \n\nAccuracy Score: {rfc_rte_acc} ")

Random Forest Classifier Ensemble: 

Accuracy Score: 0.8392857142857143 


**Ensemble for WNLI (Winograd NLI, derived from the Winograd Schema Challenge) using:**

Metric - Accuracy Score

*   ELECTRA (google/electra discriminator) - Metric: 0.56338
*   XLNet (xlnet-base-cased) - Metric: 0.5352
*   BERT (bert-base-cased) - Metric: 0.56338

Ensemble Metric: 0.4666

In [40]:
#WNLI Classification Report using Logistic Regression
lr_wnli_pred, lr_wnli_y_test = LR(wnli)
# On this small held-out split the model never predicts class 1, so that class's
# precision is undefined. zero_division=0 reports it as 0.0 explicitly instead of
# emitting an UndefinedMetricWarning for each averaged metric.
print(classification_report(lr_wnli_y_test, lr_wnli_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.47      1.00      0.64         7
           1       0.00      0.00      0.00         8

    accuracy                           0.47        15
   macro avg       0.23      0.50      0.32        15
weighted avg       0.22      0.47      0.30        15



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# WNLI: accuracy of the Logistic Regression ensemble on the held-out rows
wnli_lr_accuracy = accuracy_score(y_true=lr_wnli_y_test, y_pred=lr_wnli_pred)
print(
    f"Accuracy Score: {wnli_lr_accuracy}"
)

Accuracy Score: 0.4666666666666667


In [None]:
# WNLI: train the RandomForestClassifier ensemble, then show the per-class report
rfc_wnli_pred, rfc_wnli_y_test = RFC(wnli)
rfc_wnli_report = classification_report(y_true=rfc_wnli_y_test, y_pred=rfc_wnli_pred)
print(rfc_wnli_report)

              precision    recall  f1-score   support

           0       0.46      0.86      0.60         7
           1       0.50      0.12      0.20         8

    accuracy                           0.47        15
   macro avg       0.48      0.49      0.40        15
weighted avg       0.48      0.47      0.39        15



In [None]:
# WNLI: accuracy of the RandomForestClassifier ensemble on the held-out rows
wnli_rfc_accuracy = accuracy_score(y_true=rfc_wnli_y_test, y_pred=rfc_wnli_pred)
print(
    f"Accuracy Score: {wnli_rfc_accuracy}"
)

Accuracy Score: 0.4666666666666667
