# Simple Ensembling

In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [4]:
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from transformers import DistilBertForSequenceClassification, XLNetForSequenceClassification, DistilBertTokenizer, XLNetTokenizer
import torch
import numpy as np
import pandas as pd

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
# Load pretrained models
distilbert_model = DistilBertForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/fake_news/FakeNewsNet_DS/Models/DistilBERT/gossicop_nlr_7a")
xlnet_model = XLNetForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/fake_news/FakeNewsNet_DS/Models/XLNet/gossicop_nlr_7a")


In [7]:
# Tokenizers
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [8]:
# Load dataset
df = pd.read_csv("/content/gdrive/MyDrive/fake_news/FakeNewsNet_DS/gossipcop_cleaned.csv")
df['SECTION_CLEANED'] = df['SECTION_CLEANED'].astype(str)

# Randomly select  % of the data
df_data = df.sample(frac=1.0, random_state=42)


# Split the dataset into training, validation, and test sets
# First, split into training and temp sets (80% training, 20% temp)
train_df, temp_df = train_test_split(df_data, test_size=0.20, random_state=42)

# Then, split the temp set into validation and test sets (50% validation, 50% test)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Optionally,reset the index of the DataFrames
train_df = train_df.reset_index(drop=True)
#validation_df = validation_df1.sample(frac=0.50, random_state=42)

validation_df = validation_df.reset_index(drop=True)

#test_df = test_df1.sample(frac=0.50, random_state=42)
test_df = test_df.reset_index(drop=True)

In [9]:
y_val = list(validation_df['label'])
len(y_val)

2214

In [10]:
X_val = list(validation_df['SECTION_CLEANED'])

In [11]:
# Prepare input data
def preprocess_text(text, tokenizer):
    input_data = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    return input_data


In [12]:
X_val_distilbert = [preprocess_text(text, distilbert_tokenizer) for text in X_val]
X_val_xlnet = [preprocess_text(text, xlnet_tokenizer) for text in X_val]


In [13]:
X_val_distilbert

[{'input_ids': tensor([[  101,  9933,  1116,  1894, 10797,  1504,  3969,  1436, 18380,  1174,
           2851,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[  101, 10822,  4108, 21784,  6750, 13003,  1116,  1731,  1681,  4108,
          21784, 18149,  1174, 12120, 12198,  2093,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[  101,  7277,  1348,  1667,   140,  2858, 18066,  7098,  2247, 23833,
            102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[  101,   155,  3048,  2346,  2064,  3048,  2537,  7156,  9450, 20164,
          22309, 24976,  1116,  3929,  1258,  3341,   139, 15243, 18480,  4302,
          20777, 17510,  1116,  3921,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1]])},
 {'input_ids': tensor([[  101, 19834,   157,  8167, 13448,

In [14]:
X_val_xlnet

[{'input_ids': tensor([[16510,    23,  1170,  7998,  3349,  3063,   252, 14215,    68,  1795,
              4,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[17097,  1292, 12979,  5000, 14004,    23,  1134,   954,  1292, 12979,
           7664,  1717,  7589,  1138,     4,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[24993,   984, 31947,  5189,  1148, 31224,     4,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[   17, 25951, 12190,   874,  2688, 11762, 18154, 12216,  6257,    17,
             83, 10254,    23,  3606,   320,  2116, 20626,  4225,   117,  7245,
            741,  4288,    23,  4442,     4,     3]]), 'token_type_ids': t

In [15]:
# Inference
def get_predictions(model, input_data):
    with torch.no_grad():
        output = model(**input_data)
    logits = output.logits
    probabilities = torch.softmax(logits, dim=1)
    return probabilities


In [16]:
distilbert_predictions = [get_predictions(distilbert_model, data) for data in X_val_distilbert]
xlnet_predictions = [get_predictions(xlnet_model, data) for data in X_val_xlnet]


In [17]:
distilbert_predictions

[tensor([[0.0020, 0.9980]]),
 tensor([[0.0015, 0.9985]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.0018, 0.9982]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.0018, 0.9982]]),
 tensor([[0.0837, 0.9163]]),
 tensor([[0.0015, 0.9985]]),
 tensor([[0.0015, 0.9985]]),
 tensor([[0.9951, 0.0049]]),
 tensor([[0.9795, 0.0205]]),
 tensor([[0.0115, 0.9885]]),
 tensor([[0.0015, 0.9985]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.9879, 0.0121]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.0018, 0.9982]]),
 tensor([[0.0028, 0.9972]]),
 tensor([[0.0022, 0.9978]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.0017, 0.9983]]),
 tensor([[0.9966, 0.0034]]),
 tensor([[0.0063, 0.9937]]),
 tensor([[0.0249, 0.9751]]),
 tensor([[0.0017, 0.9983]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.0027, 0.9973]]),
 tensor([[0.0023, 0.9977]]),
 tensor([[0.9921, 0.0079]]),
 tensor([[0.0016, 0.9984]]),
 tensor([[0.0071, 0.9929]]),
 tensor([[0.0018, 0.9982]]),
 tensor([[0.9962, 0.0038]]),
 tensor([[0.00

In [18]:
xlnet_predictions

[tensor([[0.0168, 0.9832]]),
 tensor([[0.0170, 0.9830]]),
 tensor([[0.0180, 0.9820]]),
 tensor([[0.0170, 0.9830]]),
 tensor([[0.9383, 0.0617]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0168, 0.9832]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.9384, 0.0616]]),
 tensor([[0.9225, 0.0775]]),
 tensor([[0.0178, 0.9822]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.9377, 0.0623]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0181, 0.9819]]),
 tensor([[0.0178, 0.9822]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.9381, 0.0619]]),
 tensor([[0.0466, 0.9534]]),
 tensor([[0.0208, 0.9792]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.8779, 0.1221]]),
 tensor([[0.7520, 0.2480]]),
 tensor([[0.9046, 0.0954]]),
 tensor([[0.0192, 0.9808]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.0169, 0.9831]]),
 tensor([[0.9380, 0.0620]]),
 tensor([[0.01

In [19]:
def ensemble_predictions(predictions1, predictions2):
    ensemble_probs = (predictions1 + predictions2) / 2.0  # Simple average ensemble
    return ensemble_probs

ensembled_probabilities = [ensemble_predictions(distilbert_probs, xlnet_probs) for distilbert_probs, xlnet_probs in zip(distilbert_predictions, xlnet_predictions)]


In [20]:
final_predictions = [torch.argmax(ensemble_probs, dim=1).item() for ensemble_probs in ensembled_probabilities]

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_val, final_predictions)
precision = precision_score(y_val, final_predictions)
recall = recall_score(y_val, final_predictions)
f1 = f1_score(y_val, final_predictions)

In [22]:
print(f"Accuracy: {accuracy*100:.4f}")
print(f"Precision: {precision*100:.4f}")
print(f"Recall: {recall*100:.4f}")
print(f"F1 Score: {f1*100:.4f}")

Accuracy: 84.7787
Precision: 87.6767
Recall: 92.8743
F1 Score: 90.2006


In [23]:
import numpy as np

def majority_vote(predictions1, predictions2):
    predictions1 = [np.argmax(probs, axis=1) for probs in predictions1]
    predictions2 = [np.argmax(probs, axis=1) for probs in predictions2]
    ensemble_predictions = [int(np.median([p1, p2])) for p1, p2 in zip(predictions1, predictions2)]
    return ensemble_predictions


In [24]:
# Convert PyTorch tensors to NumPy arrays
distilbert_predictions_np = [probs.numpy() for probs in distilbert_predictions]
xlnet_predictions_np = [probs.numpy() for probs in xlnet_predictions]


In [25]:
# Perform majority vote ensembling
ensemble_predictions = majority_vote(distilbert_predictions_np, xlnet_predictions_np)

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_val, ensemble_predictions)
precision = precision_score(y_val, ensemble_predictions)
recall = recall_score(y_val, ensemble_predictions)
f1 = f1_score(y_val, ensemble_predictions)

In [27]:
print(f"Accuracy: {accuracy*100:.4f}")
print(f"Precision: {precision*100:.4f}")
print(f"Recall: {recall*100:.4f}")
print(f"F1 Score: {f1*100:.4f}")

Accuracy: 83.9657
Precision: 90.3622
Recall: 88.1437
F1 Score: 89.2392


In [28]:
def weighted_ensemble(predictions1, predictions2, weight_model1, weight_model2):
    predictions1 = [np.argmax(probs, axis=1) for probs in predictions1]
    predictions2 = [np.argmax(probs, axis=1) for probs in predictions2]
    ensemble_probs = [int((weight_model1 * p1 + weight_model2 * p2) / (weight_model1 + weight_model2)) for p1, p2 in zip(predictions1, predictions2)]
    return ensemble_probs


In [29]:
# Define the weights for each model
weight_model1 = 0.6
weight_model2 = 0.4


In [30]:
# Perform  weight ensembling
ensemble_predictions_weight = weighted_ensemble(distilbert_predictions_np, xlnet_predictions_np,weight_model1,weight_model2)

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_val, ensemble_predictions_weight)
precision = precision_score(y_val, ensemble_predictions_weight)
recall = recall_score(y_val, ensemble_predictions_weight)
f1 = f1_score(y_val, ensemble_predictions_weight)

In [32]:
print(f"Accuracy: {accuracy*100:.4f}")
print(f"Precision: {precision*100:.4f}")
print(f"Recall: {recall*100:.4f}")
print(f"F1 Score: {f1*100:.4f}")

Accuracy: 83.9657
Precision: 90.3622
Recall: 88.1437
F1 Score: 89.2392


In [33]:
# Define the weights for each model
weight_model1 = 0.8
weight_model2 = 0.2

In [34]:
# Perform  weight ensembling
ensemble_predictions_weight = weighted_ensemble(distilbert_predictions_np, xlnet_predictions_np,weight_model1,weight_model2)

In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_val, ensemble_predictions_weight)
precision = precision_score(y_val, ensemble_predictions_weight)
recall = recall_score(y_val, ensemble_predictions_weight)
f1 = f1_score(y_val, ensemble_predictions_weight)

In [36]:
print(f"Accuracy: {accuracy*100:.4f}")
print(f"Precision: {precision*100:.4f}")
print(f"Recall: {recall*100:.4f}")
print(f"F1 Score: {f1*100:.4f}")

Accuracy: 83.9657
Precision: 90.3622
Recall: 88.1437
F1 Score: 89.2392


# Stacking

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m174.1/258.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from transformers import DistilBertForSequenceClassification, XLNetForSequenceClassification, DistilBertTokenizer, XLNetTokenizer
from sklearn.linear_model import LogisticRegression
import torch
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


class TransformersWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model_name_or_path):
        self.model_name_or_path = model_name_or_path

    def fit(self, X, y):
        # Tokenize and preprocess the input data
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = DistilBertForSequenceClassification.from_pretrained(self.model_name_or_path)

        X_encoded = tokenizer(X, padding=True, truncation=True, return_tensors='pt')
        input_ids = X_encoded['input_ids']

        # Convert labels to tensor
        y_tensor = torch.tensor(y)

        # Train the model (this may vary depending on your specific model and training process)
        outputs = model(input_ids, labels=y_tensor)
        loss, logits = outputs.loss, outputs.logits

        self.model = model

        return self

    def predict(self, X):
        # Check if the model is fitted
        check_is_fitted(self, 'model')

        # Tokenize and preprocess the input data
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        X_encoded = tokenizer(X, padding=True, truncation=True, return_tensors='pt')
        input_ids = X_encoded['input_ids']

        # Make predictions
        with torch.no_grad():
            logits = self.model(input_ids).logits

        # Convert logits to class labels (assuming binary classification)
        predicted_labels = torch.argmax(logits, dim=1)

        return predicted_labels


In [None]:
distilbert_model = TransformersWrapper("/content/gdrive/MyDrive/fake_news/Models/DistilBERT/ds_35_h3")

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


class TransformersWrapperXL(BaseEstimator, ClassifierMixin):
    def __init__(self, model_name_or_path):
        self.model_name_or_path = model_name_or_path

    def fit(self, X, y):
        # Tokenize and preprocess the input data
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetForSequenceClassification.from_pretrained(self.model_name_or_path)

        X_encoded = tokenizer(X, padding=True, truncation=True, return_tensors='pt')
        input_ids = X_encoded['input_ids']

        # Convert labels to tensor
        y_tensor = torch.tensor(y)

        # Train the model (this may vary depending on your specific model and training process)
        outputs = model(input_ids, labels=y_tensor)
        loss, logits = outputs.loss, outputs.logits

        self.model = model

        return self

    def predict(self, X):
        # Check if the model is fitted
        check_is_fitted(self, 'model')

        # Tokenize and preprocess the input data
        tokenizer = XLNetTokenizer.from_pretrained('distilbert-base-cased')
        X_encoded = tokenizer(X, padding=True, truncation=True, return_tensors='pt')
        input_ids = X_encoded['input_ids']

        # Make predictions
        with torch.no_grad():
            logits = self.model(input_ids).logits

        # Convert logits to class labels (assuming binary classification)
        predicted_labels = torch.argmax(logits, dim=1)

        return predicted_labels


In [None]:
xlnet_model = TransformersWrapperXL("/content/gdrive/MyDrive/fake_news/Models/XLNet/ds_35")

In [None]:
df = pd.read_csv("/content/gdrive/MyDrive/fake_news/dfcleaned.csv")
df['SECTION_CLEANED'] = df['SECTION_CLEANED'].astype(str)

# Randomly select  % of the data
df_data = df.sample(frac=0.35, random_state=42)

In [None]:
labels = list(df_data['label'])
len(labels)

13526

In [None]:
labels = [0 if label == "FAKE" else 1 for label in labels]
len(labels)

13526

In [None]:
# Assuming you have labeled data for training and testing
X_train, X_test, y_train, y_test = train_test_split(list(df_data['SECTION_CLEANED']), labels, test_size=0.2,random_state=42)

In [None]:
base_models = [
    ('well_trained_model', distilbert_model),
    ('overfitting_model', xlnet_model)
]

In [None]:
meta_learner = LogisticRegression()

In [None]:
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_learner)


In [None]:
stacking_model.fit(X_train, y_train)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
y_pred = stacking_model.predict(X_test)

In [None]:
# Calculate accuracy

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)



In [None]:
# Generate a classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)