In [2]:
pip install transformers



In [3]:
pip install sentencepiece



In [4]:
pip install accelerate -U



In [5]:
import torch
from transformers import XLNetForSequenceClassification, XLNetTokenizer
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
# Define your model and tokenizer
model_name = "xlnet-base-cased"
model = XLNetForSequenceClassification.from_pretrained(model_name)
tokenizer = XLNetTokenizer.from_pretrained(model_name)


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load dataset
df = pd.read_csv("/content/gdrive/MyDrive/fake_news/FakeNewsNet_DS/gossipcop_cleaned.csv")
df['SECTION_CLEANED'] = df['SECTION_CLEANED'].astype(str)

# Randomly select  % of the data
df_data = df.sample(frac=1.0, random_state=42)


# Split the dataset into training, validation, and test sets
# First, split into training and temp sets (80% training, 20% temp)
train_df, temp_df = train_test_split(df_data, test_size=0.20, random_state=42)

# Then, split the temp set into validation and test sets (50% validation, 50% test)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Optionally,reset the index of the DataFrames
train_df = train_df.reset_index(drop=True)
#validation_df = validation_df1.sample(frac=0.50, random_state=42)

validation_df = validation_df.reset_index(drop=True)

#test_df = test_df1.sample(frac=0.50, random_state=42)
test_df = test_df.reset_index(drop=True)

In [8]:
df.groupby("label").describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,word count,word count,word count,word count,word count,word count,word count,word count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,5323.0,19478.0,1536.762072,16817.0,18147.5,19478.0,20808.5,22139.0,5323.0,11.057862,3.917703,1.0,9.0,11.0,14.0,31.0
1,16817.0,8408.0,4854.794074,0.0,4204.0,8408.0,12612.0,16816.0,16817.0,11.299875,3.878651,1.0,9.0,11.0,14.0,39.0


In [9]:
train_df.groupby("label").describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,word count,word count,word count,word count,word count,word count,word count,word count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,4238.0,19482.26168,1539.210334,16817.0,18143.25,19498.5,20801.75,22139.0,4238.0,11.027843,3.943509,1.0,9.0,11.0,14.0,31.0
1,13474.0,8420.56212,4847.687893,0.0,4248.25,8413.5,12630.75,16816.0,13474.0,11.290931,3.869135,1.0,9.0,11.0,14.0,39.0


In [10]:
validation_df.groupby("label").describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,word count,word count,word count,word count,word count,word count,word count,word count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,544.0,19460.169118,1531.983748,16828.0,18178.0,19395.5,20911.25,22138.0,544.0,11.108456,3.826967,1.0,9.0,11.0,13.0,26.0
1,1670.0,8535.662275,4873.111339,1.0,4245.5,8712.0,12736.5,16813.0,1670.0,11.241916,3.917228,1.0,9.0,11.0,14.0,30.0


In [9]:
test_df.groupby("label").describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,word count,word count,word count,word count,word count,word count,word count,word count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,541.0,19462.545287,1524.906059,16832.0,18149.0,19379.0,20792.0,22131.0,541.0,11.242144,3.804792,2.0,9.0,11.0,14.0,28.0
1,1673.0,8179.393903,4889.282976,2.0,3915.0,8019.0,12395.0,16814.0,1673.0,11.429767,3.915984,1.0,9.0,11.0,14.0,38.0


In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,label,word count,SECTION_CLEANED
0,0,gossipcop-882573,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,1,12,Teen Mom Star Jenelle Evans Wedding Dress Is A...
1,1,gossipcop-875924,Kylie Jenner refusing to discuss Tyga on Life ...,1,10,Kylie Jenner refuse discuss Tyga Life Kylie
2,2,gossipcop-894416,Quinn Perkins,1,2,Quinn Perkins
3,3,gossipcop-857248,I Tried Kim Kardashian's Butt Workout & Am For...,1,10,I Tried Kim Kardashians Butt Workout Am Foreve...
4,4,gossipcop-884684,Celine Dion donates concert proceeds to Vegas ...,1,9,Celine Dion donate concert proceed Vegas shoot...


In [13]:
train_texts = list(train_df['SECTION_CLEANED'])
len(train_texts)

17712

In [14]:
train_labels = list(train_df['label'])
len(train_labels)

17712

In [15]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, tokenized_data, labels):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

In [16]:
# Tokenize the dataset
train_encodings = tokenizer(train_texts, return_tensors="pt", padding=True, truncation=True,max_length=128)

In [17]:
train_labels = torch.tensor(train_labels)

In [18]:
train_dataset = CustomDataset(train_encodings, train_labels)

  self.labels = torch.tensor(labels)


In [19]:
val_texts = list(validation_df['SECTION_CLEANED'])
len(val_texts)

2214

In [20]:
val_labels = list(validation_df['label'])
len(val_labels)

2214

In [21]:
# Tokenize the dataset
val_encodings = tokenizer(val_texts, return_tensors="pt", padding=True, truncation=True,max_length=128)

In [22]:
val_labels = torch.tensor(val_labels)

In [23]:
# Define a custom dataset for validation
val_dataset = CustomDataset(val_encodings, val_labels)

  self.labels = torch.tensor(labels)


In [27]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./xlnet_fake_news_classification_model",
    evaluation_strategy="steps",
    num_train_epochs=7,
    save_steps=500,
    eval_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    warmup_steps=992,
    weight_decay=0.001047076633830012,
    logging_dir="./logs",
    logging_steps=500,
)

In [28]:
# Fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [29]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.511,0.478753
1000,0.4598,0.403583
1500,0.4062,0.422167
2000,0.3801,0.395054
2500,0.4077,0.390713
3000,0.3529,0.486396
3500,0.3564,0.434904
4000,0.352,0.429717
4500,0.3344,0.435715
5000,0.3008,0.422435


TrainOutput(global_step=7749, training_loss=0.34039742556798536, metrics={'train_runtime': 1961.5359, 'train_samples_per_second': 63.208, 'train_steps_per_second': 3.95, 'total_flos': 4277103492106368.0, 'train_loss': 0.34039742556798536, 'epoch': 7.0})

In [30]:
# Save the model if needed
trainer.save_model("/content/gdrive/MyDrive/fake_news/FakeNewsNet_DS/Models/XLNet/gossicop_nlr_7a")

In [31]:
results = trainer.evaluate(val_dataset)

In [32]:
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.4890677034854889, 'eval_runtime': 7.1711, 'eval_samples_per_second': 308.739, 'eval_steps_per_second': 38.627, 'epoch': 7.0}


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [11]:
# Load the model for inference
loaded_model = XLNetForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/fake_news/FakeNewsNet_DS/Models/XLNet/gossicop_nlr_7a")


In [35]:
# Make predictions on the validation set
with torch.no_grad():
    validation_outputs = loaded_model(**val_encodings)
    val_logits = validation_outputs.logits
    val_predicted_labels = torch.argmax(val_logits, dim=1)

In [36]:
# Calculate evaluation metrics for the validation set
accuracy_val = accuracy_score(val_labels, val_predicted_labels)
precision_val = precision_score(val_labels, val_predicted_labels)
recall_val = recall_score(val_labels, val_predicted_labels)
f1_val = f1_score(val_labels, val_predicted_labels)

In [37]:
print(f"Validation Accuracy: {accuracy_val*100:.4f}")
print(f"Validation Precision: {precision_val*100:.4f}")
print(f"Validation Recall: {recall_val*100:.4f}")
print(f"Validation F1 Score: {f1_val*100:.4f}")

Validation Accuracy: 84.5077
Validation Precision: 88.8239
Validation Recall: 90.8982
Validation F1 Score: 89.8491


In [12]:
test_texts = list(test_df['SECTION_CLEANED'])
len(test_texts)

2214

In [13]:
test_labels = list(test_df['label'])
len(test_labels)

2214

In [14]:
test_encodings = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True,max_length=128)

In [15]:
test_labels = torch.tensor(test_labels)

In [16]:
# Make predictions on the test set
with torch.no_grad():
    test_outputs = loaded_model(**test_encodings)
    test_logits = test_outputs.logits
    test_predicted_labels = torch.argmax(test_logits, dim=1)

In [17]:
# Calculate evaluation metrics for the test set
test_accuracy = accuracy_score(test_labels, test_predicted_labels)
test_precision = precision_score(test_labels, test_predicted_labels)
test_recall = recall_score(test_labels, test_predicted_labels)
test_f1 = f1_score(test_labels, test_predicted_labels)

In [18]:
print("\nTest Set Metrics:")
print(f"Test Accuracy: {test_accuracy*100:.4f}")
print(f"Test Precision: {test_precision*100:.4f}")
print(f"Test Recall: {test_recall*100:.4f}")
print(f"Test F1 Score: {test_f1*100:.4f}")


Test Set Metrics:
Test Accuracy: 85.5014
Test Precision: 89.6714
Test Recall: 91.3329
Test F1 Score: 90.4945
