In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from huggingface_hub import login
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:

# Mount Google Drive at /content/drive; the CSV splits and the model output
# directory used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the preprocessed CSV splits on the mounted Drive
train_file = '/content/drive/MyDrive/dataScienceLab/train_df_preprocessed.csv'
test_file = '/content/drive/MyDrive/dataScienceLab/test_df_preprocessed.csv'
valid_file = '/content/drive/MyDrive/dataScienceLab/valid_df_preprocessed.csv'

# Read each split into its own DataFrame (train, then test, then valid)
train_df, test_df, valid_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, valid_file)
)

# Quick look at the first training rows
print(train_df.head())

         Label                                          Statement  \
0        FALSE  say annies list political group support third ...   
1    half-true  decline coal start started natural gas took st...   
2  mostly-true  hillary clinton agrees john mccain voting give...   
3        FALSE  health care reform legislation likely mandate ...   
4    half-true               economic turnaround started end term   

                              Subject         Speaker             Job Title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting               Unknown   
4                        economy,jobs   charlie-crist               Unknown   

      State       Party  Barely True Count  False Count  Half True Count  \
0     Texas  republican           

In [None]:
# Encode the string class labels as integer ids.
# Ids follow the order in which each label first appears in the training split,
# so label_mapping is {label_string: class_id}.
label_mapping = {label: idx for idx, label in enumerate(train_df["Label"].unique())}

train_df["Label"] = train_df["Label"].map(label_mapping)
valid_df["Label"] = valid_df["Label"].map(label_mapping)
test_df["Label"] = test_df["Label"].map(label_mapping)

# A label that is absent from label_mapping is mapped to NaN by Series.map;
# fail here with a clear message instead of letting NaN labels reach training.
for split_name, split_df in (("train_df", train_df), ("valid_df", valid_df), ("test_df", test_df)):
    if split_df["Label"].isna().any():
        raise ValueError(f"{split_name} contains labels that are missing from label_mapping")

print(f'train_df {train_df.head()}')
print(f'valid_df {valid_df.head()}')
# Fixed: this line previously printed train_df a second time instead of test_df.
print(f'test_df {test_df.head()}')

train_df    Label                                          Statement  \
0      0  say annies list political group support third ...   
1      1  decline coal start started natural gas took st...   
2      2  hillary clinton agrees john mccain voting give...   
3      0  health care reform legislation likely mandate ...   
4      1               economic turnaround started end term   

                              Subject         Speaker             Job Title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting               Unknown   
4                        economy,jobs   charlie-crist               Unknown   

      State       Party  Barely True Count  False Count  Half True Count  \
0     Texas  republican                0.0          1.0      

In [None]:

# Convert the pandas DataFrames to Hugging Face Datasets (one per split)
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# Show the schema and row count of every split.
print(f'train_dataset {train_dataset}')
print(f'valid_dataset {valid_dataset}')
# Fixed label: the object printed here is test_dataset, not the test_df DataFrame.
print(f'test_dataset {test_dataset}')



train_dataset Dataset({
    features: ['Label', 'Statement', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment'],
    num_rows: 10263
})
valid_dataset Dataset({
    features: ['Label', 'Statement', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment'],
    num_rows: 1284
})
test_df Dataset({
    features: ['Label', 'Statement', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment'],
    num

In [None]:

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Log in to the Hugging Face Hub so gated pre-trained models such as LLaMA can be downloaded.
# SECURITY: an access token must never be written into the notebook. The token that
# was previously hard-coded in this cell has been exposed and should be revoked at
# https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable; when it is not set,
# login() receives None and asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Checkpoint to fine-tune; change this to any sequence classification model.
model_name = "meta-llama/Llama-3.2-1B"

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification model with one output per entry in label_mapping,
# then moved onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))
model = model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Make sure the tokenizer has a padding token (needed for padded batches).
added_pad_token = False
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        # Reuse the end-of-sequence token; it is already part of the vocabulary.
        tokenizer.pad_token = tokenizer.eos_token
    else:
        # No eos_token either: register a new '[PAD]' token. add_special_tokens
        # also sets tokenizer.pad_token and returns how many tokens were added.
        added_pad_token = tokenizer.add_special_tokens({'pad_token': '[PAD]'}) > 0

# Resize the embedding matrix only when the vocabulary actually gained a token;
# previously the resize ran unconditionally despite the comment saying otherwise.
if added_pad_token:
    model.resize_token_embeddings(len(tokenizer))

# Tell the model which token id is padding.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Tokenize every split
def tokenize_function(examples):
    """Tokenize the "Statement" field of a batch.

    Every sequence is truncated and padded to exactly 256 tokens.
    Returns whatever the tokenizer call produces for the batch.
    """
    statements = examples["Statement"]
    return tokenizer(statements, max_length=256, padding="max_length", truncation=True)

# Apply the tokenizer batch-wise to train, validation and test (in that order).
train_dataset, valid_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)



Map:   0%|          | 0/10263 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

In [None]:

# Show the schema and row count of each split after tokenization.
for split_name, split in (
    ("train_dataset", train_dataset),
    ("valid_dataset", valid_dataset),
    ("test_dataset", test_dataset),
):
    print(f'{split_name} {split}')

train_dataset Dataset({
    features: ['Label', 'Statement', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment', 'input_ids', 'attention_mask'],
    num_rows: 10263
})
valid_dataset Dataset({
    features: ['Label', 'Statement', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment', 'input_ids', 'attention_mask'],
    num_rows: 1284
})
test_dataset Dataset({
    features: ['Label', 'Statement', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Cre

In [None]:
# Drop the raw "Statement" text column from every split.
train_dataset, valid_dataset, test_dataset = (
    split.remove_columns(["Statement"])
    for split in (train_dataset, valid_dataset, test_dataset)
)

# Show the remaining columns of each split.
for split_name, split in (
    ("train_dataset", train_dataset),
    ("valid_dataset", valid_dataset),
    ("test_dataset", test_dataset),
):
    print(f'{split_name} {split}')


train_dataset Dataset({
    features: ['Label', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment', 'input_ids', 'attention_mask'],
    num_rows: 10263
})
valid_dataset Dataset({
    features: ['Label', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment', 'input_ids', 'attention_mask'],
    num_rows: 1284
})
test_dataset Dataset({
    features: ['Label', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False R

In [None]:
# Rename the target column from "Label" to "labels" in every split.
train_dataset, valid_dataset, test_dataset = (
    split.rename_column("Label", "labels")
    for split in (train_dataset, valid_dataset, test_dataset)
)

# Show the columns of each split after the rename.
for split_name, split in (
    ("train_dataset", train_dataset),
    ("valid_dataset", valid_dataset),
    ("test_dataset", test_dataset),
):
    print(f'{split_name} {split}')


train_dataset Dataset({
    features: ['labels', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment', 'input_ids', 'attention_mask'],
    num_rows: 10263
})
valid_dataset Dataset({
    features: ['labels', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'False Ratio', 'Sentiment', 'input_ids', 'attention_mask'],
    num_rows: 1284
})
test_dataset Dataset({
    features: ['labels', 'Subject', 'Speaker', 'Job Title', 'State', 'Party', 'Barely True Count', 'False Count', 'Half True Count', 'Mostly True Count', 'Pants on Fire Count', 'Context', 'Party_Categorized', 'Adjusted Credit History', 'Label_encoded', 'Fals

In [None]:
# Rewrite the "labels" column of every split by passing each value through a
# torch.long tensor, so the stored labels are integers.
def _labels_as_long(example):
    """Return the example's label as a torch.long tensor under the 'labels' key."""
    return {'labels': torch.tensor(example['labels'], dtype=torch.long)}

train_dataset = train_dataset.map(_labels_as_long)
valid_dataset = valid_dataset.map(_labels_as_long)
test_dataset = test_dataset.map(_labels_as_long)

Map:   0%|          | 0/10263 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

In [None]:
# Class names ordered by their integer id, derived from label_mapping so the names
# shown in the classification report can never drift from the ids used for training
# (the previous hand-typed list had to match the data-dependent mapping order).
valid_classes = [str(label) for label in sorted(label_mapping, key=label_mapping.get)]

In [None]:
# Define a function for computing the metrics
def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: A (predictions, labels) pair. `predictions` holds the class scores
            with one row per example (or a tuple whose first element does);
            `labels` holds the integer class ids.

    Returns:
        dict with numeric "accuracy", "macro_precision", "macro_recall" and
        "macro_f1" values plus the text "classification_report" (that key is
        unchanged for existing callers).
    """
    predictions, labels = p
    # If the predictions arrive as a tuple, the class scores are assumed to be
    # its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=1)

    # Pass the full list of class ids explicitly: without it the report raises
    # when a split happens to lack one of the classes named in valid_classes.
    class_ids = list(range(len(valid_classes)))
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, labels=class_ids, average="macro", zero_division=0
    )
    report = classification_report(
        labels, preds, labels=class_ids, target_names=valid_classes, zero_division=0
    )
    return {
        "accuracy": float(accuracy_score(labels, preds)),
        "macro_precision": float(precision),
        "macro_recall": float(recall),
        "macro_f1": float(f1),
        "classification_report": report,
    }

In [None]:
from transformers import EarlyStoppingCallback


In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # --- where checkpoints and logs are written ---
    output_dir="/content/drive/MyDrive/dataScienceLab/fine_tuned_model_LLAMA321_class",
    logging_dir="/content/drive/MyDrive/dataScienceLab/fine_tuned_model_LLAMA321_class/logs",
    # --- optimisation schedule ---
    num_train_epochs=5,                    # upper bound; early stopping may end the run sooner
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.3,                      # weight decay for the optimizer
    # --- evaluation / checkpoint / logging cadence (in optimizer steps) ---
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,                        # kept equal to eval_steps
    save_total_limit=1,                    # cap on the number of checkpoints kept
    logging_steps=100,
    # --- best-model selection: lowest validation loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)




In [None]:
# Stop training if there is no improvement for 3 evaluations.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)


In [None]:
# Bundle the model, training configuration, data splits, metrics and the
# early-stopping callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:

# Run the fine-tuning loop configured on `trainer`; the final model is saved
# explicitly in a later cell.
# NOTE(review): the recorded output of this cell contains a "Could not locate the
# best model" warning, which suggests the best checkpoint was not reloaded at the
# end of training. Confirm that checkpoints are actually written to output_dir
# before relying on load_best_model_at_end.
trainer.train()

Step,Training Loss,Validation Loss,Classification Report
500,1.6738,1.797181,precision recall f1-score support  FALSE 0.23 0.31 0.26 263  half-true 0.25 0.04 0.06 248  mostly-true 0.23 0.61 0.34 251  TRUE 0.00 0.00 0.00 169  barely-true 0.22 0.21 0.21 237  pants-fire 0.00 0.00 0.00 116  accuracy 0.23 1284  macro avg 0.15 0.19 0.15 1284 weighted avg 0.18 0.23 0.17 1284
1000,1.4381,2.005146,precision recall f1-score support  FALSE 0.26 0.44 0.32 263  half-true 0.22 0.36 0.28 248  mostly-true 0.26 0.04 0.06 251  TRUE 0.18 0.36 0.24 169  barely-true 0.20 0.02 0.04 237  pants-fire 0.37 0.09 0.15 116  accuracy 0.23 1284  macro avg 0.25 0.22 0.18 1284 weighted avg 0.24 0.23 0.18 1284
1500,1.1707,2.115619,precision recall f1-score support  FALSE 0.26 0.29 0.28 263  half-true 0.21 0.31 0.25 248  mostly-true 0.27 0.19 0.22 251  TRUE 0.17 0.27 0.21 169  barely-true 0.24 0.16 0.19 237  pants-fire 0.29 0.05 0.09 116  accuracy 0.23 1284  macro avg 0.24 0.21 0.21 1284 weighted avg 0.24 0.23 0.22 1284
2000,0.7154,3.321758,precision recall f1-score support  FALSE 0.27 0.27 0.27 263  half-true 0.21 0.23 0.22 248  mostly-true 0.27 0.21 0.23 251  TRUE 0.20 0.27 0.23 169  barely-true 0.23 0.26 0.25 237  pants-fire 0.23 0.12 0.16 116  accuracy 0.24 1284  macro avg 0.24 0.23 0.23 1284 weighted avg 0.24 0.24 0.23 1284


Could not locate the best model at /content/drive/MyDrive/dataScienceLab/fine_tuned_model_LLAMA321_class/checkpoint-500/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=2000, training_loss=1.4046392059326172, metrics={'train_runtime': 3368.2288, 'train_samples_per_second': 15.235, 'train_steps_per_second': 0.953, 'total_flos': 4.77923228516352e+16, 'train_loss': 1.4046392059326172, 'epoch': 3.115264797507788})

In [None]:
# Persist the model held by the trainer together with its tokenizer in one folder.
final_model_dir = "/content/drive/MyDrive/dataScienceLab/fine_tuned_model_LLAMA321_class/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)



('/content/drive/MyDrive/dataScienceLab/fine_tuned_model_LLAMA321_class/final_model/tokenizer_config.json',
 '/content/drive/MyDrive/dataScienceLab/fine_tuned_model_LLAMA321_class/final_model/special_tokens_map.json',
 '/content/drive/MyDrive/dataScienceLab/fine_tuned_model_LLAMA321_class/final_model/tokenizer.json')

In [None]:
# Evaluate the trainer's current model on the test split and keep the returned
# metrics in `results`.
results = trainer.evaluate(test_dataset)



In [None]:
# Dump the raw metrics dictionary returned by the evaluation.
print(results)

# The text report is stored under the "eval_classification_report" key;
# print it on its own so the table layout is readable.
report_text = results["eval_classification_report"]
print("Full Classification Report:\n", report_text)

{'eval_loss': 3.3888893127441406, 'eval_classification_report': '              precision    recall  f1-score   support\n\n       FALSE       0.23      0.24      0.24       249\n   half-true       0.26      0.26      0.26       265\n mostly-true       0.24      0.23      0.23       241\n        TRUE       0.23      0.25      0.24       208\n barely-true       0.22      0.24      0.23       212\n  pants-fire       0.25      0.15      0.19        92\n\n    accuracy                           0.24      1267\n   macro avg       0.24      0.23      0.23      1267\nweighted avg       0.24      0.24      0.24      1267\n', 'eval_runtime': 38.0402, 'eval_samples_per_second': 33.307, 'eval_steps_per_second': 2.103, 'epoch': 3.115264797507788}
Full Classification Report:
               precision    recall  f1-score   support

       FALSE       0.23      0.24      0.24       249
   half-true       0.26      0.26      0.26       265
 mostly-true       0.24      0.23      0.23       241
        TRUE