In [8]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Location of the LIAR training split (tab-separated, no header row).
# NOTE(review): this is a machine-specific absolute path; point it at your
# own copy of the dataset before running on another machine.
file_path = r"C:\projects\lauzhack-2024\liar_dataset-master\train.tsv"

# How many leading rows of the training split are scored below.
N_SAMPLES = 333

# Load the pretrained tokenizer and sequence-classification model by name.
model_name = "jy46604790/Fake-News-Bert-Detect"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # this notebook only runs inference: make sure dropout is disabled

# The TSV has no header row, so the column names are supplied explicitly.
columns = ['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job_title',
           'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
           'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']
# NOTE(review): default CSV quoting is used here; if statements in the file can
# contain unbalanced double quotes, rows may be merged -- confirm the row count
# against the dataset's documentation.
liar_data = pd.read_csv(file_path, sep='\t', header=None, names=columns)

# Keep only the first N_SAMPLES rows for a quick sanity check of the model.
sample_data = liar_data.head(N_SAMPLES)

# Preprocess the statements
def preprocess_statements(statements, tokenizer, max_length=128):
    """Tokenize ``statements`` with ``tokenizer`` and return its output.

    Args:
        statements: The texts to encode; passed to ``tokenizer`` unchanged.
        tokenizer: A callable tokenizer (here, a Hugging Face tokenizer).
        max_length: Value forwarded as the tokenizer's ``max_length`` option.

    Returns:
        Whatever ``tokenizer`` returns when called with truncation and padding
        enabled and ``return_tensors="pt"`` (PyTorch tensors).
    """
    tokenizer_options = {
        "truncation": True,
        "padding": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(statements, **tokenizer_options)

# Build the model inputs: pull the raw statement strings out of the sample
# and tokenize them together as one batch.
statements = sample_data["statement"].to_list()
encodings = preprocess_statements(statements=statements, tokenizer=tokenizer)

# Perform inference with the model
def get_predictions(model, encodings, batch_size=None):
    """Run ``model`` on ``encodings`` without gradients and return its predictions.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose a
            ``logits`` attribute with one row per input and one column per class.
        encodings: Mapping of keyword name -> tensor, as produced by the tokenizer.
        batch_size: Optional positive int. When given, the inputs are fed to the
            model in chunks of at most ``batch_size`` rows, which bounds peak
            memory. This assumes every value in ``encodings`` is a tensor whose
            first dimension indexes the inputs. ``None`` (the default) sends
            everything in a single forward pass.

    Returns:
        ``(predictions, probabilities)``: the argmax class index per row, and the
        softmax of the logits over the class dimension.

    Raises:
        ValueError: If ``batch_size`` is given but is smaller than 1.

    Note:
        The model's train/eval mode is not changed here; the caller is
        responsible for putting the model in eval mode before inference.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    with torch.no_grad():
        num_rows = None
        if batch_size is not None:
            num_rows = len(next(iter(encodings.values())))

        if num_rows is None or num_rows <= batch_size:
            # Everything fits in one forward pass (also covers the default path).
            logits = model(**encodings).logits
        else:
            # Slice every input tensor along the batch dimension and stitch the
            # per-chunk logits back together in the original row order.
            chunk_logits = []
            for start in range(0, num_rows, batch_size):
                stop = start + batch_size
                chunk = {name: tensor[start:stop] for name, tensor in encodings.items()}
                chunk_logits.append(model(**chunk).logits)
            logits = torch.cat(chunk_logits, dim=0)

        probabilities = torch.softmax(logits, dim=1)  # Convert logits to probabilities
        predictions = torch.argmax(probabilities, dim=1)  # Most probable class per row
    return predictions, probabilities

predictions, probabilities = get_predictions(model, encodings)

# Tabulate the results using plain Python values (no NumPy):
# Tensor.tolist() turns the class-index tensor into ints and the probability
# tensor into one list of floats per statement.
predicted_label_column = predictions.tolist()
probability_column = probabilities.tolist()
results = pd.DataFrame({
    'Statement': statements,
    'Predicted_Label': predicted_label_column,
    'Probabilities': probability_column,
})
print(results)



tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

                                             Statement  Predicted_Label  \
0    Says the Annies List political group supports ...                0   
1    When did the decline of coal start? It started...                0   
2    Hillary Clinton agrees with John McCain "by vo...                0   
3    Health care reform legislation is likely to ma...                0   
4    The economic turnaround started at the end of ...                0   
..                                                 ...              ...   
328  Over 40 percent of small and mid-size banks th...                0   
329  You cant check out a library book without a ph...                0   
330  Says that Starbucks took Christmas off of thei...                0   
331  Atmospheric conditions could push a footballs ...                0   
332  Hillary Clinton supports unlimited abortion on...                0   

                                   Probabilities  
0     [0.9976226687431335, 0.002377317054197192]

In [12]:
# Ground-truth LIAR labels for the sampled statements, in the same row order
# as `statements` / `predictions`.
true_labels = sample_data['label'].tolist()

# The model emits two class indices (0 and 1), while LIAR has six labels,
# so the six labels are collapsed to binary targets for comparison.
# NOTE(review): assumes class index 1 means "real" and 0 means "fake" --
# confirm against the model card.
label_mapping = {
    'true': 1,
    'mostly-true': 1,
    'half-true': 0,
    'barely-true': 0,
    'false': 0,
    'pants-fire': 0
}

# Map true labels to numerical format (for comparison with model's predictions).
# A label missing from `label_mapping` raises KeyError rather than being skipped.
true_labels_numeric = [label_mapping[label] for label in true_labels]

# Convert the prediction tensor to plain Python ints first. Comparing 0-dim
# tensors with ints yields tensors, which made `accuracy` a tensor
# (printed as "tensor(...)") instead of a float.
predicted_labels = predictions.tolist()

# Calculate accuracy as a plain float; NaN when there is nothing to score.
correct_predictions = sum(
    pred == true for pred, true in zip(predicted_labels, true_labels_numeric)
)
accuracy = (
    correct_predictions / len(true_labels_numeric)
    if true_labels_numeric
    else float("nan")
)

# Display the results
results = pd.DataFrame({
    'Statement': statements,
    'True_Label': true_labels,
    'Predicted_Label': predicted_labels,  # plain Python ints
    'Probabilities': probabilities.tolist()  # one [p_class0, p_class1] list per statement
})

print("Accuracy:", accuracy)
print(results)


Accuracy: tensor(0.6186)
                                             Statement   True_Label  \
0    Says the Annies List political group supports ...        false   
1    When did the decline of coal start? It started...    half-true   
2    Hillary Clinton agrees with John McCain "by vo...  mostly-true   
3    Health care reform legislation is likely to ma...        false   
4    The economic turnaround started at the end of ...    half-true   
..                                                 ...          ...   
328  Over 40 percent of small and mid-size banks th...  barely-true   
329  You cant check out a library book without a ph...  barely-true   
330  Says that Starbucks took Christmas off of thei...   pants-fire   
331  Atmospheric conditions could push a footballs ...  mostly-true   
332  Hillary Clinton supports unlimited abortion on...        false   

     Predicted_Label                                Probabilities  
0                  0   [0.9976226687431335, 0.00237731

In [None]:
# TODO: fine-tune this model (original note: "FINE TUNE ESSA")
