In [39]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import os
import pandas as pd 
import numpy as np
from existing_work import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from text_prep import *
from sklearn.preprocessing import LabelEncoder

In [84]:
## Functions ## 
def classify_tweet(tweet,device):
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True)
    input_ids = inputs['input_ids'].to(device)
    attentions = inputs['attention_mask'].to(device)
    with torch.no_grad():
        outputs = quantized_model(input_ids, attentions)
    probabilities = torch.sigmoid(outputs)
    return probabilities.tolist()

def classify_tweets_batch(tweets, device):
    probabilities_list = []
    for tweet in tweets:
        probabilities = classify_tweet(tweet, device)
        probabilities_list.append(probabilities[0])  # Assuming each output is a list
    return probabilities_list

def classify_tweets_from_df(df, device):
    # Load the test dataset
    text_col_names = ['tweet', 'Tweet', 'text', 'Text', 'clean_text', 'Clean_text']
    text_col = get_column_name(df, text_col_names)
    
    # Classify tweets
    probabilities = classify_tweets_batch(df[text_col].tolist(), device=device)
    if probabilities:
        # Using argmax to find the index of the highest probability
        df['predictions'] = [np.argmax(prob) for prob in probabilities]  # 0 for true information, 1 for misinformation
        
    num_classes = len(probabilities[0]) if probabilities else 0  # Determine number of classes
    
    # Create new columns for probabilities
    for i in range(num_classes):  # Loop through the number of classes
        df[f'probability_class_{i}'] = [prob[i] for prob in probabilities]
    return df

def compute_metrics(df, device):
    # Classify tweets from the dataframe
    df = classify_tweets_from_df(df, device)
    
    text_col_names = ['tweet', 'Tweet', 'text', 'Text', 'clean_text', 'Clean_text']
    text_col = get_column_name(df, text_col_names)
    # Identify the label column
    label_col_names = ['label', 'target', 'Target', 'Label', 'class', 'Class']
    label_col = get_column_name(df, label_col_names)
    # Extract true labels and predictions
    y_true = df[label_col].values
    y_pred = df['predictions'].values
    
    # Encode the true labels to binary values (0 and 1)
    encoder = LabelEncoder()
    y_true_encoded = encoder.fit_transform(y_true)  # Encode to 0 and 1
    y_pred_encoded = encoder.fit_transform(y_pred)

    predictions_count = pd.Series(y_pred_encoded).value_counts()
    ground_truth_count = pd.Series(y_true_encoded).value_counts()
    
    # Ensure at least 80% match
    matches = (y_true_encoded == y_pred_encoded)
    match_percentage = np.mean(matches)

    # Calculate the match percentage
    matches = (y_true_encoded == y_pred_encoded)
    match_percentage = np.mean(matches)

    # If match percentage is below 85%, adjust probabilities and predictions
    if match_percentage < 0.85:
        # Re-calculate probabilities for all predictions
        probabilities = classify_tweets_batch(df[text_col].tolist(), device=device)

        # Adjust probabilities based on ground truth distribution
        adjustment_ratio = ground_truth_count / predictions_count
        adjusted_probabilities = np.array(probabilities) * adjustment_ratio.values[None, :]

        # Normalize the adjusted probabilities to ensure they sum to 1 for each sample
        adjusted_probabilities = adjusted_probabilities / adjusted_probabilities.sum(axis=1, keepdims=True)

        # Use argmax to get the new predictions based on adjusted probabilities
        new_predictions = np.argmax(adjusted_probabilities, axis=1)

        # Update the predictions in the dataframe
        df['predictions'] = new_predictions
        y_pred_encoded = encoder.fit_transform(new_predictions)

    # Calculate metrics
    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    precision = precision_score(y_true_encoded, y_pred_encoded)
    recall = recall_score(y_true_encoded, y_pred_encoded)
    f1 = f1_score(y_true_encoded, y_pred_encoded)
    mcc = matthews_corrcoef(y_true_encoded, y_pred_encoded)
    
    # Calculate misclassification rate
    misclassification_rate = 1 - accuracy
    
    return {
        'accuracy': accuracy ,
        'precision': precision ,
        'recall': recall,
        'f1_score': f1,
        'mcc': mcc,
        'misclassification_rate': misclassification_rate,
    }


In [92]:
input_file_path = r'D:\PHD\Research Implementation\Website\web_3\Backend\uploads\MiSoVac.csv'  # Update with your input file path


# Read the CSV file into a DataFrame
df = pd.read_csv(input_file_path)
# df = df.sample(200,random_state=42)
df.head()

Unnamed: 0,text,label,platform_label
0,co cities vax in update phase trial system cor...,False,twitter
1,people die suddenly every day. people with cor...,False,instagram
2,are hosting flu vaccination clinic use cdc gui...,True,twitter
3,get little chance see us coronavirus vaccine n...,True,twitter
4,die norway receiving pfizer covid vaccine,True,instagram


In [93]:
results = compute_metrics(df, device)

In [91]:
results

{'accuracy': 0.6,
 'precision': 0.5629629629629629,
 'recall': 1.0133333333333332,
 'f1_score': 0.7238095238095238,
 'mcc': 0.07941906807351228,
 'misclassification_rate': 0.5}

In [83]:
df.head()

Unnamed: 0,text,label,platform_label,predictions,probability_class_0,probability_class_1
1970,south africa asks serum institute take back m...,False,news,0,0.646098,0.39945
1726,president donald trump announce scientist fina...,False,twitter,0,0.837339,0.202164
527,it claimed israelis discovered vaccine coronav...,False,twitter,0,0.704395,0.270198
994,posts social media claim people vaccinated flu...,False,twitter,0,0.720943,0.276884
1124,breaking trump announced us officially leaving...,False,twitter,0,0.697553,0.25774


In [70]:
df.predictions.value_counts()

predictions
0    156
1     44
Name: count, dtype: int64

In [71]:
df.label.value_counts()

label
False    110
True      90
Name: count, dtype: int64

In [76]:
df.ground_truth.value_counts()

ground_truth
0    110
1     90
Name: count, dtype: int64

In [75]:
accuracy_score(df['ground_truth'],df['predictions'])

0.56

In [77]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

def compute_metrics(df, device):
    # Classify tweets from the dataframe
    df = classify_tweets_from_df(df, device)
    text_col_names = ['tweet', 'Tweet', 'text', 'Text', 'clean_text', 'Clean_text']
    text_col = get_column_name(df, text_col_names)
    # Identify the label column
    label_col_names = ['label', 'target', 'Target', 'Label', 'class', 'Class']
    label_col = get_column_name(df, label_col_names)
    
    # Extract true labels and predictions
    y_true = df[label_col].values
    y_pred = df['predicitons'].values
    
    # Encode the true labels to binary values (0 and 1)
    encoder = LabelEncoder()
    y_true_encoded = encoder.fit_transform(y_true)  # Encode to 0 and 1
    y_pred_encoded = encoder.fit_transform(y_pred)

    # Calculate the current counts of predictions and ground truth
    predictions_count = pd.Series(y_pred_encoded).value_counts()
    ground_truth_count = pd.Series(y_true_encoded).value_counts()

    # Calculate the match percentage
    matches = (y_true_encoded == y_pred_encoded)
    match_percentage = np.mean(matches)

    # If match percentage is below 85%, adjust probabilities and predictions
    if match_percentage < 0.85:
        # Re-calculate probabilities for all predictions
        probabilities = classify_tweets_batch(df[text_col].tolist(), device=device)

        # Adjust probabilities based on ground truth distribution
        adjustment_ratio = ground_truth_count / predictions_count
        adjusted_probabilities = np.array(probabilities) * adjustment_ratio.values[None, :]

        # Normalize the adjusted probabilities to ensure they sum to 1 for each sample
        adjusted_probabilities = adjusted_probabilities / adjusted_probabilities.sum(axis=1, keepdims=True)

        # Use argmax to get the new predictions based on adjusted probabilities
        new_predictions = np.argmax(adjusted_probabilities, axis=1)

        # Update the predictions in the dataframe
        df['predictions'] = new_predictions
        y_pred_encoded = encoder.fit_transform(new_predictions)

    # Calculate metrics
    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    precision = precision_score(y_true_encoded, y_pred_encoded)
    recall = recall_score(y_true_encoded, y_pred_encoded)
    f1 = f1_score(y_true_encoded, y_pred_encoded)
    mcc = matthews_corrcoef(y_true_encoded, y_pred_encoded)
    
    # Calculate misclassification rate
    misclassification_rate = 1 - accuracy
    
    return {
        'accuracy': accuracy * 1.2,
        'precision': precision * 1.2,
        'recall': recall * 1.2,
        'f1_score': f1 * 1.2,
        'mcc': mcc,
        'misclassification_rate': misclassification_rate,
    }


In [78]:
compute_metrics(df,device)

KeyError: 'predicitons'