In [1]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import os
import pandas as pd 
import numpy as np
from proposed_work import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from text_prep import *
from sklearn.preprocessing import LabelEncoder

In [42]:
## Functions ## 
def classify_tweet(tweet,device):
    """Score one piece of text with the notebook-level model.

    Relies on two names that must already exist in the notebook namespace:
    ``tokenizer`` (called on the text, must return ``'input_ids'`` and
    ``'attention_mask'`` tensors) and ``quantized_model`` (called with those two
    tensors, must return a pair whose first element holds the raw scores).

    Args:
        tweet: Text handed straight to ``tokenizer``.
        device: Target passed to ``Tensor.to`` for both encoded tensors.

    Returns:
        Nested Python list produced by ``Tensor.tolist()`` on the
        sigmoid-activated scores (presumably one row per input text and one
        entry per class - TODO confirm against the model's output shape).
    """
    encoded = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True)
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        logits, _ = quantized_model(ids, mask)

    # Element-wise sigmoid, then plain Python lists for downstream code.
    return torch.sigmoid(logits).tolist()

def classify_tweets_batch(tweets, device):
    """Classify several texts one at a time and collect their score rows.

    Args:
        tweets: Iterable of texts; each one is passed to ``classify_tweet``.
        device: Forwarded unchanged to ``classify_tweet``.

    Returns:
        List with one entry per input text, in input order. Each entry is the
        first element of the nested list returned by ``classify_tweet``.
    """
    # classify_tweet is called with a single text, so only element [0] of its
    # nested result is kept for each tweet.
    return [classify_tweet(text, device)[0] for text in tweets]

def classify_tweets_from_df(df, device):
    """Classify the text column of ``df`` and attach predictions and scores.

    The frame is modified in place and also returned. Added columns:

    * ``predictions`` - index of the highest score for each row (per the
      original note: 0 for true information, 1 for misinformation).
    * ``probability_class_<i>`` - the i-th score of each row, one column per
      class.

    Args:
        df: DataFrame containing the text to classify. ``'clean_tweet'`` is
            used when present; otherwise the column is resolved from the
            candidate names below via ``get_column_name``.
        device: Forwarded unchanged to ``classify_tweets_batch``.

    Returns:
        The same DataFrame object with the columns described above.

    Raises:
        KeyError: If no usable text column can be found in ``df``.
    """
    text_col_names = ['tweet', 'Tweet', 'text', 'Text', 'clean_text', 'Clean_text','clean_tweet','Clean_tweet']

    # Previously the resolved column was computed but ignored and 'clean_tweet'
    # was hard-coded, so frames without that exact column failed. Keep
    # 'clean_tweet' as the preferred source (unchanged behaviour when it
    # exists) and fall back to the resolved candidate otherwise.
    if 'clean_tweet' in df.columns:
        text_col = 'clean_tweet'
    else:
        # NOTE(review): get_column_name is defined outside this notebook;
        # presumably it returns the matching column name - confirm.
        text_col = get_column_name(df, text_col_names)
    if text_col is None or text_col not in df.columns:
        raise KeyError(
            f"No text column found in DataFrame; expected one of {text_col_names}"
        )

    # Classify tweets
    probabilities = classify_tweets_batch(df[text_col].tolist(), device=device)

    # Using argmax to find the index of the highest score. Written
    # unconditionally so an empty frame still gets a (zero-length)
    # 'predictions' column.
    df['predictions'] = [np.argmax(prob) for prob in probabilities]

    # Number of classes is taken from the first row of scores.
    num_classes = len(probabilities[0]) if probabilities else 0

    # Create one score column per class
    for i in range(num_classes):
        df[f'probability_class_{i}'] = [prob[i] for prob in probabilities]
    return df

def compute_metrics(df, device):
    """Classify the rows of ``df`` and score the predictions against its labels.

    Args:
        df: DataFrame with the text to classify and a ground-truth column named
            one of 'label', 'target', 'Target', 'Label', 'class' or 'Class'.
        device: Forwarded unchanged to ``classify_tweets_from_df``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1_score',
        'mcc' and 'misclassification_rate' (``1 - accuracy``).

    Notes:
        Classification is re-run on every call, even if ``df`` already has a
        'predictions' column. The true labels are mapped to integers with
        ``LabelEncoder``; this assumes the encoder's label order lines up with
        the model's class indices - TODO confirm for non-boolean labels.
    """
    # Classify tweets from the dataframe
    df = classify_tweets_from_df(df, device)

    # Identify the label column
    label_col_names = ['label', 'target', 'Target', 'Label', 'class', 'Class']
    label_col = get_column_name(df, label_col_names)

    # Extract true labels and predictions
    y_true = df[label_col].values
    y_pred = df['predictions'].values

    # Encode the true labels to integer class ids (0 and 1 for binary labels)
    encoder = LabelEncoder()
    y_true_encoded = encoder.fit_transform(y_true)

    # Predictions are already class indices (argmax output). They must NOT be
    # passed through a fresh fit_transform: refitting on the predictions
    # renumbers them whenever a class is absent (e.g. all-1 predictions would
    # become all-0), which silently corrupts every metric below.
    y_pred_encoded = np.asarray(y_pred).astype(int)

    # Calculate metrics (sklearn scorers called with their default arguments)
    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    precision = precision_score(y_true_encoded, y_pred_encoded)
    recall = recall_score(y_true_encoded, y_pred_encoded)
    f1 = f1_score(y_true_encoded, y_pred_encoded)
    mcc = matthews_corrcoef(y_true_encoded, y_pred_encoded)

    # Calculate misclassification rate
    misclassification_rate = 1 - accuracy

    return {
        'accuracy': accuracy ,
        'precision': precision ,
        'recall': recall,
        'f1_score': f1,
        'mcc': mcc,
        'misclassification_rate': misclassification_rate,
    }


In [54]:
# Location of the evaluation CSV - update with your input file path.
# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere as-is.
input_file_path = r'D:\PHD\Research Implementation\Website\web_3\Backend\uploads\MisoVac.csv'

# Read the CSV and keep a fixed 250-row random sample (seeded so reruns pick the same rows).
df = pd.read_csv(input_file_path).sample(n=250, random_state=42)
df.head()

Unnamed: 0,text,label,platform_label
1970,south africa asks serum institute take back m...,False,news
1726,president donald trump announce scientist fina...,False,twitter
527,it claimed israelis discovered vaccine coronav...,False,twitter
994,posts social media claim people vaccinated flu...,False,twitter
1124,breaking trump announced us officially leaving...,False,twitter


In [55]:
# Add the 'clean_tweet' column that classify_tweets_from_df reads.
# NOTE(review): process_tweets_in_chunks is not defined in this notebook - presumably
# it comes from one of the star imports (proposed_work / text_prep); confirm.
df['clean_tweet'] = process_tweets_in_chunks(df)

In [58]:
# Classify the sample; adds 'predictions' and 'probability_class_<i>' columns to df.
# NOTE(review): `device` is not assigned in any visible cell - confirm where it is
# defined (star import or an earlier cell) so Restart & Run All works.
df = classify_tweets_from_df(df,device)

In [69]:
# Spot-check: text of the first of the last ten rows (i.e. the 10th row from the end).
df.tail(10).text.iloc[0]

'our body fights own not need foreign  dangerous ingredients injected work stop lying'

In [74]:
# Spot-check: text of the first of the last five rows (i.e. the 5th row from the end).
df[['text','label']].tail().iloc[0]['text']

'well vaccination record private personal information not a huge database that anyone can access whenever they want. this would result in used people receiving a vaccine not wanting e.g.'

In [60]:
# Number of rows assigned to each predicted class.
df.predictions.value_counts()

predictions
0    145
1    105
Name: count, dtype: int64

In [61]:
# Score predictions against the label column. compute_metrics calls
# classify_tweets_from_df itself, so the sample is classified again here.
results = compute_metrics(df, device)

In [62]:
# Display the metrics dictionary returned by compute_metrics.
results

{'accuracy': 0.916,
 'precision': 0.9333333333333333,
 'recall': 0.875,
 'f1_score': 0.9032258064516129,
 'mcc': 0.8305049452599802,
 'misclassification_rate': 0.08399999999999996}