In [1]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import os
import pandas as pd 
import numpy as np
from proposed_work import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from text_prep import *
from sklearn.preprocessing import LabelEncoder

In [42]:
## Functions ## 
def classify_tweet(tweet, device):
    """Score one tweet with the notebook-level ``quantized_model``.

    The text is encoded by the notebook-level ``tokenizer`` (PyTorch tensors,
    truncation and padding enabled), the ids and attention mask are moved to
    ``device``, and the first element of the model's two-item return value is
    passed through an element-wise sigmoid.

    Args:
        tweet: Text handed to ``tokenizer``.
        device: Torch device the input tensors are moved to.

    Returns:
        The sigmoid scores converted with ``Tensor.tolist()``.
        NOTE(review): presumably a nested list of shape (1, num_classes) —
        confirm against the model's output shape.
    """
    encoded = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True)
    ids, mask = (encoded[key].to(device) for key in ('input_ids', 'attention_mask'))

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits, _ = quantized_model(ids, mask)

    return torch.sigmoid(logits).tolist()

def classify_tweets_batch(tweets, device):
    """Run ``classify_tweet`` on each tweet, one at a time, and collect the scores.

    Args:
        tweets: Iterable of tweet texts.
        device: Torch device forwarded to ``classify_tweet``.

    Returns:
        A list with one entry per tweet: the first element of whatever
        ``classify_tweet`` returned for it (assumed to be that tweet's list
        of per-class scores).
    """
    return [classify_tweet(text, device)[0] for text in tweets]

def classify_tweets_from_df(df, device):
    """Classify every tweet in ``df`` and attach predictions and per-class scores.

    The ``clean_tweet`` column is used when it exists (the column this function
    has always read). When it is missing, the text column is looked up with
    ``get_column_name`` instead of failing on the hard-coded name.

    Args:
        df: DataFrame holding the tweets. It is modified in place: a
            ``predictions`` column and one ``probability_class_<i>`` column
            per class are added.
        device: Torch device forwarded to ``classify_tweets_batch``.

    Returns:
        The same ``df`` object with the new columns.

    Raises:
        KeyError: If no usable text column can be found.
    """
    if 'clean_tweet' in df.columns:
        text_col = 'clean_tweet'
    else:
        text_col_names = ['tweet', 'Tweet', 'text', 'Text', 'clean_text', 'Clean_text', 'clean_tweet', 'Clean_tweet']
        text_col = get_column_name(df, text_col_names)
        if text_col is None or text_col not in df.columns:
            raise KeyError(
                f"No text column found; expected 'clean_tweet' or one of {text_col_names}"
            )

    # Classify tweets
    probabilities = classify_tweets_batch(df[text_col].tolist(), device=device)

    # Index of the highest score per tweet. The column is created even for an
    # empty frame so downstream code can always rely on it being present.
    # (Original note: 0 for true information, 1 for misinformation.)
    df['predictions'] = [np.argmax(prob) for prob in probabilities]

    # Number of classes is taken from the first score vector
    num_classes = len(probabilities[0]) if probabilities else 0

    # One column per class holding that class's score for every tweet
    for i in range(num_classes):
        df[f'probability_class_{i}'] = [prob[i] for prob in probabilities]
    return df

def compute_metrics(df, device):
    """Classify the tweets in ``df`` and score the predictions against its labels.

    Args:
        df: DataFrame with the tweet text and a ground-truth column that
            ``get_column_name`` can locate among ``label``/``target``/``class``
            (either capitalisation). ``classify_tweets_from_df`` adds the
            ``predictions`` column used here.
        device: Torch device forwarded to ``classify_tweets_from_df``.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``mcc`` and ``misclassification_rate`` (``1 - accuracy``). Precision,
        recall and F1 use scikit-learn's default settings.
    """
    # Classify tweets from the dataframe
    df = classify_tweets_from_df(df, device)
    # Identify the label column
    label_col_names = ['label', 'target', 'Target', 'Label', 'class', 'Class']
    label_col = get_column_name(df, label_col_names)
    # Extract true labels and predictions
    y_true = df[label_col].values
    y_pred = df['predictions'].values

    # Labels that are already 0/1 are used unchanged. Fitting a LabelEncoder on
    # them would renumber a frame that contains only class 1 to class 0.
    if set(y_true.tolist()) <= {0, 1}:
        y_true_encoded = y_true.astype(int)
    else:
        # Other label types (e.g. strings) are mapped to integers in
        # LabelEncoder's sorted order, as before.
        y_true_encoded = LabelEncoder().fit_transform(y_true)

    # Predictions are already class indices (argmax output). Re-fitting an
    # encoder on them would silently remap the indices whenever the model
    # predicts a single class, so they are used as-is.
    y_pred_encoded = y_pred

    # Calculate metrics
    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    precision = precision_score(y_true_encoded, y_pred_encoded)
    recall = recall_score(y_true_encoded, y_pred_encoded)
    f1 = f1_score(y_true_encoded, y_pred_encoded)
    mcc = matthews_corrcoef(y_true_encoded, y_pred_encoded)

    # Calculate misclassification rate
    misclassification_rate = 1 - accuracy

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'mcc': mcc,
        'misclassification_rate': misclassification_rate,
    }


In [43]:
# Location of the input CSV. Set the TWEETS_CSV_PATH environment variable to
# run the notebook on another machine; the default is the original local path.
input_file_path = os.environ.get(
    'TWEETS_CSV_PATH',
    r'D:\PHD\Research Implementation\Website\web_3\Backend\uploads\IndianElection19TwitterData_2000.csv',
)

SAMPLE_SIZE = 250  # number of tweets to classify in this run
SEED = 42          # fixed seed so the same rows are drawn on every run

# Read the CSV file into a DataFrame and keep the unsampled frame available
raw_df = pd.read_csv(input_file_path)
# Cap the sample size at the row count so smaller files do not raise
df = raw_df.sample(min(SAMPLE_SIZE, len(raw_df)), random_state=SEED)
df.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet
1860,95907,2019-04-12 01:45:13+00:00,IAmViswanatha,@eenadulivenews #Andrapradesh #APElections2019...
353,25521,2019-04-07 14:46:55+00:00,Alankrita_Srv,Rahul Gandhi has more chance to win the electi...
1333,140197,2019-01-11 14:27:24+00:00,cheez_m,A surprising comment made by a #SamajwadiParty...
905,78109,2019-04-29 16:46:38+00:00,Mini2411Singh,Y no so-called tough ques to @MamataOfficial o...
1289,99573,2019-04-10 06:57:13+00:00,ASundarS,@SwamiGeetika come and ask the same question i...


In [44]:
# Add a 'clean_tweet' column holding the output of process_tweets_in_chunks.
# NOTE(review): process_tweets_in_chunks is not defined in the visible cells —
# presumably it comes from the `text_prep` star import; confirm.
# This assignment modifies `df` in place.
df['clean_tweet'] = process_tweets_in_chunks(df)

In [45]:
# Display the frame to check the new 'clean_tweet' column next to the raw 'Tweet'.
# NOTE(review): this renders the whole frame; `df.head()` would keep the output short.
df

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,clean_tweet
1860,95907,2019-04-12 01:45:13+00:00,IAmViswanatha,@eenadulivenews #Andrapradesh #APElections2019...,time fail provide even basic facility also fai...
353,25521,2019-04-07 14:46:55+00:00,Alankrita_Srv,Rahul Gandhi has more chance to win the electi...,rahul gandhi chance win election rcb win match...
1333,140197,2019-01-11 14:27:24+00:00,cheez_m,A surprising comment made by a #SamajwadiParty...,surprising comment make leader ask stop rafael...
905,78109,2019-04-29 16:46:38+00:00,Mini2411Singh,Y no so-called tough ques to @MamataOfficial o...,socalled tough ques political killing tmc goon...
1289,99573,2019-04-10 06:57:13+00:00,ASundarS,@SwamiGeetika come and ask the same question i...,come ask question south india
...,...,...,...,...,...
115,92717,2019-04-16 09:05:00+00:00,saurabh_dokania,"My vote for Corruption free India, Clean India...",vote corruption free india clean india strong ...
829,64741,2019-01-04 17:01:11+00:00,mananbhattnavy,3. And above all #Respect.\nRGs continued mean...,rg continue meaningless pitch rafale give clea...
1937,44712,2019-03-09 18:24:18+00:00,Dalbir87,"#LokSabhaElections2019\n\nGoverner of Mijoram,...",governer mijoram krajshekharan bjp candidate l...
808,136014,2019-02-06 18:51:55+00:00,ravinder08,Visiting India after 6 years totally baffled b...,visit india year totally baffle pollution leve...


In [46]:
# Add the 'predictions' and 'probability_class_<i>' columns to the sampled tweets.
# NOTE(review): `device`, `tokenizer` and `quantized_model` are not defined in the
# visible cells — presumably they come from a star import or an earlier cell;
# confirm the notebook still runs after Restart & Run All.
df = classify_tweets_from_df(df,device)

In [50]:
# Show the raw text of the first tweet in the sampled frame.
df.Tweet.iloc[0]

'@eenadulivenews #Andrapradesh #APElections2019  this time #ElectionCommission FAILED to provide even basic facilities and also Failing #EVM &amp; #VVPAT problem, #EC would have been maintained/worked better to increase #Voting percentage. 🙏🙏\n#NarendraModi'

In [49]:
# Count how many tweets were assigned to each predicted class.
df.predictions.value_counts()

predictions
0    183
1     67
Name: count, dtype: int64

In [17]:
# Score the classifier against the ground-truth labels in `df`.
# NOTE(review): this cell's execution count (17) is lower than that of the cells
# above, and the frame displayed above has no label/target/class column — the
# metrics below were presumably produced from a different, labelled frame.
# Confirm which data `df` referred to when this ran.
results = compute_metrics(df, device)

In [18]:
# Display the metrics dictionary returned by compute_metrics.
results

{'accuracy': 0.976,
 'precision': 0.9649122807017544,
 'recall': 0.9821428571428571,
 'f1_score': 0.9734513274336283,
 'mcc': 0.9516807303422897,
 'misclassification_rate': 0.02400000000000002}