In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/election-dataset-2023-nig-pres/training_data_test.csv
/kaggle/input/election-dataset-2023-nig-pres/training_data_updated.csv
/kaggle/input/election-dataset-2023-nig-pres/test_data.csv


In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score



In [4]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Read the dataset from CSV
train_dataset = pd.read_csv("/kaggle/input/election-dataset-2023-nig-pres/training_data_updated.csv")

# Get the list of stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function to preprocess text and create tokens
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word not in stop_words and word not in punctuation]  # Remove stopwords and punctuation
    return tokens

# Apply preprocessing and add tokens column
train_dataset['tokens'] = train_dataset['preprocessed_text'].apply(preprocess_text)



# Load the dataset from CSV
test_dataset = pd.read_csv("/kaggle/input/election-dataset-2023-nig-pres/training_data_test.csv")

# Get the list of stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function to preprocess text and create tokens
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word not in stop_words and word not in punctuation]  # Remove stopwords and punctuation
    return tokens

# Apply preprocessing and add tokens column
test_dataset['tokens'] = test_dataset['preprocessed_text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Print the updated dataset
print(train_dataset.head())

                                   preprocessed_text candidate sentiment  \
0  user user but it won't work against obidatti t...       obi   neutral   
1  just lol …..with all the thugs drama trying to...       obi  positive   
2  i no see aduke name here o😂😂😂 , my sister chec...       obi   neutral   
3  where are my warri obedients?obidatti2023 obid...       obi  positive   
4  apc, lp, pdp and other political parties prote...   neutral   neutral   

                                              tokens  
0  [user, user, wo, n't, work, obidatti, ticket, ...  
1  [lol, …..with, thugs, drama, trying, rig, obi,...  
2  [see, aduke, name, o😂😂😂, sister, check, name, ...  
3  [warri, obedients, obidatti2023, obidatti, obi...  
4  [apc, lp, pdp, political, parties, protest, us...  


In [6]:
# Print the first few rows of the dataset
print(test_dataset.head())

                                   preprocessed_text candidate  sentiment  \
0  user user but it won't work against obidatti t...       obi        NaN   
1  just lol …..with all the thugs drama trying to...       obi        NaN   
2  i no see aduke name here o😂😂😂 , my sister chec...       obi        NaN   
3  where are my warri obedients?obidatti2023 obid...       obi        NaN   
4  apc, lp, pdp and other political parties prote...   neutral        NaN   

                                              tokens  
0  [user, user, wo, n't, work, obidatti, ticket, ...  
1  [lol, …..with, thugs, drama, trying, rig, obi,...  
2  [see, aduke, name, o😂😂😂, sister, check, name, ...  
3  [warri, obedients, obidatti2023, obidatti, obi...  
4  [apc, lp, pdp, political, parties, protest, us...  


In [7]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Instantiate VADER
vader = SentimentIntensityAnalyzer()

# Get the list of stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function to preprocess text and create tokens
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word not in stop_words and word not in punctuation]  # Remove stopwords and punctuation
    return tokens

# Function to predict sentiment using VADER
def predict_sentiment(text):
    scores = vader.polarity_scores(text)
    if scores['compound'] > 0:
        return 'positive'
    elif scores['compound'] < 0:
        return 'negative'
    else:
        return 'neutral'

# Preprocess test data and predict sentiment using VADER
test_dataset['sentiment'] = test_dataset['tokens'].apply(predict_sentiment)

# Compare predicted sentiment with actual sentiment from training dataset
accuracy = accuracy_score(train_dataset['sentiment'], test_dataset['sentiment'])
print("Accuracy:", accuracy)


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Accuracy: 0.5207142857142857


In [8]:
test_dataset.head()

Unnamed: 0,preprocessed_text,candidate,sentiment,tokens
0,user user but it won't work against obidatti t...,obi,neutral,"[user, user, wo, n't, work, obidatti, ticket, ..."
1,just lol …..with all the thugs drama trying to...,obi,neutral,"[lol, …..with, thugs, drama, trying, rig, obi,..."
2,"i no see aduke name here o😂😂😂 , my sister chec...",obi,neutral,"[see, aduke, name, o😂😂😂, sister, check, name, ..."
3,where are my warri obedients?obidatti2023 obid...,obi,neutral,"[warri, obedients, obidatti2023, obidatti, obi..."
4,"apc, lp, pdp and other political parties prote...",neutral,neutral,"[apc, lp, pdp, political, parties, protest, us..."


In [9]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

In [10]:
# Load the test dataset from CSV
test_df = pd.read_csv("/kaggle/input/election-dataset-2023-nig-pres/test_data.csv")

# Apply preprocessing and add tokens column
test_df['tokens'] = test_df['preprocessed_text'].apply(preprocess_text)

In [11]:
test_df.head()

Unnamed: 0,preprocessed_text,candidate,sentiment,tokens
0,some people don't deserve to be called agents ...,obi,,"[people, n't, deserve, called, agents, obidatt..."
1,i just arrived at the venue of the breakfast t...,obi,,"[arrived, venue, breakfast, town, hall, meetin..."
2,user thank you very much... we the obidatti wa...,obi,,"[user, thank, much, ..., obidatti, wait, till,..."
3,what's happening over there in the north? it's...,obi,,"['s, happening, north, 's, quiet, nigeriadecid..."
4,abuja prophet don join road show for obidatti🔥...,obi,,"[abuja, prophet, join, road, show, obidatti🔥, ..."


In [12]:
# Remove leading and trailing spaces from the "sentiment" column
train_dataset['sentiment'] = train_dataset['sentiment'].str.strip()

# View unique values of the "sentiment" column
unique_sentiments = train_dataset['sentiment'].unique()
print(unique_sentiments)

['neutral' 'positive' 'negative']


In [13]:
# Count the unique number of labels in the 'sentiment' column
unique_labels_count = train_dataset['sentiment'].value_counts()

# Print the result
print("Unique Number of Labels in 'sentiment' Column:")
print(unique_labels_count)


Unique Number of Labels in 'sentiment' Column:
sentiment
neutral     2198
positive    1860
negative     142
Name: count, dtype: int64


In [14]:
# Define the training function
def train_model(model, train_dataloader, optimizer, scheduler, device, epochs):
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/Training"):
            batch = tuple(t.to(device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Training Loss: {avg_train_loss:.4f}")


# Preprocess the text data (assuming 'text' column contains text data)
texts = train_dataset['preprocessed_text'].tolist()
labels = train_dataset['sentiment'].tolist()

# Map textual labels to numerical values
label_map = {"neutral": 0, "positive": 1, "negative": 2}
labels = [label_map[label] for label in labels]

# Tokenize the texts
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Convert labels to tensors
labels = torch.tensor(labels)

# Create TensorDataset
dataset = TensorDataset(encoded_texts['input_ids'], encoded_texts['attention_mask'], labels)

# Create a DataLoader
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained RoBERTa model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
model.to(device)

# Set hyperparameters
epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Define output directory for trained model
output_dir = "./output"

# Training arguments
training_args = {
    "output_dir": output_dir,
    "num_train_epochs": epochs,
    "per_device_train_batch_size": 32,
    "save_steps": -1,  # Save checkpoints after each epoch
    "save_total_limit": 1,  # Only keep one checkpoint
}

# Train the model
train_model(model, train_dataloader, optimizer, scheduler, device, epochs)

# Save the model
model.save_pretrained(output_dir)

# Optionally, provide feedback that the file was successfully loaded
print("Training dataset loaded successfully.")

# Optionally, provide feedback that the model was successfully saved
print(f"Model saved to {output_dir}.")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Epoch 1/Training: 100%|██████████| 132/132 [01:20<00:00,  1.63it/s]


Training Loss: 0.7128
Epoch 2/3


Epoch 2/Training: 100%|██████████| 132/132 [01:20<00:00,  1.65it/s]


Training Loss: 0.5817
Epoch 3/3


Epoch 3/Training: 100%|██████████| 132/132 [01:20<00:00,  1.64it/s]


Training Loss: 0.5250
Training dataset loaded successfully.
Model saved to ./output.


In [15]:
# Remove leading and trailing spaces from the "sentiment" column
#test_df['sentiment'] = test_df['sentiment'].str.strip()

# View unique values of the "sentiment" column
unique_sentiments = test_df['sentiment'].unique()
print(unique_sentiments)

[nan]


In [16]:
# Load pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Preprocess the text data
test_texts = test_df['preprocessed_text'].tolist()

# Define batch size
batch_size = 32

# Calculate the total number of batches
total_batches = len(test_texts) // batch_size + (len(test_texts) % batch_size != 0)

# Load the trained model
model = RobertaForSequenceClassification.from_pretrained("/kaggle/working/output/")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Lists to store predicted sentiments and indices
predicted_sentiments = []
batch_indices = []

# Iterate through batches
for i in range(total_batches):
    # Get the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(test_texts))
    batch_texts = test_texts[start_idx:end_idx]

    # Tokenize the batch
    encoded_batch = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
    encoded_batch = {key: value.to(device) for key, value in encoded_batch.items()}

    # Perform inference on the batch
    with torch.no_grad():
        outputs = model(**encoded_batch)
        logits = outputs.logits

    # Get predicted labels
    predicted_labels = torch.argmax(logits, dim=1).tolist()
    predicted_sentiments.extend(predicted_labels)
    batch_indices.extend(range(start_idx, end_idx))

# Map numerical labels back to original sentiment labels
label_map = {0: 'neutral', 1: 'positive', 2: 'negative'}
predicted_sentiments = [label_map[label] for label in predicted_sentiments]

# Add predicted sentiments to the test dataframe at the correct indices
test_df['sentiment'] = None  # Initialize the column with None values
for idx, sentiment in zip(batch_indices, predicted_sentiments):
    test_df.at[idx, 'sentiment'] = sentiment

# Save the dataframe with predicted sentiments to a new CSV file
test_df.to_csv("path_to_save_predicted_test_results.csv", index=False)

print("Predictions saved successfully.")


Predictions saved successfully.


In [17]:
# Remove leading and trailing spaces from the "sentiment" column
#test_df['sentiment'] = test_df['sentiment'].str.strip()

# View unique values of the "sentiment" column
unique_sentiments = test_df['sentiment'].unique()
print(unique_sentiments)

['positive' 'neutral']


In [18]:
test_df.head()

Unnamed: 0,preprocessed_text,candidate,sentiment,tokens
0,some people don't deserve to be called agents ...,obi,positive,"[people, n't, deserve, called, agents, obidatt..."
1,i just arrived at the venue of the breakfast t...,obi,neutral,"[arrived, venue, breakfast, town, hall, meetin..."
2,user thank you very much... we the obidatti wa...,obi,positive,"[user, thank, much, ..., obidatti, wait, till,..."
3,what's happening over there in the north? it's...,obi,neutral,"['s, happening, north, 's, quiet, nigeriadecid..."
4,abuja prophet don join road show for obidatti🔥...,obi,positive,"[abuja, prophet, join, road, show, obidatti🔥, ..."


In [19]:
from sklearn.metrics import accuracy_score, f1_score

# Load pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Preprocess the text data from the test dataset
#test_texts = test_dataset['preprocessed_text'].tolist()

# Define batch size
batch_size = 32

# Calculate the total number of batches
total_batches = len(test_texts) // batch_size + (len(test_texts) % batch_size != 0)

# Load the trained model
model = RobertaForSequenceClassification.from_pretrained("/kaggle/working/output/")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Lists to store predicted sentiments
predicted_sentiments = []

# Iterate through batches
for i in range(total_batches):
    # Get the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(test_texts))
    batch_texts = test_texts[start_idx:end_idx]

    # Tokenize the batch
    encoded_batch = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
    encoded_batch = {key: value.to(device) for key, value in encoded_batch.items()}

    # Perform inference on the batch
    with torch.no_grad():
        outputs = model(**encoded_batch)
        logits = outputs.logits

    # Get predicted labels
    predicted_labels = torch.argmax(logits, dim=1).tolist()
    predicted_sentiments.extend(predicted_labels)

# Map numerical labels back to original sentiment labels
label_map = {0: 'neutral', 1: 'positive', 2: 'negative'}
predicted_sentiments = [label_map[label] for label in predicted_sentiments]

# Get true sentiments from the test dataset
true_sentiments = test_df['sentiment'].tolist()

# Ensure that the lengths of true_sentiments and predicted_sentiments are the same
if len(true_sentiments) != len(predicted_sentiments):
    raise ValueError("Number of samples in true_sentiments and predicted_sentiments lists do not match.")

# Calculate accuracy and F1 score
accuracy = accuracy_score(true_sentiments, predicted_sentiments)
f1 = f1_score(true_sentiments, predicted_sentiments, average='macro')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 1.0
F1 Score: 1.0


In [20]:
test_dataset.head()

Unnamed: 0,preprocessed_text,candidate,sentiment,tokens
0,user user but it won't work against obidatti t...,obi,neutral,"[user, user, wo, n't, work, obidatti, ticket, ..."
1,just lol …..with all the thugs drama trying to...,obi,neutral,"[lol, …..with, thugs, drama, trying, rig, obi,..."
2,"i no see aduke name here o😂😂😂 , my sister chec...",obi,neutral,"[see, aduke, name, o😂😂😂, sister, check, name, ..."
3,where are my warri obedients?obidatti2023 obid...,obi,neutral,"[warri, obedients, obidatti2023, obidatti, obi..."
4,"apc, lp, pdp and other political parties prote...",neutral,neutral,"[apc, lp, pdp, political, parties, protest, us..."


In [21]:
test_df.head()

Unnamed: 0,preprocessed_text,candidate,sentiment,tokens
0,some people don't deserve to be called agents ...,obi,positive,"[people, n't, deserve, called, agents, obidatt..."
1,i just arrived at the venue of the breakfast t...,obi,neutral,"[arrived, venue, breakfast, town, hall, meetin..."
2,user thank you very much... we the obidatti wa...,obi,positive,"[user, thank, much, ..., obidatti, wait, till,..."
3,what's happening over there in the north? it's...,obi,neutral,"['s, happening, north, 's, quiet, nigeriadecid..."
4,abuja prophet don join road show for obidatti🔥...,obi,positive,"[abuja, prophet, join, road, show, obidatti🔥, ..."


In [22]:
# Import necessary libraries
import pandas as pd
import torch
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score

# Load test dataset
test_dataset = test_df
#pd.read_csv("/kaggle/input/election-dataset-2023-nig-pres/test_data.csv")

# Define batch size for processing
batch_size = 32

# Function to predict sentiment using VADER
def predict_sentiment_vader(texts):
    analyzer = SentimentIntensityAnalyzer()
    sentiments = []
    for text in texts:
        scores = analyzer.polarity_scores(text)
        if scores['compound'] > 0:
            sentiments.append('positive')
        elif scores['compound'] < 0:
            sentiments.append('negative')
        else:
            sentiments.append('neutral')
    return sentiments

# Function to predict sentiment using RoBERTa model
def predict_sentiment_roberta(texts):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained("/kaggle/working/output/")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    sentiments = []
    for text in texts:
        encoded_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        encoded_text = {key: value.to(device) for key, value in encoded_text.items()}
        with torch.no_grad():
            outputs = model(**encoded_text)
            logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()
        label_map = {0: 'neutral', 1: 'positive', 2: 'negative'}
        sentiments.append(label_map[predicted_label])
    return sentiments

# Predict sentiments using VADER in batches
vader_sentiments = []
for i in range(0, len(test_dataset), batch_size):
    batch_texts = test_dataset['preprocessed_text'].iloc[i:i+batch_size].tolist()
    vader_sentiments.extend(predict_sentiment_vader(batch_texts))

# Predict sentiments using RoBERTa in batches
roberta_sentiments = []
for i in range(0, len(test_dataset), batch_size):
    batch_texts = test_dataset['preprocessed_text'].iloc[i:i+batch_size].tolist()
    roberta_sentiments.extend(predict_sentiment_roberta(batch_texts))

# Function to ensemble predictions
def ensemble_predictions(vader_sentiment, roberta_sentiment):
    if vader_sentiment == roberta_sentiment:
        return vader_sentiment
    else:
        return 'neutral'  # Default to neutral if there's no majority

# Add vader_sentiment and roberta_sentiment columns to the test dataset
test_dataset['vader_sentiment'] = vader_sentiments
test_dataset['roberta_sentiment'] = roberta_sentiments

# Drop rows with missing values in the 'sentiment' column
test_dataset.dropna(subset=['sentiment'], inplace=True)

# Ensemble predictions
test_dataset['ensemble_sentiment'] = test_dataset.apply(lambda x: ensemble_predictions(x['vader_sentiment'], x['roberta_sentiment']), axis=1)

# Calculate accuracy
accuracy = accuracy_score(test_dataset['sentiment'], test_dataset['ensemble_sentiment'])
print("Ensemble Model Accuracy:", accuracy)


Ensemble Model Accuracy: 0.7227777777777777


In [23]:
test_dataset.head()

Unnamed: 0,preprocessed_text,candidate,sentiment,tokens,vader_sentiment,roberta_sentiment,ensemble_sentiment
0,some people don't deserve to be called agents ...,obi,positive,"[people, n't, deserve, called, agents, obidatt...",neutral,positive,neutral
1,i just arrived at the venue of the breakfast t...,obi,neutral,"[arrived, venue, breakfast, town, hall, meetin...",neutral,neutral,neutral
2,user thank you very much... we the obidatti wa...,obi,positive,"[user, thank, much, ..., obidatti, wait, till,...",positive,positive,positive
3,what's happening over there in the north? it's...,obi,neutral,"['s, happening, north, 's, quiet, nigeriadecid...",neutral,neutral,neutral
4,abuja prophet don join road show for obidatti🔥...,obi,positive,"[abuja, prophet, join, road, show, obidatti🔥, ...",negative,positive,neutral
