In [1]:
# Import required modules
from datasets import load_dataset, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import pandas as pd

In [14]:
# Load the dataset
dataset = load_dataset("LYTinn/sentiment-analysis-tweet")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 4913
    })
})


In [15]:
def map_labels(example):
    label_mapping = {-1: 0, 0: 1, 1: 2}  # Map -1 to 0, 0 to 1, and 1 to 2
    example['label'] = label_mapping[example['label']]
    return example

dataset = dataset.map(map_labels)

In [4]:
# # Initialize tokenizer and model
# model_name = "roberta-base"
# tokenizer = RobertaTokenizer.from_pretrained(model_name)
# model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Assuming 3 labels (positive, negative, neutral)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Load the saved model and tokenizer
model_path = "./sentiment-roberta"
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

In [18]:
# Tokenize the data
def preprocess_function(examples):
    return tokenizer(examples['tweet'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [6]:
# # Shuffle the dataset once
# shuffled_dataset = tokenized_datasets["train"].shuffle(seed=42)

# # Split into train and validation sets
# train_dataset = shuffled_dataset.select(range(8000))  # First 8000 samples for training
# valid_dataset = shuffled_dataset.select(range(8000, 10000))  # Next 2000 samples for validation


In [19]:
from datasets import Dataset

# Example: Splitting using train_test_split
train_valid_split = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_valid_split["train"]
valid_dataset = train_valid_split["test"]


In [20]:
# Define compute metrics function
def compute_metrics(eval_pred):
    # Unpack logits and labels
    logits, labels = eval_pred

    # Convert logits to predictions on CPU
    predictions = torch.argmax(torch.tensor(logits), axis=1).cpu()

    # Convert labels to CPU if they are not already
    labels = torch.tensor(labels).cpu()

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [21]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(torch.cuda.is_available())
# Move model to the device
model.to(device)

True


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [31]:
# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
    )

In [48]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [33]:
# Fine-tune the model from the last checkpoint
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2632,0.07224,0.983,0.983041,0.983258,0.983


TrainOutput(global_step=1000, training_loss=0.31231107711791994, metrics={'train_runtime': 413.7402, 'train_samples_per_second': 19.336, 'train_steps_per_second': 2.417, 'total_flos': 2104907341824000.0, 'train_loss': 0.31231107711791994, 'epoch': 1.0})

In [34]:
# Evaluate the model
results = trainer.evaluate()
print(f"Evaluation Results: {results}")

Evaluation Results: {'eval_loss': 0.07224017381668091, 'eval_accuracy': 0.983, 'eval_f1': 0.9830409807283051, 'eval_precision': 0.983257554532163, 'eval_recall': 0.983, 'eval_runtime': 30.7049, 'eval_samples_per_second': 65.136, 'eval_steps_per_second': 8.142, 'epoch': 1.0}


In [35]:
# Save the model
model.save_pretrained("./sentiment-roberta")
tokenizer.save_pretrained("./sentiment-roberta")

('./sentiment-roberta\\tokenizer_config.json',
 './sentiment-roberta\\special_tokens_map.json',
 './sentiment-roberta\\vocab.json',
 './sentiment-roberta\\merges.txt',
 './sentiment-roberta\\added_tokens.json')

In [2]:
# Load the CSV file
file_path = "fulldata.csv"
df = pd.read_csv(file_path)

In [3]:
# Ensure the text column is properly named
text_column = "comment"
assert text_column in df.columns, f"'{text_column}' column not found in CSV"

In [4]:
# Load the saved model and tokenizer
model_path = "./sentiment-roberta"
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
model.eval()  # Set the model to evaluation mode

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [23]:
# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(torch.cuda.is_available())

True


In [6]:
# Function to predict sentiment
def predict_sentiment(text):
    # Tokenize input and move to the correct device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    
    # Ensure model is on the same device as inputs
    model.to(device)
    
    # Predict without gradient calculation
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class


In [7]:
# Small Subset Debugging: Processes a few rows to confirm the function works correctly before applying it to the entire dataset.
sample_texts = df[text_column].head(10).tolist()  # Get the first 10 rows
predictions = [predict_sentiment(text) for text in sample_texts]
print(predictions)

[1, 0, 1, 0, 1, 1, 2, 1, 0, 0]


In [20]:
import time

# Define a mapping for predicted labels to sentiment for a better understanding
label_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Start timing the process
start_time = time.time()

# Get random 500 rows
sample_texts = df[text_column].sample(n=100, random_state=40).tolist()

# Predict sentiment for each text
predictions = [predict_sentiment(text) for text in sample_texts]

# Print both the text and its predicted sentiment label
for text, pred in zip(sample_texts, predictions):
    sentiment = label_mapping.get(pred, "Unknown Yet")
    print(f"Text: {text} \nPredicted Sentiment: {sentiment}\n")

# End timing the process
end_time = time.time()

# Calculate and print the processing time
processing_time = end_time - start_time
print(f"Total Processing Time: {processing_time:.2f} seconds")


Text: What is that suit fit my Mamba. Miss you man. 
Predicted Sentiment: Positive

Text: Fuck I want to be in New Zealand, America kind of sucks right now in every way 
Predicted Sentiment: Negative

Text: How did you get the eggs so perfect?? 
Predicted Sentiment: Positive

Text: Momo Jenga 
Predicted Sentiment: Neutral

Text: This is surprisingly wholesome 
Predicted Sentiment: Positive

Text: I can't wait to see CS:GO gameplay on toilets 
Predicted Sentiment: Positive

Text: I love my Lakers. 
Predicted Sentiment: Positive

Text: /r/AccidentalRenaissance  
Predicted Sentiment: Neutral

Text: Where’s the brew? 
Predicted Sentiment: Neutral

Text: omg so aesthetically pleasing to look at, wondering what was in those dessert cups... 
Predicted Sentiment: Positive

Text: Now where's that video of the japanese guy with a really big and pillow-y enter key 
Predicted Sentiment: Negative

Text: The people that need to hear this message won’t listen and the people that need others to hear i

In [36]:
# Preprocessing: Remove Rows with Missing or Invalid Data
# Use this for the datasets which we didn't do fine-tuning on it.

df = df.dropna(subset=[text_column])
df[text_column] = df[text_column].astype(str)

In [37]:
# Apply the prediction function to the text column
df["predicted_label"] = df[text_column].apply(predict_sentiment)

In [38]:
# Save the results
output_file = "sentiment_predictions.csv"
df.to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}")

Predictions saved to sentiment_predictions.csv
