<a href="https://colab.research.google.com/github/IoanRoume/SentimentAnalysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U torchtext==0.15.2

# **1st DataSet (Tweets)**

## Download Dataset

In [None]:
import kagglehub

path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

print("Path to dataset files:", path)



Path to dataset files: /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


In [None]:
!ls /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2

twitter_training.csv  twitter_validation.csv


## Preprocess Data

In [None]:
import pandas as pd
import re
from pathlib import Path

# The CSV has no header row, so column names are supplied explicitly.
columns = ["id","entity","sentiment","tweet"]
# Resolve the file relative to the directory returned by kagglehub (`path`) so the
# cell keeps working when the cached dataset version or cache location changes.
df_train = pd.read_csv(Path(path) / "twitter_training.csv", header=None, names=columns)

In [None]:
# Fold "Irrelevant" into "Neutral", drop rows without tweet text and remove the id column.
# errors="ignore" keeps the cell re-runnable: on a second run 'id' is already gone.
df_train = (
    df_train
    .assign(sentiment=df_train['sentiment'].replace("Irrelevant", "Neutral"))
    .dropna(subset=['tweet'])
    .drop(columns=['id'], errors='ignore')
)

def clean_text(text):
  """Normalise a tweet: lowercase it, strip URLs and @mentions, keep letters only.

  Returns the cleaned string with runs of whitespace collapsed to single spaces
  and no leading/trailing whitespace.
  """
  lowered = text.lower()
  without_links = re.sub(r"http\S+|www\S+", "", lowered)
  without_mentions = re.sub(r"@\w+", "", without_links)
  letters_only = re.sub(r"[^a-zA-Z\s]", "", without_mentions)
  collapsed = re.sub(r"\s+", " ", letters_only).strip()
  # Only letters and spaces remain at this point, so this pass finds nothing to remove.
  return re.sub(r"[^\w\s]", "", collapsed)

df_train['tweet'] = df_train['tweet'].apply(clean_text)

## Tokenize and Numericalize Tweets

In [None]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence


tokenizer = get_tokenizer("basic_english")


def yield_tokens(data):
  """Lazily yield the token list of each text in `data`, using the notebook-level `tokenizer`."""
  yield from map(tokenizer, data)

# Build the vocabulary from every cleaned tweet, reserving "<unk>" and "<pad>" as special tokens.
vocab = build_vocab_from_iterator(yield_tokens(df_train['tweet']), specials=["<unk>", "<pad>"])
# Lookups of tokens that are not in the vocabulary fall back to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

def numericalize_text(text):
  """Tokenize `text` and return the vocabulary index of each token."""
  tokens = tokenizer(text)
  return vocab(tokens)

df_train['tweet_numerical'] = df_train['tweet'].apply(numericalize_text)

print(df_train[['tweet', 'tweet_numerical']].head())
print(f"Vocabulary size: {len(vocab)}")


def pad_tweets(tweets, max_length=50):
    """Truncate each tweet to `max_length` ids and right-pad them to a common length.

    Args:
        tweets: list of token-id lists.
        max_length: maximum number of ids kept per tweet.

    Returns:
        A 2-D integer tensor (batch first) padded with the "<pad>" index. Its width
        is the length of the longest truncated tweet, so at most `max_length`.
    """
    # dtype is forced to long: torch.tensor([]) would otherwise be float32, and
    # pad_sequence takes the output dtype from the first sequence, which would
    # break the embedding lookup if the first tweet is empty after cleaning.
    truncated = [torch.tensor(tweet[:max_length], dtype=torch.long) for tweet in tweets]
    return pad_sequence(truncated, batch_first=True, padding_value=vocab["<pad>"])

train_padded = pad_tweets(df_train['tweet_numerical'].tolist())
print(f"Padded tweet shape: {train_padded.shape}")


                                               tweet  \
0  im getting on borderlands and i will murder yo...   
1  i am coming to the borders and i will kill you...   
2  im getting on borderlands and i will kill you all   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands and i will murder yo...   

                                   tweet_numerical  
0       [31, 157, 14, 138, 5, 3, 50, 1640, 13, 27]  
1  [3, 99, 368, 4, 2, 6272, 5, 3, 50, 400, 13, 27]  
2        [31, 157, 14, 138, 5, 3, 50, 400, 13, 27]  
3       [31, 368, 14, 138, 5, 3, 50, 1640, 13, 27]  
4   [31, 157, 14, 138, 5, 3, 50, 1640, 13, 20, 27]  
Vocabulary size: 37369
Padded tweet shape: torch.Size([73996, 50])


## Create Datasets and DataLoaders for Train, Test, Validation

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split

class SentimentDataset(Dataset):
  """Pairs each tweet with its sentiment label so a DataLoader can batch them."""

  def __init__(self,tweets,labels):
    # Both containers are stored as given; nothing is copied or converted.
    self.tweets, self.labels = tweets, labels

  def __getitem__(self,index):
    tweet = self.tweets[index]
    label = self.labels[index]
    return tweet, label

  def __len__(self):
    # The number of samples is defined by the tweets container.
    return len(self.tweets)

# Integer class ids for the three sentiment labels.
label_mapping = {"Positive": 0, "Neutral": 1, "Negative": 2}
train_labels = df_train['sentiment'].map(label_mapping).tolist()

dataset = SentimentDataset(train_padded, train_labels)

# 80/10/10 split; the test split absorbs the rounding remainder so the sizes sum to len(dataset).
train_size = int(len(dataset) * 0.8)
val_size = int(len(dataset) * 0.1)
test_size = len(dataset) - train_size - val_size

# Seed the split so the same rows land in train/val/test on every run; an unseeded
# split makes the reported validation and test metrics irreproducible.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)


batch_size = 32

# Only the training data is shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Create LSTM Model

In [None]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
  """Embedding -> LSTM -> dropout -> linear classifier over batches of token ids."""

  def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
    self.lstm = nn.LSTM(
        input_size=embed_dim,
        hidden_size=hidden_dim,
        num_layers=num_layers,
        batch_first=True,
    )
    self.linear = nn.Linear(in_features=hidden_dim, out_features=output_dim)
    self.dropout = nn.Dropout(p=0.5)

  def forward(self,x):
    """Return one row of class logits per input sequence in `x`."""
    embedded = self.embedding(x)
    sequence_out, _state = self.lstm(embedded)
    # Only the LSTM output at the final time step feeds the classifier.
    final_step = sequence_out[:, -1, :]
    return self.linear(self.dropout(final_step))

## Initialize Model, Loss Function, Optimizer

In [None]:
# Model hyperparameters.
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = 3  # one logit per entry in label_mapping
device = "cuda" if torch.cuda.is_available() else "cpu"  # prefer the GPU when one is available


model = SentimentLSTM(vocab_size,embed_dim,hidden_dim,output_dim).to(device)

# Cross-entropy over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)


## Train and Evaluate

In [None]:
import numpy as np
def evaluate_model(model, loader, dataset):
  """Evaluate `model` on `loader` and print/return its mean loss and accuracy.

  Uses the notebook-level `criterion` and `device`.

  Args:
    model: network to evaluate; it is switched to eval mode.
    loader: DataLoader yielding (tweets, labels) batches.
    dataset: kept for backward compatibility with existing calls; unused, because
      the sample count is taken from the batches that are actually seen.

  Returns:
    (mean loss per sample, accuracy as a fraction in [0, 1]).
  """
  model.eval()
  correct, total = 0, 0
  loss_sum = 0.0
  with torch.no_grad():
    for tweets, labels in loader:
      tweets = tweets.to(device)
      labels = labels.to(device)
      outputs = model(tweets)
      samples_in_batch = labels.size(0)
      # `criterion` returns the batch mean (nn.CrossEntropyLoss default), so weight
      # it by the batch size before dividing by the sample count. Dividing a sum of
      # batch means by the number of samples under-reports the loss by roughly the
      # batch size and makes it incomparable with the training loss.
      loss_sum += criterion(outputs, labels).item() * samples_in_batch

      predicted = outputs.argmax(dim=1)
      correct += (predicted == labels).sum().item()
      total += samples_in_batch

  avg_loss = loss_sum / total
  accuracy = correct / total
  print(f"Accuracy: {100 * accuracy:.2f}% , Loss: {avg_loss:.4f}")
  return avg_loss, accuracy


def train_model(model, train_loader, criterion, optimizer, epochs = 5,
                validation_stop_threshold = 0.5, patience = 5):
  """Train `model`, validating after every epoch, with gap-based early stopping.

  Uses the notebook-level `device`, `val_loader`, `val_dataset` and `evaluate_model`.

  Args:
    model: network to train.
    train_loader: DataLoader yielding (tweet, label) batches.
    criterion: loss function applied to (outputs, label).
    optimizer: optimizer that updates the model parameters.
    epochs: maximum number of epochs.
    validation_stop_threshold: absolute train/validation loss gap that counts as
      a "bad" epoch (previously a hard-coded 0.5).
    patience: number of consecutive bad epochs after which training stops
      (previously a hard-coded 5).
  """
  validation_stop_counter = 0
  for epoch in range(epochs):
    model.train()
    total_loss = 0
    for tweet, label in train_loader:
      tweet = tweet.to(device)
      label = label.to(device)
      optimizer.zero_grad()
      outputs = model(tweet)

      loss = criterion(outputs, label)
      loss.backward()
      optimizer.step()

      total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    total_loss = total_loss/len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")
    val_loss, _ = evaluate_model(model,val_loader, val_dataset)

    # Count consecutive epochs in which validation and training loss diverge.
    if abs(val_loss - total_loss) > validation_stop_threshold:
      validation_stop_counter += 1
    else:
      validation_stop_counter = 0

    if validation_stop_counter >= patience:
      print("Early Stopping Triggered.")
      break





train_model(model, train_loader, criterion, optimizer, epochs=25)
evaluate_model(model,test_loader, test_dataset)
torch.cuda.empty_cache()

Epoch 1/25, Loss: 0.0336
Accuracy: 52.07% , Loss: 0.0314
Epoch 2/25, Loss: 0.0276
Accuracy: 66.40% , Loss: 0.0239
Epoch 3/25, Loss: 0.0201
Accuracy: 73.56% , Loss: 0.0203
Epoch 4/25, Loss: 0.0140
Accuracy: 79.04% , Loss: 0.0176
Epoch 5/25, Loss: 0.0099
Accuracy: 82.08% , Loss: 0.0158
Epoch 6/25, Loss: 0.0072
Accuracy: 83.07% , Loss: 0.0163
Epoch 7/25, Loss: 0.0054
Accuracy: 84.28% , Loss: 0.0164
Epoch 8/25, Loss: 0.0044
Accuracy: 84.19% , Loss: 0.0171
Epoch 9/25, Loss: 0.0037
Accuracy: 84.96% , Loss: 0.0166
Epoch 10/25, Loss: 0.0032
Accuracy: 84.97% , Loss: 0.0182
Epoch 11/25, Loss: 0.0028
Accuracy: 85.53% , Loss: 0.0189
Epoch 12/25, Loss: 0.0028
Accuracy: 85.77% , Loss: 0.0188
Epoch 13/25, Loss: 0.0024
Accuracy: 85.74% , Loss: 0.0184
Epoch 14/25, Loss: 0.0023
Accuracy: 86.11% , Loss: 0.0203
Epoch 15/25, Loss: 0.0022
Accuracy: 85.57% , Loss: 0.0205
Epoch 16/25, Loss: 0.0022
Accuracy: 86.32% , Loss: 0.0204
Epoch 17/25, Loss: 0.0020
Accuracy: 86.46% , Loss: 0.0196
Epoch 18/25, Loss: 0.00

# **2nd Dataset (Mental Health)**

In [None]:
import kagglehub


path = kagglehub.dataset_download("suchintikasarkar/sentiment-analysis-for-mental-health")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/suchintikasarkar/sentiment-analysis-for-mental-health?dataset_version_number=1...


100%|██████████| 11.1M/11.1M [00:00<00:00, 93.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1


In [None]:
!ls /root/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1

'Combined Data.csv'


## Preprocess Data

In [None]:
import pandas as pd
from pathlib import Path

# Resolve the CSV relative to the directory returned by kagglehub (`path`) so the
# cell keeps working when the cached dataset version or cache location changes.
data = pd.read_csv(Path(path) / "Combined Data.csv")
# Drop the leftover index column and any rows without statement text.
data = data.drop(columns=['Unnamed: 0'])
data = data.dropna(subset=["statement"])

In [None]:
data['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Normal,16343
Depression,15404
Suicidal,10652
Anxiety,3841
Bipolar,2777
Stress,2587
Personality disorder,1077


In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

_STOP_WORDS = None  # English stop words, loaded once on the first clean_text call


def clean_text(text):
    """Lowercase `text`, strip URLs and non-word characters, and drop English stop words.

    Returns the remaining words joined by single spaces.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loading the NLTK list is loop-invariant; doing it on every call repeats
        # the same work for each row of the DataFrame.
        _STOP_WORDS = frozenset(stopwords.words('english'))

    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  #Remove URLs
    text = re.sub(r'\W', ' ', text)  #Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  #Remove extra spaces
    text = ' '.join([word for word in text.split() if word not in _STOP_WORDS])  #Remove stopwords
    return text

data['statement'] = data['statement'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenize and Numericalize Statements

In [None]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data):
  """Lazily yield the token list of each text in `data`, using the notebook-level `tokenizer`."""
  yield from (tokenizer(text) for text in data)

# Build the vocabulary from every cleaned statement, reserving "<unk>" and "<pad>" as special tokens.
vocab = build_vocab_from_iterator(yield_tokens(data['statement']), specials=["<unk>","<pad>"] )
# Lookups of tokens that are not in the vocabulary fall back to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])


def make_text_into_numbers(text):
  """Tokenize `text` and return the vocabulary index of each token."""
  tokens = tokenizer(text)
  return vocab(tokens)

data["numerized_statement"] = data["statement"].apply(make_text_into_numbers)

def pad_statements(states, max_length=100):
    """Truncate each statement to `max_length` ids and right-pad them to a common length.

    Args:
        states: list of token-id lists.
        max_length: maximum number of ids kept per statement.

    Returns:
        A 2-D integer tensor (batch first) padded with the "<pad>" index. Its width
        is the length of the longest truncated statement, so at most `max_length`.
    """
    # dtype is forced to long: a statement left empty by cleaning would otherwise
    # become a float32 tensor, and pad_sequence takes the output dtype from the
    # first sequence, which would break the embedding lookup.
    truncated = [torch.tensor(state[:max_length], dtype=torch.long) for state in states]
    return pad_sequence(truncated, batch_first=True, padding_value=vocab["<pad>"])
padded_statement = pad_statements(data["numerized_statement"].tolist()).to(device)

## Create Datasets and DataLoaders For Train, Test, Validation

In [None]:
from torch.utils.data import DataLoader, Dataset, random_split


class MentalDataset(Dataset):
  """Pairs each padded statement with its class id.

  Args:
    statements: padded token ids, one row per statement (tensor or nested list).
    classes: one integer class id per statement (tensor or list).
  """
  def __init__(self,statements,classes):
    # Convert once here. as_tensor reuses an existing tensor without copying, which
    # avoids calling torch.tensor() on a tensor for every item (that re-copies the
    # row and emits a UserWarning on each access).
    self.statements = torch.as_tensor(statements)
    self.classes = torch.as_tensor(classes)

  def __getitem__(self,index):
    """Return the (statement, class) tensor pair at `index`."""
    return self.statements[index], self.classes[index]

  def __len__(self):
    return len(self.statements)

# Integer class ids for the seven status labels.
mapped_classes = {"Normal": 0, "Depression": 1, "Suicidal": 2, "Anxiety": 3, "Stress": 4, "Bipolar": 5, "Personality disorder": 6}

classes = data['status'].map(mapped_classes).tolist()

dataset = MentalDataset(padded_statement, classes)
length = len(dataset)

# 80/10/10 split; the validation split absorbs the rounding remainder.
train_size = int(0.8 * length)
test_size = int(0.1 * length)
val_size = length - train_size - test_size

# Seed the split so the same rows land in train/test/val on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset, val_dataset = random_split(
    dataset, [train_size, test_size, val_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size = 32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 32, shuffle=False)
# Validation must read the validation split; building it from test_dataset made
# the per-epoch validation metrics test-set metrics and left val_dataset unused.
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle=False)


## Create BI-LSTM Model

In [None]:
import torch.nn as nn
class BiLSTM(nn.Module):
    """Bidirectional LSTM text classifier.

    Pipeline: embedding -> bidirectional LSTM -> layer norm -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size per token.
        hidden_dim: LSTM hidden size per direction.
        num_classes: number of output logits.
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for token ids `x`."""
        x = self.embedding(x)
        _, (h_n, _) = self.bilstm(x)
        # Use the final hidden state of each direction. Slicing the output at the
        # last time step gives the backward direction only one token of context,
        # because the backward pass starts at the end of the sequence.
        # h_n[-2] is the forward direction's final state, h_n[-1] the backward one's.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        summary = self.layer_norm(summary)
        summary = self.dropout(summary)
        return self.fc(summary)

## Define Validation, Training and Evaluation Functions

In [None]:
import numpy as np
def validate(model,criterion,val_loader):
  """Run `model` over `val_loader` without gradients.

  Returns (mean of the per-batch losses, accuracy as a fraction). Batches are
  moved to the notebook-level `device`.
  """
  model.eval()
  loss_sum, hits, seen = 0, 0, 0
  with torch.no_grad():
    for statements, classes in val_loader:
      statements, classes = statements.to(device), classes.to(device)
      outputs = model(statements)
      loss_sum += criterion(outputs,classes).item()

      # The index of the largest logit is the predicted class.
      prediction = outputs.max(dim=1).indices
      hits += (prediction == classes).sum().item()
      seen += classes.shape[0]
  return loss_sum / len(val_loader), hits / seen



def train_model(model,optimizer,criterion,train_loader, epochs=5):
  """Train `model` for up to `epochs` epochs, printing train/validation metrics per epoch.

  Validation uses the notebook-level `val_loader` through `validate`; batches are
  moved to the notebook-level `device`. Training stops early once the absolute
  gap between validation and training loss has exceeded 0.5 for 5 epochs in a row.
  """
  gap_threshold = 0.5
  consecutive_gap_epochs = 0
  for epoch in range(epochs):
    model.train()
    running_loss, seen, hits = 0, 0, 0
    for statements, classes in train_loader:
      optimizer.zero_grad()
      statements, classes = statements.to(device), classes.to(device)
      outputs = model(statements)
      loss = criterion(outputs,classes)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()

      predicted = outputs.max(dim=1).indices
      hits += (predicted == classes).sum().item()
      seen += classes.shape[0]

    avg_loss_train = running_loss / len(train_loader)
    train_acc = hits / seen
    avg_loss_val, val_acc = validate(model,criterion,val_loader)
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss_train:.4f}, Training Accuracy: {train_acc:.4f}, Validation Loss: {avg_loss_val:.4f}, Validation Accuracy: {val_acc:.4f}")

    # The counter resets as soon as an epoch falls back within the threshold.
    gap_too_large = np.abs(avg_loss_val - avg_loss_train) > gap_threshold
    consecutive_gap_epochs = consecutive_gap_epochs + 1 if gap_too_large else 0

    if consecutive_gap_epochs >= 5:
      print("Early Stopping Triggered")
      break


def evaluate_model(model,test_loader):
  """Print the accuracy of `model` over `test_loader` (batches go to the notebook-level `device`)."""
  model.eval()
  hits, seen = 0, 0
  with torch.no_grad():
    for statements, classes in test_loader:
      statements, classes = statements.to(device), classes.to(device)

      # The index of the largest logit is the predicted class.
      predictions = model(statements).max(dim=1).indices
      hits += (predictions == classes).sum().item()
      seen += classes.shape[0]

  acc = hits / seen
  print(f"\n\nTest Accuracy: {acc:.4f}")


## Initialize Model, Loss Function, Optimizer and Train/Evaluate

In [None]:

# BiLSTM(vocab_size, embed_dim, hidden_dim, num_classes): 64-dim embeddings,
# 64 hidden units per direction, 7 status classes.
model = BiLSTM(len(vocab), 64, 64, 7).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr =0.001)

# Train for up to 25 epochs (early stopping may end sooner), then report test accuracy.
train_model(model,optimizer,criterion,train_loader,25)
evaluate_model(model,test_loader)

  return torch.tensor(self.statements[index]), torch.tensor(self.classes[index])


Epoch 1/25, Training Loss: 1.4170, Training Accuracy: 0.4685, Validation Loss: 1.2215, Validation Accuracy: 0.5499
Epoch 2/25, Training Loss: 1.1163, Training Accuracy: 0.5686, Validation Loss: 0.9577, Validation Accuracy: 0.6323
Epoch 3/25, Training Loss: 0.8992, Training Accuracy: 0.6485, Validation Loss: 0.8792, Validation Accuracy: 0.6688
Epoch 4/25, Training Loss: 0.7853, Training Accuracy: 0.6975, Validation Loss: 0.8304, Validation Accuracy: 0.6887
Epoch 5/25, Training Loss: 0.6749, Training Accuracy: 0.7453, Validation Loss: 0.7789, Validation Accuracy: 0.7065
Epoch 6/25, Training Loss: 0.5940, Training Accuracy: 0.7769, Validation Loss: 0.8063, Validation Accuracy: 0.7151
Epoch 7/25, Training Loss: 0.5203, Training Accuracy: 0.8074, Validation Loss: 0.8367, Validation Accuracy: 0.7174
Epoch 8/25, Training Loss: 0.4594, Training Accuracy: 0.8335, Validation Loss: 0.8579, Validation Accuracy: 0.7117
Epoch 9/25, Training Loss: 0.4017, Training Accuracy: 0.8582, Validation Loss: 0

## Test Trained Model with custom text

In [None]:
def preprocess_text(text, vocab, tokenizer, max_length = 100):
  """Turn raw `text` into a (1, max_length) tensor of token ids on `device`.

  The text is cleaned with `clean_text`, tokenized, mapped to vocabulary indices,
  truncated to `max_length` and right-padded with the "<pad>" index.
  """
  token_ids = vocab(tokenizer(clean_text(text)))[:max_length]
  # dtype is forced to long: when cleaning removes every token (for example a text
  # made only of stop words) torch.tensor([]) would be float32 and the embedding
  # lookup would raise.
  tensor_input = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
  # Never negative, because the ids were truncated above.
  pad_amount = max_length - tensor_input.shape[1]
  padded_input = torch.nn.functional.pad(tensor_input, (0, pad_amount), value=vocab["<pad>"])
  return padded_input.to(device)


def predict_text(model, text, vocab, tokenizer):
    """Return the integer class id that `model` predicts for a single raw `text`."""
    model.eval()
    with torch.no_grad():
        logits = model(preprocess_text(text, vocab, tokenizer))
        # The index of the largest logit is the predicted class.
        return torch.argmax(logits, dim=1).item()

class_names = {0: "Normal", 1: "Depression", 2: "Suicidal", 3: "Anxiety", 4: "Stress", 5: "Bipolar", 6: "Personality disorder"}

texts = [
    "I am feeling very anxious and stressed about work.",
    "I feel empty and numb. Nothing excites me anymore.",
    "I am feeling great and excited about the future.",
    "I don't see a reason to continue living, I just wanna die.",
    "I act impulsively and regret it later."

]


# Print each sample sentence next to the class name predicted for it.
for text in texts:
  prediction = predict_text(model,text,vocab,tokenizer)
  result = class_names[prediction]
  print(f'{text}: {result}')





I am feeling very anxious and stressed about work.: Anxiety
I feel empty and numb. Nothing excites me anymore.: Suicidal
I am feeling great and excited about the future.: Normal
I don't see a reason to continue living, I just wanna die.: Suicidal
I act impulsively and regret it later.: Normal
