In [66]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import spacy
# Run on the GPU when PyTorch reports that CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [67]:
import opendatasets as od
# Kaggle dataset holding the labelled posts used throughout this notebook.
DATASET_URL = "https://www.kaggle.com/datasets/mdismielhossenabir/sentiment-analysis"
od.download(DATASET_URL)

Skipping, found downloaded files in "./sentiment-analysis" (use force=True to force download)


In [68]:
# Read the downloaded CSV into a DataFrame.
csv_path = 'sentiment-analysis/sentiment_analysis.csv'
data = pd.read_csv(csv_path)

In [69]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


In [70]:
# Keep only the input text and its sentiment label.
columns_of_interest = ['text', 'sentiment']
df = data[columns_of_interest]


In [71]:
# Lower-case both string columns on a copy, leaving `df` untouched.
data_lowercase = (
    df.copy()[['text', 'sentiment']]
    .apply(lambda column: column.str.lower())
)
print(data_lowercase.head())

                                                text sentiment
0              what a great day!!! looks like dream.  positive
1     i feel sorry, i miss you here in the sea beach  positive
2                                     don't angry me  negative
3  we attend in the class just for listening teac...  negative
4                  those who want to go, let them go  negative


In [72]:
def clean_data(text):
    """Strip noise from ``text`` and normalise its whitespace.

    The substitutions run in order: URLs, @mentions and #hashtags are
    removed first, then every character that is neither an ASCII letter
    nor whitespace, and finally runs of whitespace are collapsed to a
    single space. Leading and trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; the order matters because URLs, mentions
    # and hashtags must be matched before their punctuation is stripped.
    substitutions = (
        (r'http\S+', ''),
        (r'@\w+', ''),
        (r'#\w+', ''),
        (r'[^A-Za-z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [73]:
# Run every text entry through clean_data and store the result back in place.
cleaned_text = data_lowercase['text'].map(clean_data)
data_lowercase['text'] = cleaned_text
print(data_lowercase.head())

                                                text sentiment
0                  what a great day looks like dream  positive
1      i feel sorry i miss you here in the sea beach  positive
2                                      dont angry me  negative
3  we attend in the class just for listening teac...  negative
4                   those who want to go let them go  negative


In [87]:
# Creating the vocabulary: load the small English spaCy pipeline used for tokenization.
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Split ``text`` into a list of token strings with spaCy's tokenizer.

    Only the token texts are read, so ``nlp.make_doc`` is used: it runs the
    tokenizer alone instead of the whole pipeline that ``nlp(text)`` would
    execute.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the text of every token, in order.
    """
    return [token.text for token in nlp.make_doc(text)]


# One token list per row, for the texts and for the sentiment labels.
text_tokens = data_lowercase['text'].apply(tokenize).tolist()
sentiment_tokens = data_lowercase['sentiment'].apply(tokenize).tolist()

# Flatten the per-row lists into single token streams.
all_text = [
    tok
    for row_tokens in text_tokens
    for tok in row_tokens
]
all_sentiments = [
    tok
    for row_tokens in sentiment_tokens
    for tok in row_tokens
]



In [99]:
# Frequency of every text token and of every sentiment label token.
word_counts = Counter(all_text)
sentiment_counts = Counter(all_sentiments)

# Minimum number of occurrences a token needs to receive its own index.
min_freq = 1

# Indices 0 and 1 are reserved for the padding and unknown markers.
word2idx = {"<PAD>": 0, "<UNK>": 1}
idx = len(word2idx)
for word, count in word_counts.items():
    if count >= min_freq:
        word2idx[word] = idx
        idx += 1

sentiment2idx = {"<PAD>": 0, "<UNK>": 1}
# Restart the counter for the labels. Carrying on from the word counter would
# give the sentiments indices starting at the vocabulary size, so they would
# not fit in the range 0..len(sentiment2idx)-1.
idx = len(sentiment2idx)
for sentiment, count in sentiment_counts.items():
    if count >= min_freq:
        sentiment2idx[sentiment] = idx
        idx += 1

# Reverse lookups: index -> token / label.
idx2word = {index: word for word, index in word2idx.items()}
idx2sentiment = {index: sentiment for sentiment, index in sentiment2idx.items()}

In [101]:
# Dump the four lookup tables, one per line.
print(word2idx, sentiment2idx, idx2word, idx2sentiment, sep="\n")

# Sizes of the two index spaces, special markers included.
vocab_size, sentiment_size = len(word2idx), len(sentiment2idx)
print(f"Vocabulary Size: {vocab_size}")
print(f"Sentiment Size: {sentiment_size}")

{'<PAD>': 0, '<UNK>': 1, 'what': 2, 'a': 3, 'great': 4, 'day': 5, 'looks': 6, 'like': 7, 'dream': 8, 'i': 9, 'feel': 10, 'sorry': 11, 'miss': 12, 'you': 13, 'here': 14, 'in': 15, 'the': 16, 'sea': 17, 'beach': 18, 'do': 19, 'nt': 20, 'angry': 21, 'me': 22, 'we': 23, 'attend': 24, 'class': 25, 'just': 26, 'for': 27, 'listening': 28, 'teachers': 29, 'reading': 30, 'on': 31, 'slide': 32, 'nonsence': 33, 'those': 34, 'who': 35, 'want': 36, 'to': 37, 'go': 38, 'let': 39, 'them': 40, 'its': 41, 'night': 42, 'am': 43, 'feeling': 44, 'neutral': 45, 'feedings': 46, 'baby': 47, 'are': 48, 'fun': 49, 'when': 50, 'he': 51, 'is': 52, 'all': 53, 'smiles': 54, 'and': 55, 'coos': 56, 'soooo': 57, 'high': 58, 'both': 59, 'of': 60, 'today': 61, 'first': 62, 'time': 63, 'arrive': 64, 'boat': 65, 'amazing': 66, 'journey': 67, 'love': 68, 'something': 69, 'emc': 70, 'rules': 71, 'really': 72, 'song': 73, 'cardigan': 74, 'by': 75, 'taylor': 76, 'swift': 77, 'my': 78, 'sharpie': 79, 'running': 80, 'dangerous