In [1]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jifsk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv("C:/Users/jifsk/Downloads/DATASET.csv")

In [3]:
df.head()

Unnamed: 0,Review,label
0,"Great music service, the audio is high quality...",POSITIVE
1,Please ignore previous negative rating. This a...,POSITIVE
2,"This pop-up ""Get the best Spotify experience o...",NEGATIVE
3,Really buggy and terrible to use as of recently,NEGATIVE
4,Dear Spotify why do I get songs that I didn't ...,NEGATIVE


In [4]:
label_map = {'POSITIVE': 1, 'NEGATIVE': 0}
df['label'] = df['label'].map(label_map)
df.head()

Unnamed: 0,Review,label
0,"Great music service, the audio is high quality...",1
1,Please ignore previous negative rating. This a...,1
2,"This pop-up ""Get the best Spotify experience o...",0
3,Really buggy and terrible to use as of recently,0
4,Dear Spotify why do I get songs that I didn't ...,0


In [5]:
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
        
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) #removes punctuation
    text = re.sub(r'\d+','',text) #removes digits
    text = text.strip()
    return text

In [6]:
df['Review'] = df['Review'].apply(clean_text)

In [7]:
df['tokens'] = df['Review'].apply(word_tokenize)
df.head()

Unnamed: 0,Review,label,tokens
0,great music service the audio is high quality ...,1,"[great, music, service, the, audio, is, high, ..."
1,please ignore previous negative rating this ap...,1,"[please, ignore, previous, negative, rating, t..."
2,this popup get the best spotify experience on ...,0,"[this, popup, get, the, best, spotify, experie..."
3,really buggy and terrible to use as of recently,0,"[really, buggy, and, terrible, to, use, as, of..."
4,dear spotify why do i get songs that i didnt p...,0,"[dear, spotify, why, do, i, get, songs, that, ..."


In [8]:
from collections import Counter

In [9]:
all_tokens = [word for tokens in df['tokens'] for word in tokens] #flatten the df['tokens'] because when we use Counter it expects a flat iterable and not lists of lists

vocab = Counter(all_tokens) 

#look up dictionary to convert tokens into a sequence of numbers
word2idx = {word: idx+2 for idx, (word, _) in enumerate(vocab.most_common())}
word2idx["<PAD>"] = 0 #reserves 0 for padding
word2idx["<UNK>"] = 1 #resrves 1 for unknown words

In [10]:
df['encoded'] = df['tokens'].apply(lambda tokens: [word2idx.get(word, 1) for word in tokens]) 
#transforms the review/tokens into numerical input that our RNN will understand

In [11]:
df.head()

Unnamed: 0,Review,label,tokens,encoded
0,great music service the audio is high quality ...,1,"[great, music, service, the, audio, is, high, ...","[43, 10, 144, 2, 277, 9, 607, 154, 5, 2, 7, 9,..."
1,please ignore previous negative rating this ap...,1,"[please, ignore, previous, negative, rating, t...","[72, 2380, 654, 1337, 528, 14, 7, 9, 359, 43, ..."
2,this popup get the best spotify experience on ...,0,"[this, popup, get, the, best, spotify, experie...","[14, 1409, 57, 2, 61, 17, 142, 20, 228, 9, 99,..."
3,really buggy and terrible to use as of recently,0,"[really, buggy, and, terrible, to, use, as, of...","[65, 298, 5, 370, 4, 56, 63, 13, 244]"
4,dear spotify why do i get songs that i didnt p...,0,"[dear, spotify, why, do, i, get, songs, that, ...","[1938, 17, 103, 90, 3, 57, 18, 21, 3, 251, 264..."


In [12]:
#now to pad the data since it will need to be a uniform input shape
from torch.nn.utils.rnn import pad_sequence
import torch

In [13]:
encoded_seqs = [torch.tensor(seq) for seq in df['encoded']]

padded_seqs = pad_sequence(encoded_seqs, batch_first = True, padding_value = 0)

In [14]:
import torch.nn as nn

vocab_size = len(word2idx) # number of unique tokens in our vocab/df
embedding_dim = 100

embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim, padding_idx = 0)

In [15]:
embedded = embedding(padded_seqs)

In [16]:
X = df['encoded'].tolist()
y = df['label'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42
)

In [17]:
print(type(X_train))           # should be <class 'list'>
print(type(X_train[0]))        # should be <class 'list'>
print(type(X_train[0][0]))     # should be <class 'int'>
print(X_train[0])              # should look like [43, 10, 144, 2, ...]

<class 'list'>
<class 'list'>
<class 'int'>
[37, 745, 11, 1524, 5, 241, 48, 113, 12, 228, 384, 4, 644, 2, 170, 13, 2, 18]


In [18]:
df = df[df['encoded'].map(len) > 0]

In [19]:
def pad_and_tensor(sequences):
    tensor_seq = [torch.tensor(seq, dtype=torch.long) for seq in sequences if len(seq) > 0]
    padded = pad_sequence(tensor_seq, batch_first = True, padding_value = 0)
    return padded

In [20]:
df.head()

Unnamed: 0,Review,label,tokens,encoded
0,great music service the audio is high quality ...,1,"[great, music, service, the, audio, is, high, ...","[43, 10, 144, 2, 277, 9, 607, 154, 5, 2, 7, 9,..."
1,please ignore previous negative rating this ap...,1,"[please, ignore, previous, negative, rating, t...","[72, 2380, 654, 1337, 528, 14, 7, 9, 359, 43, ..."
2,this popup get the best spotify experience on ...,0,"[this, popup, get, the, best, spotify, experie...","[14, 1409, 57, 2, 61, 17, 142, 20, 228, 9, 99,..."
3,really buggy and terrible to use as of recently,0,"[really, buggy, and, terrible, to, use, as, of...","[65, 298, 5, 370, 4, 56, 63, 13, 244]"
4,dear spotify why do i get songs that i didnt p...,0,"[dear, spotify, why, do, i, get, songs, that, ...","[1938, 17, 103, 90, 3, 57, 18, 21, 3, 251, 264..."


In [21]:
X_train_padded = pad_and_tensor(X_train)
X_test_padded = pad_and_tensor(X_test)
y_train_tensor = torch.tensor(y_train, dtype = torch.long)
y_test_tensor= torch.tensor(y_test,dtype = torch.long)

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [23]:
rnn = nn.RNN(
    input_size = embedding_dim,
    hidden_size = 64,
    num_layers = 3,
    batch_first = True
)

fc = nn.Linear(64,2)

embedding = embedding.to(device)
rnn = rnn.to(device)
fc = fc.to(device)

num_epochs = 10

In [24]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

In [25]:
print(X_train_padded.shape)   # should be (num_samples, seq_len)
print(y_train_tensor.shape)   # should be (num_samples,)

torch.Size([39525, 422])
torch.Size([39526])


In [26]:
print([len(seq) for seq in X_train].count(0))  # should be 0 ideally

1


In [27]:
filtered_X_train = []
filtered_y_train = []

for seq, label in zip(X_train, y_train):
    if len(seq) > 0:
        filtered_X_train.append(seq)
        filtered_y_train.append(label)

X_train = filtered_X_train
y_train = filtered_y_train

In [28]:
X_train_padded = pad_sequence([torch.tensor(seq) for seq in X_train], batch_first=True, padding_value=0)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

print(X_train_padded.shape)
print(y_train_tensor.shape)

torch.Size([39525, 422])
torch.Size([39525])


In [29]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(list(embedding.parameters()) + list(rnn.parameters()) + list(fc.parameters()), lr = 0.0001)

train_dataset = TensorDataset(X_train_padded, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)

In [30]:
print(torch.cuda.is_available())  # Should print True if CUDA GPU is accessible
print(torch.cuda.current_device())  # Device ID, usually 0 if you have 1 GPU
print(torch.cuda.get_device_name(0))  # Name of your GPU

True
0
NVIDIA GeForce RTX 3080 Ti


In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

for epoch in range(num_epochs):
    rnn.train()
    total_loss = 0

    all_preds = []
    all_labels = []

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()

        embedded = embedding(X_batch)
        output, hidden = rnn(embedded)
        final_output = hidden[-1]
        logits = fc(final_output)

        loss = loss_fn(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Collect predictions and labels
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

    # Compute metrics at the end of the epoch
    train_acc = accuracy_score(all_labels, all_preds)
    train_prec = precision_score(all_labels, all_preds, zero_division=0)
    train_rec = recall_score(all_labels, all_preds, zero_division=0)
    train_f1 = f1_score(all_labels, all_preds, zero_division=0)


    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f} | "
          f"Acc: {train_acc:.4f} | Prec: {train_prec:.4f} | "
          f"Rec: {train_rec:.4f} | F1: {train_f1:.4f}")


Epoch 1 | Loss: 848.4086 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 2 | Loss: 848.4906 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 3 | Loss: 848.3615 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 4 | Loss: 848.5347 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 5 | Loss: 848.4494 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 6 | Loss: 848.4010 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 7 | Loss: 848.4069 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 8 | Loss: 848.4509 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 9 | Loss: 848.4026 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
Epoch 10 | Loss: 848.3468 | Acc: 0.5581 | Prec: 0.0000 | Rec: 0.0000 | F1: 0.0000
