# Sentiment Analysis


In [4]:
import torch
import torch.nn as nn
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
df = pd.read_csv('/kaggle/input/twitter-airline-sentiment/Tweets.csv')

In [7]:
def preprocess(text):
    text = re.sub(r"http\S+", "", text) # removes any url
    text = re.sub(r"@\w+", "", text) # removes any mentions like @username
    text = re.sub(r"[^a-zA-Z']", " ", text) # removes all non alphabets and apostrophe with blank
    text = text.lower()
    return word_tokenize(text) # tokenizes the text

In [8]:
df['tokens'] = df['text'].apply(preprocess)
label_map = {'negative':0, 'neutral':1, 'positive':2}
df['labels'] = df['airline_sentiment'].map(label_map)

In [9]:
df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,tokens,labels
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),"[what, said]",1
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),"[plus, you, 've, added, commercials, to, the, ...",2
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),"[i, did, n't, today, must, mean, i, need, to, ...",1
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),"[it, 's, really, aggressive, to, blast, obnoxi...",0
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),"[and, it, 's, a, really, big, bad, thing, abou...",0


In [10]:
from collections import Counter # to count how many times each item appears in a list

all_tokens = [token for tokens in df['tokens'] for token in tokens] # flattens out all tokens into one big list
vocab = {word: i+2 for i, (word, _) in enumerate(Counter(all_tokens).most_common(10000))} # creates dictionary that maps 10,000 most fequent words to unique integer.
# first 2 indexes are reserved for padding and unknown words
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

df['input_ids'] = df['tokens'].apply(encode)

In [11]:
class TweetDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        x = self.inputs[idx]
        y = self.labels[idx]
        return torch.tensor(x), torch.tensor(y)

In [None]:
def pad_collate(batch):
    inputs, labels = zip(*batch) # batch = list of pairs (input_tensor, label) is separated into 2 tupples
    lengths = [len(x) for x in inputs]
    padded = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0) # pads all the inut sequences to the ...
    # length of the longest one by adding 0 at end
    return padded, torch.tensor(labels), torch.tensor(lengths)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(df['input_ids'], df['labels'], test_size = 0.2)

train_dataset = TweetDataset(list(X_train), list(y_train))
val_dataset = TweetDataset(list(X_val), list(y_val))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=pad_collate)


In [14]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (h_n, _) = self.lstm(packed)
        out = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(out)


In [15]:
import torch.nn.functional as F

VOCAB_SIZE = len(vocab)
EMBED_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 3  # For 3 sentiment classes

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


In [16]:
def train(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    correct = 0

    for x, y, lengths in dataloader:
        x, y, lengths = x.to(device), y.to(device), lengths.to(device)

        optimizer.zero_grad()
        outputs = model(x, lengths)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == y).sum().item()

    accuracy = correct / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

In [17]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for x, y, lengths in dataloader:
            x, y, lengths = x.to(device), y.to(device), lengths.to(device)

            outputs = model(x, lengths)
            loss = criterion(outputs, y)

            total_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()

    accuracy = correct / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy


In [23]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_loader, criterion)

    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"Val   Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")


Epoch 1
Train Loss: 0.0313, Accuracy: 0.9914
Val   Loss: 1.1677, Accuracy: 0.7534
Epoch 2
Train Loss: 0.0313, Accuracy: 0.9914
Val   Loss: 1.1677, Accuracy: 0.7534
Epoch 3
Train Loss: 0.0313, Accuracy: 0.9914
Val   Loss: 1.1677, Accuracy: 0.7534
Epoch 4
Train Loss: 0.0313, Accuracy: 0.9914
Val   Loss: 1.1677, Accuracy: 0.7534
Epoch 5
Train Loss: 0.0313, Accuracy: 0.9914
Val   Loss: 1.1677, Accuracy: 0.7534


In [24]:
torch.save(model.state_dict(), "/kaggle/working/sentiment_lstm.pth")


In [25]:
model = LSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
model.load_state_dict(torch.load("/kaggle/working/sentiment_lstm.pth"))
model.eval()


  model.load_state_dict(torch.load("/kaggle/working/sentiment_lstm.pth"))


LSTMModel(
  (embedding): Embedding(10002, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
)

In [26]:
def preprocess_and_encode(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^a-zA-Z']", " ", text).lower()
    tokens = word_tokenize(text)
    encoded = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    return torch.tensor(encoded, dtype=torch.long)

def predict_sentiment(text):
    model.eval()
    encoded = preprocess_and_encode(text)
    length = torch.tensor([len(encoded)])

    encoded = encoded.unsqueeze(0).to(device)       # Add batch dim
    length = length.to(device)

    with torch.no_grad():
        output = model(encoded, length)
        prediction = torch.argmax(output, dim=1).item()

    reverse_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    return reverse_map[prediction]


In [29]:
text = "@United your service was AMAZING!! "
print(predict_sentiment(text))  


positive
