# Sentiment Analysis Model Training

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
from string import punctuation
from nltk.corpus import stopwords
# Sanity check: show entries 10-14 of the NLTK English stop-word list.
print(stopwords.words('english')[10:15])

def punctuation_stopwords_removal(sms):
    """Strip punctuation and English stop words from a message.

    Args:
        sms: Raw message text.

    Returns:
        list[str]: The lower-cased words of ``sms`` in their original order,
        with every ``string.punctuation`` character removed and every NLTK
        English stop word dropped.
    """
    # Load the stop words once per call and keep them in a set: membership
    # tests become O(1) instead of reloading and scanning the list per word.
    english_stopwords = set(stopwords.words('english'))

    # Drop punctuation character by character, then split on whitespace.
    without_punctuation = "".join(ch for ch in sms if ch not in punctuation)
    lowered_words = (word.lower() for word in without_punctuation.split())
    return [word for word in lowered_words if word not in english_stopwords]

In [None]:
# Load the tweet dataset; later cells read its 'text' and 'sentiment' columns.
sentiment_df = pd.read_csv('/kaggle/input/twitterdata/finalSentimentdata2.csv')

In [None]:
# Replace each raw tweet with its list of cleaned, lower-cased words.
sentiment_df.loc[:, 'text'] = sentiment_df['text'].apply(punctuation_stopwords_removal)

In [None]:
# One entry per tweet: the 'text' column as a plain Python list. Reading the
# column directly avoids iterrows(), which builds a Series for every row.
reviews_split = list(sentiment_df['text'])

In [None]:
# Flatten all tweets into one list of words (duplicates kept for counting).
words = [word for review in reviews_split for word in review]

## Encoding

In [None]:
from collections import Counter

# Word frequencies over the whole corpus.
counts = Counter(words)
# Most frequent word first; the stable sort keeps ties in first-seen order.
vocab = sorted(counts, key=lambda word: counts[word], reverse=True)
# Ids start at 1 so that 0 stays free for padding.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [None]:
# Map every word of every tweet to its integer id.
encoded_reviews = [
    [vocab_to_int[word] for word in review]
    for review in reviews_split
]

In [None]:
# Vocabulary size, followed by the first ten integer-encoded tweets.
print(len(vocab_to_int))
print(encoded_reviews[:10])

In [None]:
# Binary target: 1 for 'joy', 0 for every other sentiment value.
# Iterating the column directly avoids the per-row Series built by iterrows().
labels_to_int = [
    1 if sentiment == 'joy' else 0
    for sentiment in sentiment_df['sentiment']
]

## Detecting any outlier reviews

In [None]:
# Histogram of tweet lengths (in tokens).
reviews_len = Counter(map(len, encoded_reviews))
# Iterating a Counter yields its keys, so this prints the longest tweet length.
print(max(reviews_len))

In [None]:
# Number of encoded tweets before empty ones are filtered out.
print(len(encoded_reviews))

In [None]:
# Keep only tweets that still contain at least one token after preprocessing.
# The test must look at each individual review; checking the length of the
# whole list (as before) is always true and filters nothing.
non_zero_idx = [ii for ii, review in enumerate(encoded_reviews) if len(review) != 0]
encoded_reviews = [encoded_reviews[ii] for ii in non_zero_idx]
# Labels are filtered with the same indices so they stay aligned with the reviews.
encoded_labels = np.array([labels_to_int[ii] for ii in non_zero_idx])

In [None]:
# After filtering, the review and label counts should match.
print(len(encoded_reviews))
print(len(encoded_labels))

In [None]:
def pad_features(reviews_int, seq_length):
    """Build a fixed-width integer matrix from variable-length token lists.

    Each row of the result holds one review: shorter reviews are left-padded
    with zeros, longer ones are cut to their first ``seq_length`` tokens, and
    empty reviews stay all zeros.

    Args:
        reviews_int: Sequence of token-id sequences.
        seq_length: Number of columns in the result.

    Returns:
        numpy.ndarray of shape ``(len(reviews_int), seq_length)`` with an
        integer dtype.
    """
    features = np.zeros((len(reviews_int), seq_length), dtype=int)
    for idx, tokens in enumerate(reviews_int):
        if len(tokens) == 0:
            # Nothing to copy; the row is already all padding.
            continue
        clipped = np.array(tokens)[:seq_length]
        # Right-align the (possibly truncated) tokens within the row.
        features[idx, seq_length - len(clipped):] = clipped
    return features

In [None]:
# Every tweet is padded or truncated to this many tokens.
seq_length = 50
padded_features= pad_features(encoded_reviews, seq_length)
# Show the first two padded rows as a quick sanity check.
print(padded_features[:2])

## Training, Validation and Test Split

In [None]:
# The first 80% of the rows (in file order, no shuffling) form the training set.
split_frac = 0.8
split_idx = int(len(padded_features) * split_frac)

training_x = padded_features[:split_idx]
remaining_x = padded_features[split_idx:]
training_y = encoded_labels[:split_idx]
remaining_y = encoded_labels[split_idx:]

# The remaining rows are halved: validation first, then test.
test_idx = int(len(remaining_x) * 0.5)
val_x = remaining_x[:test_idx]
test_x = remaining_x[test_idx:]
val_y = remaining_y[:test_idx]
test_y = remaining_y[test_idx:]

## Dataloaders and Batching

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Number of samples per batch, shared by all three loaders.
batch_size = 1

# torch.from_numpy wraps each NumPy array as a tensor; every dataset pairs
# features with labels, and its loader yields them in order (no shuffling).
train_data = TensorDataset(torch.from_numpy(training_x), torch.from_numpy(training_y))
train_loader = DataLoader(train_data, batch_size=batch_size)

test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
test_loader = DataLoader(test_data, batch_size=batch_size)

valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
valid_loader = DataLoader(valid_data, batch_size=batch_size)

In [None]:
# torch.cuda.is_available is a function and must be called. Storing the
# function object itself is always truthy, which would select the GPU branch
# even on CPU-only machines.
gpu_available = torch.cuda.is_available()

if gpu_available:
    print('Training on GPU')
else:
    print('GPU not available')

## Sentiment Network with PyTorch

In [None]:
import torch.nn as nn

class CovidTweetSentimentAnalysis(nn.Module):
    """LSTM sentiment classifier over integer-encoded tweets.

    Pipeline: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.
    Only the sigmoid output of the last time step is returned per sample.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.2):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            output_size: Number of output units of the final linear layer.
            embedding_dim: Size of each word embedding.
            hidden_dim: Size of the LSTM hidden state.
            n_layers: Number of stacked LSTM layers.
            drop_prob: Dropout applied between the LSTM layers.
        """
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)

        # Dropout on the LSTM outputs before the classifier (fixed at 0.3,
        # independent of drop_prob).
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Run one batch through the network.

        Args:
            x: Token ids of shape (batch_size, seq_length).
            hidden: Tuple (h, c) of LSTM states, e.g. from ``init_hidden``.

        Returns:
            Tuple ``(sig_out, hidden)``: ``sig_out`` holds, per sample, the
            last element of the flattened per-step sigmoid outputs (shape
            ``(batch_size,)``); ``hidden`` is the updated LSTM state.
        """
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding_layer(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Stack every time step of every sample so the linear layer sees
        # (batch_size * seq_length, hidden_dim).
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)

        # Regroup per sample and keep only the final prediction.
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed (h, c) LSTM states for ``batch_size`` samples.

        The states are created with the dtype and device of the model's own
        parameters, so they follow the model to the GPU (or stay on the CPU)
        without consulting any global flag.

        Returns:
            Tuple of two zero tensors, each of shape
            ``(n_layers, batch_size, hidden_dim)``.
        """
        weights = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weights.new_zeros(state_shape), weights.new_zeros(state_shape))

In [None]:
vocab_size = len(vocab_to_int)+1 # word ids start at 1; the extra slot is id 0, used for padding
output_size = 1 # a single sigmoid output; label 1 is 'joy', 0 is everything else
embedding_dim = 400 # size of each word embedding
hidden_dim = 256 # size of the LSTM hidden state
n_layers = 2 # number of stacked LSTM layers

In [None]:
# Build the model and print its layer summary.
net = CovidTweetSentimentAnalysis(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

In [None]:
# Binary cross-entropy on the sigmoid output, optimised with Adam.
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [None]:
epochs = 4
count = 0          # number of training batches processed so far
print_every = 100  # run validation and report every this many batches
clip = 5           # maximum gradient norm
if gpu_available:
    net.cuda()

net.train()
for e in range(epochs):
    # Start every epoch from a zeroed LSTM state.
    h = net.init_hidden(batch_size)
    for inputs, labels in train_loader:
        count += 1
        if gpu_available:
            inputs, labels = inputs.cuda(), labels.cuda()
        # Detach the hidden state so gradients do not flow back through
        # earlier batches.
        h = tuple(each.data for each in h)

        # Training step.
        net.zero_grad()
        outputs, h = net(inputs, h)
        # The model already returns shape (batch,). Squeezing would turn a
        # batch of one into a scalar and mismatch the label shape in BCELoss.
        loss = criterion(outputs, labels.float())
        loss.backward()
        # Clip gradients in place to limit exploding gradients in the LSTM.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # Periodically score the whole validation set and report both losses.
        if count % print_every == 0:
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            with torch.no_grad():
                for val_inputs, val_labels in valid_loader:
                    val_h = tuple(each.data for each in val_h)
                    if gpu_available:
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()
                    # The forward pass must run for every validation batch,
                    # not only once after the loop has finished.
                    val_outputs, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_outputs, val_labels.float())
                    val_losses.append(val_loss.item())
            net.train()

            mean_val_loss = sum(val_losses) / len(val_losses) if val_losses else float('nan')
            print('Epoch: {}/{}...'.format(e + 1, epochs),
                  'Step: {}...'.format(count),
                  'Loss: {:.6f}...'.format(loss.item()),
                  'Val Loss: {:.6f}'.format(mean_val_loss))

In [None]:
# torch.save(net.state_dict(), 'model.pth')

In [None]:
#net= torch.load('../input/modelaa/model.pth')

In [None]:
from string import punctuation

def tokenize_covid_tweet(tweet):
    """Convert a pre-processed tweet into a batch of one token-id list.

    Words missing from ``vocab_to_int`` are skipped instead of raising
    ``KeyError``, so arbitrary user input can be tokenized.

    Args:
        tweet: Iterable of cleaned, lower-cased words.

    Returns:
        list[list[int]]: A single-element list holding the ids of the known
        words, in their original order.
    """
    return [[vocab_to_int[word] for word in tweet if word in vocab_to_int]]

In [None]:
def predict_covid_sentiment(net, test_tweet, seq_length=50):
    """Print the predicted sentiment of one tweet, showing each pipeline stage.

    The tweet is cleaned, tokenized and padded to ``seq_length``, then passed
    through ``net`` in evaluation mode without gradient tracking. The model's
    previous train/eval mode is restored afterwards.

    Args:
        net: Trained model exposing ``init_hidden`` and a
            ``(inputs, hidden)`` forward call.
        test_tweet: Raw tweet text.
        seq_length: Length the token sequence is padded or truncated to.

    Returns:
        None. The result is printed as 'Happy' or 'Sad'.
    """
    print('Original Sentence :')
    print(test_tweet)

    print('\nAfter removing punctuations and stop-words :')
    test_tweet = punctuation_stopwords_removal(test_tweet)
    print(test_tweet)

    print('\nAfter converting pre-processed tweet to tokens :')
    tokenized_tweet = tokenize_covid_tweet(test_tweet)
    print(tokenized_tweet)

    print('\nAfter padding the tokens into fixed sequence lengths :')
    # Honour the caller's seq_length instead of a hard-coded 50.
    padded_tweet = pad_features(tokenized_tweet, seq_length)
    print(padded_tweet)

    # Put the input on whatever device the model's parameters live on.
    device = next(net.parameters()).device
    feature_tensor = torch.from_numpy(padded_tweet).to(device)
    batch_size = feature_tensor.size(0)

    # Evaluation mode disables dropout so the prediction is deterministic.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            h = net.init_hidden(batch_size)
            output, h = net(feature_tensor, h)
    finally:
        net.train(was_training)

    predicted_sentiment = torch.round(output.squeeze())
    print('\n==========Predicted Sentiment==========\n')
    if predicted_sentiment.item() == 1:
        print('Happy')
    else:
        print('Sad')
    print('\n==========Predicted Sentiment==========\n')

In [None]:
# Read one tweet from standard input and print its predicted sentiment.
test_sad_tweet = input()
predict_covid_sentiment(net, test_sad_tweet)