# Tweet Sentiment Analysis

1. In this demo we perform sentiment polarity analysis on Twitter data.

2. The dataset we collected has no labels, so we use a publicly available labeled Twitter dataset, Sentiment140, as the training data to train a sentiment classification model. The Sentiment140 dataset can be found here: https://www.kaggle.com/paoloripamonti/twitter-sentiment-analysis .

3. We preprocessed part of this dataset and obtained:

    1) "train.csv" for training

    2) "test.csv" for testing

    3) An unlabeled "prediction.csv" (this is part of our collected data) for sentiment prediction.

4. All the files needed to run this notebook can be downloaded from https://drive.google.com/drive/folders/1eDYECK9UnDqhuy7KkA6ISK4w8mD18GdQ?usp=sharing


5. To run this demo you need to install torchtext (the code uses the legacy `torchtext.data.Field` API, so a release older than 0.9 is required):
> ```pip3 install torchtext```
https://github.com/bentrevett/pytorch-sentiment-analysis

6. This demo uses part of the code provided by the torchtext sentiment analysis tutorial: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/3%20-%20Faster%20Sentiment%20Analysis.ipynb


In [1]:
import time
import random
import pickle
from Tweet import Tweet
import torch
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import warnings
warnings.filterwarnings('ignore')

# 1 Sentiment Model Definition

The following code defines a PyTorch embedding model for sentiment analysis.

In [2]:
class FastText(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average them, apply a linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        output_dim: number of values produced per example.
        pad_idx: index of the padding token (passed as ``padding_idx`` to the embedding).
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Map token indices of shape [sent len, batch size] to [batch size, output_dim]."""
        # [sent len, batch size, emb dim] -> [batch size, sent len, emb dim]
        batch_first = self.embedding(text).permute(1, 0, 2)

        # a pooling window spanning the whole sentence averages over all positions
        sentence_length = batch_first.shape[1]
        averaged = F.avg_pool2d(batch_first, (sentence_length, 1))

        # [batch size, 1, emb dim] -> [batch size, emb dim] -> [batch size, output_dim]
        return self.fc(averaged.squeeze(1))

# 2 Define the Training, Testing and Predicting Function

In [3]:
def generate_bigrams(x):
    """Append every distinct bigram of the token list ``x`` to ``x`` itself.

    Each bigram is the two adjacent tokens joined by a single space. Duplicated
    bigrams are added only once, in order of first occurrence, so the output is
    identical from run to run (iterating a ``set`` of strings is not, because of
    hash randomisation).

    Args:
        x: list of tokens; it is modified in place.

    Returns:
        The same list object, now holding the original tokens followed by the bigrams.
    """
    # dict keys keep insertion order, which gives an ordered de-duplication
    unique_bigrams = dict.fromkeys(zip(x, x[1:]))
    x.extend(' '.join(bigram) for bigram in unique_bigrams)
    return x

def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # parameters with requires_grad=False (frozen) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

    ``preds`` holds raw logits; they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with the targets ``y``.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # booleans -> floats so the hits can be summed and divided
    hits = (hard_labels == y).float()
    return hits.sum() / len(hits)

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # what is left after removing the full minutes, truncated to an integer
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds


In [4]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    For every batch of ``iterator`` (objects exposing ``.text`` and ``.label``)
    the model is updated with one optimizer step.

    Returns:
        (mean loss per batch, mean accuracy per batch) over the epoch.
    """
    model.train()
    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        targets = batch.label
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating its weights.

    Returns:
        (mean loss per batch, mean accuracy per batch).
    """
    model.eval()
    loss_total, acc_total = 0, 0

    # gradients are not needed for evaluation
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            targets = batch.label
            loss_total += criterion(logits, targets).item()
            acc_total += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches



def predict(model, iterator):
    """Classify every batch of ``iterator`` and return the labels as one flat list.

    The model's logits go through a sigmoid and are rounded, so each entry is a
    NumPy scalar equal to 0 or 1. Entries appear in the order the iterator
    yields its batches.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            hard_labels = torch.round(torch.sigmoid(logits))
            # move to host memory before converting to NumPy scalars
            collected += list(hard_labels.cpu().numpy())

    return collected

# 3 Setting Parameters  and Features


In [5]:
# ---- reproducibility -------------------------------------------------------
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# ---- data locations --------------------------------------------------------
# folder that holds the csv files; change this to your own path
data_folder = './sentAnalysis_data'

# training, testing and (unlabeled) prediction csv files
train_path, test_path = 'train.csv', 'test.csv'
predict_path = 'prediction.csv'

# ---- model / training settings ---------------------------------------------
MAX_VOCAB_SIZE = 25_000
BATCH_SIZE = 64
N_EPOCHS = 5


# 4 Train and Save Model

In [6]:

def train_model():
    """Train the FastText sentiment classifier and save everything needed for prediction.

    Uses the module-level settings (data_folder, train_path, test_path, SEED,
    MAX_VOCAB_SIZE, BATCH_SIZE, N_EPOCHS, device) and writes three files to the
    current working directory:

    * 'Senti-model.pt' - state dict of the epoch with the lowest validation loss
    * 'TEXT.pkl'       - the pickled text field
    * 'LABEL.pkl'      - the pickled label field
    """
    # define the text and label field for model training
    TEXT = data.Field(tokenize='spacy', preprocessing=generate_bigrams)
    LABEL = data.LabelField(dtype=torch.float)

    # load the training and testing dataset; the test split is not used during training
    train_data, _test_data = Tweet.splits(TEXT, LABEL, path=data_folder, train=train_path, test=test_path)

    # random.seed() returns None, so seed first and hand the seeded generator
    # state to split() explicitly instead of passing the call's return value
    random.seed(SEED)
    train_data, valid_data = train_data.split(random_state=random.getstate())

    # build a vocabulary with training data
    TEXT.build_vocab(train_data,
                     max_size=MAX_VOCAB_SIZE,
                     vectors="glove.6B.100d",
                     unk_init=torch.Tensor.normal_)

    LABEL.build_vocab(train_data)

    # define iterators for training and validation
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=BATCH_SIZE,
        device=device)

    # decide model dimension
    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100  # matches the 100-d GloVe vectors requested above
    OUTPUT_DIM = 1
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    # define the model
    model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
    print(f'The model has {count_parameters(model):,} trainable parameters')

    # load pretrained word embeddings
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    # zero the embeddings of the unknown and padding tokens
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    # optimizer, criterion
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    # set model to CPU or GPU version
    model = model.to(device)
    criterion = criterion.to(device)

    # start model training; keep the weights of the best epoch on the validation set
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):

        start_time = time.time()
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'Senti-model.pt')

        print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

    # save the TEXT and LABEL field objects for prediction
    for file_name, field in (("TEXT.pkl", TEXT), ("LABEL.pkl", LABEL)):
        with open(file_name, 'wb') as output_file:
            pickle.dump(field, output_file)

# 5 Test model  accuracy on testing data

In [7]:

def test_model():
    """Evaluate the saved model on the test data and print loss and accuracy.

    The model, criterion and test iterator created inside train_model() are
    local to that function, so they are rebuilt here from the files it saved
    ('TEXT.pkl', 'LABEL.pkl', 'Senti-model.pt') and the module-level data paths.
    """
    # NOTE: pickle can execute arbitrary code; only load files written by train_model()
    with open("TEXT.pkl", 'rb') as file:
        TEXT = pickle.load(file)
    with open("LABEL.pkl", 'rb') as file:
        LABEL = pickle.load(file)

    # same loading call as in train_model(); only the test split is needed here
    _train_data, test_data = Tweet.splits(TEXT, LABEL, path=data_folder, train=train_path, test=test_path)
    test_iterator = data.BucketIterator(
        test_data,
        batch_size=BATCH_SIZE,
        device=device,
        train=False)

    # the dimensions must match the ones used in train_model()
    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    OUTPUT_DIM = 1
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
    model.load_state_dict(torch.load('Senti-model.pt', map_location=device))
    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)

    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')


# 6 Sentiment Classification on Unlabeled Data

In [8]:
def Sentiment_classification(data_folder = '', predict_path = ''):
    """Predict the sentiment of every example in an unlabeled csv file.

    Loads the fields and model weights saved by train_model() ('TEXT.pkl',
    'LABEL.pkl', 'Senti-model.pt') from the current working directory.

    Args:
        data_folder: folder passed to Tweet.splits as ``path``.
        predict_path: csv file to classify, passed to Tweet.splits as ``predict_path``.

    Returns:
        A list with one 0/1 prediction per example, in the order of the loaded
        dataset, so it can be joined positionally with the rows of the input.
    """
    # load the fitted fields (including their vocabularies)
    # NOTE: pickle can execute arbitrary code; only load files written by train_model()
    with open("TEXT.pkl", 'rb') as file:
        TEXT = pickle.load(file)
    with open("LABEL.pkl", 'rb') as file:
        LABEL = pickle.load(file)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    OUTPUT_DIM = 1
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    # define and load the saved model
    model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
    model.load_state_dict(torch.load('Senti-model.pt', map_location=device))

    # with predict=True, Tweet.splits sets train=None and test=predict_path and
    # returns a one-element tuple (test_data, ), so take the only element out
    predict_data = Tweet.splits(TEXT, LABEL, path=data_folder, predict = True, predict_path = predict_path)
    predict_data = predict_data[0]

    # A plain Iterator with sorting and shuffling switched off yields the examples
    # in dataset order. BucketIterator.splits treats its first dataset as training
    # data and shuffles it, which would scramble the predictions relative to the
    # rows of the input file.
    predict_iterator = data.Iterator(
        predict_data,
        batch_size=BATCH_SIZE,
        device=device,
        train=False,
        sort=False,
        sort_within_batch=False,
        shuffle=False)

    model = model.to(device)

    prediction = predict(model, predict_iterator)
    print('Sentiment classification finished')
    return prediction

#  7 Call the train, test and Sentiment_classification function to see the results

In [10]:
# need_Train = True  -> train and test the model (required the first time this is run;
#                       the first run will also download some corpus)
# need_Train = False -> only run sentiment classification with the saved model

need_Train = False

if not need_Train:
    Sentiment_classification(data_folder = './sentAnalysis_data', predict_path = 'prediction.csv')
else:
    train_model()
    test_model()
print('Done!')

Sentiment classification finished
Done!


# 8 Perform Sentiment Classification on Tweets in the Same Topic and Visualization

In this part we perform sentiment classification on tweets that belong to the same topic detected in "Tweet_Topic_Modeling.ipynb"

We later visualize positive and negative tweets respectively as word clouds to see whether sentiment polarization has a relationship with the words used in the tweets.

In [14]:
from pyecharts import options as opts
from pyecharts.globals import SymbolType

import numpy as np
import pandas as pd

from nltk import FreqDist, word_tokenize
from pyecharts.charts import WordCloud
import warnings
warnings.filterwarnings('ignore')

def count_word_freq(sent):
    """Return a FreqDist of the lower-cased tokens of one sentence."""
    tokens = word_tokenize(sent)
    return FreqDist(token.lower() for token in tokens)

def get_word_freq(tweets):
    """Return one FreqDist with the word counts accumulated over all ``tweets``."""
    combined = FreqDist()
    for tweet in tweets:
        # add this tweet's counts to the running totals
        combined.update(count_word_freq(tweet))
    return combined
def visualize(fdist, k_topic, polar):
    """Render the 30 most common words of ``fdist`` as a word cloud html file.

    Args:
        fdist: frequency distribution providing ``most_common``.
        k_topic: topic identifier, used in the chart title and file name.
        polar: polarity label, used in the chart title and file name.

    The chart is written to './word_clouds/Topic_<k_topic>_<polar>.html'; the
    folder is created if it does not exist yet.
    """
    import os  # local import: only needed to make sure the output folder exists

    num_words = 30
    words = fdist.most_common(num_words)

    output_dir = './word_clouds'
    os.makedirs(output_dir, exist_ok=True)

    chart_name = 'Topic_{}_{}'.format(k_topic, polar)
    cloud = WordCloud()
    cloud.add("", words, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
    cloud.set_global_opts(title_opts=opts.TitleOpts(title=chart_name))
    cloud.render('{}/{}.html'.format(output_dir, chart_name))


# For each of the 10 topic files: classify its tweets, report the share of each
# polarity and draw one word cloud per polarity.
for i in range(10):
    filename = './Topic_tweets/topic_{}.csv'.format(int(i))

    df_data = pd.read_csv(filename)
    res = Sentiment_classification(data_folder='./', predict_path=filename)
    if not res:
        # nothing was classified, so there is no percentage to report or cloud to draw
        print('Topic %d: no tweets to classify' % i)
        continue

    print('Topic %d: Percentage of negative user is %f, positive is %f' %(i, res.count(0)/len(res), res.count(1)/len(res)))

    # attach the predictions to the tweets by row position
    df_res = pd.DataFrame(np.array(res).reshape(-1, 1), columns=['polar'])
    df_res['polar'] = df_res['polar'].replace({0: 'neg', 1:'pos'})
    df_topic = pd.concat([df_data, df_res], axis=1)

    # group by the column name (not a one-element list) so that the group key is
    # the plain label 'neg'/'pos' rather than a 1-tuple on recent pandas versions
    for polar, df_polar in df_topic.groupby('polar'):
        tweets = df_polar['tweet'].dropna().tolist()
        fdist = get_word_freq(tweets)
        visualize(fdist, i, polar)
print('saved word cloud to ./word_clouds')

Sentiment classification finished
Topic 0: Percentage of negative user is 0.562992, positive is 0.437008
Sentiment classification finished
Topic 1: Percentage of negative user is 0.562347, positive is 0.437653
Sentiment classification finished
Topic 2: Percentage of negative user is 0.532646, positive is 0.467354
Sentiment classification finished
Topic 3: Percentage of negative user is 0.549593, positive is 0.450407
Sentiment classification finished
Topic 4: Percentage of negative user is 0.573134, positive is 0.426866
Sentiment classification finished
Topic 5: Percentage of negative user is 0.578680, positive is 0.421320
Sentiment classification finished
Topic 6: Percentage of negative user is 0.614815, positive is 0.385185
Sentiment classification finished
Topic 7: Percentage of negative user is 0.522917, positive is 0.477083
Sentiment classification finished
Topic 8: Percentage of negative user is 0.534884, positive is 0.465116
Sentiment classification finished
Topic 9: Percentage o