# Sentiment Analysis

<div>
<img src="https://i1.wp.com/thedatascientist.com/wp-content/uploads/2018/10/sentiment-analysis.png?ssl=1" height="650" width="650"/>
</div>

- Twitter US Airline Sentiment
- Analyze how travelers in February 2015 expressed their feelings on Twitter

Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment

# Import requirements

In [1]:
import os
import numpy
import csv
import re
import joblib
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, accuracy_score)

# Hide every CUDA device from this process before any model is loaded
# (presumably to force CPU-only execution — confirm this is intended)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Load pre-trained DistilBERT tokenizer and model

In [None]:
# Tokenizer and TensorFlow model are both loaded from the same
# 'distilbert-base-uncased' checkpoint name; later cells use these globals
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

# Step 1: Open Tweets.csv & extract features and labels

In [None]:
def tweet_extractor(csv_file, max_num = None):
    """
    Extract high-confidence tweets and their sentiment labels from a CSV file.

    input: csv_file = path to the CSV file; the first row is treated as a
           header and skipped. Column 1 is read as the label, column 2 as
           the label confidence and column 10 as the tweet text
    max_num = max number of examples to collect (None = no limit)
    Only rows whose confidence is >= 0.80 are kept. Rows that are too short
    or whose confidence is not a number are reported and skipped
    output: 2 lists (tweets and labels) of equal length
    """
    tweets = []
    labels = []
    # newline='' is what the csv module expects, so that line breaks inside
    # quoted tweets are parsed as part of the field
    with open(csv_file, encoding='utf-8', newline='') as handle:
        csv_reader = csv.reader(handle, delimiter=',')
        try:
            for index, row in enumerate(csv_reader):
                # skip first row (header)
                if index > 0:
                    try:
                        label = row[1]
                        sentiment_conf = float(row[2])
                        tweet = row[10]
                    except (IndexError, ValueError) as error:
                        # one malformed row must not end the whole extraction
                        print("Exception in row {}: {}".format(index, error))
                        continue

                    # keep only confidently annotated examples
                    if sentiment_conf >= 0.80:
                        tweets.append(tweet)
                        labels.append(label)

                # break if len tweets == max_num
                if len(tweets) == max_num:
                    print("max examples: {}".format(len(tweets)))
                    break

        except (csv.Error, UnicodeDecodeError) as error:
            # unreadable file content: report it and keep what was
            # extracted so far
            print("Exception: {}".format(error))

    return tweets, labels

# Read the dataset; extraction stops once 1000 examples have been collected
path_to_dataset = "../dataset/Tweets.csv"
tweets, labels = tweet_extractor(path_to_dataset, max_num = 1000)

# Plot label distribution

In [None]:
def label_distribution(labels):
    """
    Draw a bar chart showing how often each label occurs.

    input: array of labels
    Prints the per-label counts and the label names, then shows the chart
    output: None (plot of label distribution)
    """
    # distinct labels together with their number of occurrences
    names, counts = numpy.unique(labels, return_counts=True)
    names = names.astype("str")

    print(counts)
    print(names)

    # one axes object spanning the whole figure
    axes = plt.figure().add_axes([0, 0, 1, 1])
    axes.set_title('Label distribution')
    axes.set_ylabel('Count')
    axes.bar(names, counts)
    plt.show()

# Show how the extracted examples are spread over the sentiment labels
label_distribution(labels)

# Step 2: Remove duplicates

In [5]:
def duplicate_remover(tweets, labels):
    """
    Drop repeated tweets, keeping the first occurrence of each.

    input: 2 arrays (tweets and labels) of the same length
    Walks both arrays in parallel; a tweet whose text was already seen is
    dropped together with its label, so the original order is preserved
    output: 2 lists (unique tweets and their labels)
    """
    # texts seen so far; set membership keeps the whole pass linear
    seen = set()
    unique_tweets = []
    unique_labels = []

    for transcript, label in zip(tweets, labels):
        if transcript in seen:
            # duplicate of an earlier tweet: skip tweet and label
            continue
        seen.add(transcript)
        unique_tweets.append(transcript)
        unique_labels.append(label)

    return unique_tweets, unique_labels

# De-duplicate the extracted tweets; labels stay aligned with their tweets
unique_tweets, unique_labels = duplicate_remover(tweets, labels)

# Step 3: Preprocessing
- Collapse repeated spaces and trim leading/trailing whitespace

In [6]:
def processor(unique_tweets_array):
    """
    Normalize the spacing of every sentence.

    input: array of sentences
    Runs of consecutive spaces are collapsed into a single space and
    leading/trailing whitespace is stripped from each sentence
    output: list of cleaned sentences, in the same order
    """
    # one or more consecutive space characters
    space_run = re.compile(' +')

    return [space_run.sub(' ', tweet).strip() for tweet in unique_tweets_array]

# Replace the de-duplicated tweets with their cleaned versions
unique_tweets = processor(unique_tweets)

# Step 4: Extract dense features
- Extract one contextual embedding vector per tweet with DistilBERT

In [None]:
def bert_vectorizer(tweet_list):
    """
    Turn every sentence into one dense vector using the loaded model.

    input: array of sentences
    Each sentence is tokenized and passed through the model on its own;
    the vector at the first token position of the model's first output
    is kept as the feature of that sentence
    output: list with one numpy vector per sentence
    """
    vectors = []
    progress = tqdm(tweet_list, total=len(tweet_list), desc="progress")

    for sentence in progress:
        tokens = tokenizer(sentence, return_tensors='tf')
        # first model output -> numpy -> first (only) sequence -> first token
        first_output = model(tokens)[0]
        vectors.append(first_output.numpy()[0][0])

    return vectors

# Vectorize every cleaned tweet (one model call per tweet)
train_features = bert_vectorizer(unique_tweets)

# Checkpoint 

In [None]:
# Sanity check: dimensions of the feature matrix and the first ten
# components of the first feature vector
print(f"shape of train_features: {numpy.array(train_features).shape}")
print(f"example of vectorized training data: {train_features[0][:10]}")

# Step 5: Split data into training and validation sets

In [9]:
# Hold out 20% of the examples for validation. stratify keeps the label
# proportions the same in both parts; the fixed random_state makes the
# split itself repeatable between runs.
x_train, x_val, y_train, y_val = train_test_split(train_features,
                                                  unique_labels,
                                                  test_size = 0.20,
                                                  stratify = unique_labels,
                                                  random_state = 42)

# Step 6: Classification algorithms

# MLP model

In [None]:
# Train a multi-layer perceptron with scikit-learn's default settings
# (verbose=True prints progress messages), then predict the labels of
# the held-out validation features
clf = MLPClassifier(verbose=True)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_val)

# Model evaluation

In [None]:
# Per-class report, then the confusion matrix, a blank separator line,
# and finally the overall accuracy on the validation split
print(classification_report(y_val,y_pred))
print(confusion_matrix(y_val,y_pred))
print()
print(accuracy_score(y_val,y_pred))

# Save model to disk

In [None]:
# Target directory (relative path) and base file name, without extension,
# for the trained classifier
path_to_model_dir = "../models/"
model_filename = 'BERT_MLP_model'

def save_to_disk(model_object, path_to_model_dir, filename):
    """
    Serialize an object with joblib to <path_to_model_dir>/<filename>.joblib.

    input: model_object = object to store
    path_to_model_dir = target directory (created when it does not exist)
    filename = file name without the ".joblib" extension
    output: None
    """
    # create the target directory up front; open() fails on a missing one
    if path_to_model_dir:
        os.makedirs(path_to_model_dir, exist_ok=True)

    # join works with or without a trailing separator on the directory
    target_path = os.path.join(path_to_model_dir, filename + ".joblib")

    # open the target file in binary mode and dump the object into it
    with open(target_path, "wb") as file:
        joblib.dump(model_object, file)

# Persist the trained classifier; the message below is only reached when
# saving did not raise
save_to_disk(clf, path_to_model_dir, model_filename)

print("model is successfully saved")