In [42]:
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Natural language processing libraries to prepare data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.utils import simple_preprocess

# Scikit-learn to prepare training/testing data and visulisation 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# TensorFlow to build model 
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

## Text Sentiment Analysis Using Natural Language Processing 

- Classification technique used to analyse text and categorise it into classes, e.g. positive, neutral or negative
- The process involves breaking the data (text) down into components, for example sentences or words
- Then assign each component a value to indicate sentiment

## Preparing Data

- We need to process and choose how to compartmentalise our data
- There are many options from sentences to words or buzzwords which may change the accuracy of the model
- In our analysis we define buzzwords as words that are meaningful, i.e. words longer than 2 characters that are not stopwords (for example pronouns or articles)
- We use NLTK's list of 'Stopwords' and filter through to make a class of our own buzzwords
- We then create an array of the buzzwords that appear in each slice of data

In [43]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

# Read the list through `nltk.corpus.stopwords` rather than through the bare
# name `stopwords`: this cell rebinds `stopwords` from the imported corpus
# loader to a plain list, so referring to the bare name here would raise
# AttributeError the second time the cell is run.
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Step 1: process data
def text_processing(df):
    """Add cleaned-text columns to ``df`` and return it.

    Two columns are written to ``df`` in place:

    * ``'Text Without Punctuation'`` - ``df['Text']`` with every character of
      ``string.punctuation`` removed.
    * ``'Text Without Punctuation and Stopwords'`` - for each row, the list of
      tokens produced by ``simple_preprocess`` that are longer than 2
      characters and are not in the module-level ``stopwords`` collection.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Text'`` column. Assumes every value in that column
        is a ``str`` (missing values are not handled) - TODO confirm upstream.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns above added/overwritten.

    Notes
    -----
    Relies on the module-level ``stopwords`` having already been rebound to
    the collection of English stop words by an earlier cell.
    """
    # Build the translation table and the lookup set once, not once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stopword_set = set(stopwords)

    # Remove punctuation
    def remove_punc(text):
        return text.translate(punctuation_table)

    # Tokenise and filter out short tokens and stopwords
    def preprocess(text):
        return [
            token
            for token in simple_preprocess(text)
            if len(token) > 2 and token not in stopword_set
        ]

    # Update data
    df['Text Without Punctuation'] = df['Text'].apply(remove_punc)
    df['Text Without Punctuation and Stopwords'] = df['Text Without Punctuation'].apply(preprocess)

    return df

## Model Training & Testing
- First we will prepare our dataframe
- Then choose the proportion of data to train
- Then create our recurrent neural network
- Then visualise the result of our model

In [45]:
def _build_lstm_classifier(vocab_size):
    """Build and compile the Embedding -> LSTM -> Dense two-class classifier."""
    model = Sequential()

    # Embedding layer: one row per word index, including index 0 used for padding
    model.add(Embedding(vocab_size, output_dim = 512))

    model.add(LSTM(256))

    # Dense layers
    model.add(Dense(128, activation = 'relu'))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation = 'softmax'))
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['acc'])
    return model


def sentiment_analysis(df, test_size = 0.1, max_len = 29, epochs = 1, random_state = None):
    """Train and evaluate an LSTM sentiment classifier on ``df``.

    The text is cleaned with ``text_processing`` (which adds columns to ``df``
    in place), split into train/test sets, tokenised, padded, and fed to an
    Embedding -> LSTM -> Dense model. The accuracy on the test split is
    printed and a confusion-matrix heatmap is drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Text'`` column and a ``'Sentiment'`` column. The
        sentiment values are one-hot encoded into 2 classes, so they must be
        the integer class ids 0 or 1.
    test_size : float, default 0.1
        Proportion of rows held out for testing.
    max_len : int, default 29
        Length every token sequence is padded/truncated to.
    epochs : int, default 1
        Number of training epochs.
    random_state : int or None, default None
        Seed passed to ``train_test_split``. It does not seed TensorFlow, so
        model weights still vary between runs unless TensorFlow is seeded
        separately.

    Returns
    -------
    tuple
        ``(accuracy, ax)`` - the test-set accuracy and the matplotlib Axes
        holding the confusion-matrix heatmap.
    """
    df = text_processing(df)

    X = df['Text Without Punctuation and Stopwords']
    y = df['Sentiment']

    # Number of distinct words in the data. Word indices start at 1 (0 is the
    # padding value) and the Tokenizer only keeps indices below `num_words`,
    # so both the Tokenizer and the Embedding layer need one extra slot.
    num_words = len({word for words in X for word in words})
    vocab_size = num_words + 1

    # Step 2: tokenizing and padding

    # Choose proportion of data to train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state
    )

    # Tokenize words (vocabulary is learnt from the training split only)
    tokenizer = Tokenizer(num_words = vocab_size)
    tokenizer.fit_on_texts(X_train)

    train_sequences = tokenizer.texts_to_sequences(X_train)
    test_sequences = tokenizer.texts_to_sequences(X_test)

    # Padding: train and test must be padded on the same side, otherwise the
    # model is evaluated on inputs laid out differently from what it learnt.
    padded_train = pad_sequences(train_sequences, maxlen = max_len, padding = 'post', truncating = 'post')
    padded_test = pad_sequences(test_sequences, maxlen = max_len, padding = 'post', truncating = 'post')

    # Categorical 2D representation
    cat_y_train = to_categorical(y_train, 2)
    cat_y_test = to_categorical(y_test, 2)

    model = _build_lstm_classifier(vocab_size)

    # Step 3: train the model
    model.fit(padded_train, cat_y_train, batch_size = 32, validation_split = 0.2, epochs = epochs)

    # Step 4: test the model
    predicted = model.predict(padded_test)
    prediction = np.argmax(predicted, axis = 1)
    original = np.argmax(cat_y_test, axis = 1)

    # Step 5: analysis of the model
    accuracy = accuracy_score(original, prediction)
    print("The model has an accuracy of:", accuracy)

    # Draw the heatmap once, on its own axes. `labels` keeps the matrix 2x2
    # even when the test split happens to contain a single class.
    matrix = confusion_matrix(original, prediction, labels = [0, 1])
    fig, ax = plt.subplots()
    sns.heatmap(matrix, annot = True, fmt = 'd', ax = ax)
    ax.set(title = 'Confusion matrix', xlabel = 'Predicted label', ylabel = 'True label')

    return accuracy, ax