In [126]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator

import re                                  # library for regular expression operations
import string                              # for string operations
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
from nltk.stem import WordNetLemmatizer

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.regularizers import l2

from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import math

In [83]:
def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + e**-z)`` of ``z``.

    Args:
        z: A Python number, or any object that supports unary minus and
            being used as the exponent of a float. The expression is applied
            as written, so array-like inputs rely on their own operators.

    Returns:
        The sigmoid of ``z``. For plain Python numbers this is a float
        in the closed interval [0, 1].
    """
    try:
        g = 1 / (1 + math.e ** -z)
    except OverflowError:
        # Plain Python floats raise instead of saturating once e**-z leaves
        # the float range (z below roughly -709). The limit there is 0.
        g = 0.0

    return g

In [84]:
def print_confusion_matrix(true_labels, predictions):
    """Plot the confusion matrix and print precision, recall and F1 score.

    Args:
        true_labels: Ground-truth labels.
        predictions: Predicted labels to compare against ``true_labels``.
    """
    # Build the confusion matrix and hand it straight to the plotting helper.
    plot_confusion_matrix(conf_mat=confusion_matrix(true_labels, predictions))

    # Evaluate the three scores in order (precision, recall, F1) before
    # reporting any of them.
    precision, recall, f1 = [
        scorer(true_labels, predictions)
        for scorer in (precision_score, recall_score, f1_score)
    ]

    # Report each score with two decimal places.
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

In [85]:
def calculate_accuracy(labels, predictions):
    """Print the percentage of predictions that match their labels.

    Both inputs are converted to flat NumPy arrays before comparison, so
    plain lists and column-shaped ``(n, 1)`` inputs are compared element by
    element rather than as whole objects or through broadcasting.

    Args:
        labels: Sequence or array of ground-truth labels.
        predictions: Sequence or array of predicted labels holding the same
            number of elements as ``labels``.

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    labels_flat = np.asarray(labels).ravel()
    predictions_flat = np.asarray(predictions).ravel()

    if labels_flat.size != predictions_flat.size:
        raise ValueError(
            f"labels has {labels_flat.size} elements but predictions has "
            f"{predictions_flat.size}"
        )

    # Calculate the total number of predictions
    total_predictions = labels_flat.size

    if total_predictions == 0:
        # Nothing to score: report NaN rather than dividing by zero.
        accuracy_percentage = float("nan")
    else:
        # Count the positions where label and prediction agree
        correct_predictions = np.sum(labels_flat == predictions_flat)

        # Calculate the accuracy as a percentage
        accuracy_percentage = (correct_predictions / total_predictions) * 100.0

    print(f"Accuracy: {accuracy_percentage:.2f}%")

In [86]:
def get_predictions(feature_data):
    """Convert model outputs for ``feature_data`` into a list of 0/1 labels.

    Relies on a notebook-level ``model`` callable and on the ``sigmoid``
    helper, both of which must exist before this function is called.

    Args:
        feature_data: Input passed unchanged to ``model``.

    Returns:
        A list of ints: 1 where the sigmoid of the model output is >= 0.5,
        otherwise 0.
    """
    # Raw outputs of the trained model; the next step treats them as logits.
    # NOTE(review): if the model's last layer already applies a sigmoid, this
    # would squash the values twice — confirm against the model definition.
    raw_outputs = model(feature_data)

    # Map to probabilities and threshold at 0.5 in one step.
    above_threshold = sigmoid(raw_outputs) >= 0.5

    # True -> 1, False -> 0
    return [int(flag) for flag in above_threshold]

In [20]:
##clean tweets
def clean_tweets(tweet_lista):
    """Strip retweet markers, hyperlinks and hash signs from raw tweets.

    Args:
        tweet_lista: Iterable of tweet strings.

    Returns:
        A new list holding one cleaned string per input tweet, in the same
        order.
    """
    cleaned_list = []

    for tweet in tweet_lista:
        # remove old style retweet text "RT" at the very start of the tweet
        tweet2 = re.sub(r'^RT[\s]+', '', tweet)

        # remove hyperlinks: match only the URL itself (up to the next
        # whitespace) plus any line breaks directly after it, so words that
        # follow a link on the same line are kept
        tweet2 = re.sub(r'https?://\S*[\r\n]*', '', tweet2)

        # remove hashtags
        # only removing the hash # sign from the word
        tweet2 = re.sub(r'#', '', tweet2)

        cleaned_list.append(tweet2)

    return cleaned_list


In [18]:
def tokenize_tweets(tweet_list1):
    """Split every tweet into a list of tokens.

    A single ``TweetTokenizer`` is created with ``preserve_case=False``,
    ``strip_handles=True`` and ``reduce_len=True`` and reused for all tweets.

    Args:
        tweet_list1: Iterable of tweet strings.

    Returns:
        A list with one token list per input tweet, in the same order.
    """
    tweet_tokenizer = TweetTokenizer(
        preserve_case=False,
        strip_handles=True,
        reduce_len=True,
    )

    return [tweet_tokenizer.tokenize(raw_tweet) for raw_tweet in tweet_list1]

In [31]:
def remove_stopwords(tweet_list2, stopword_list=None):
    """Drop stop words and punctuation tokens from tokenized tweets.

    Args:
        tweet_list2: Iterable of token lists, one per tweet.
        stopword_list: Optional iterable of stop words to remove. When
            omitted, the notebook-level ``stopwords_english`` is used, which
            must already be defined at call time.

    Returns:
        A new list of token lists, in input order, containing only the
        tokens that are neither stop words nor punctuation.
    """
    if stopword_list is None:
        # Default to the notebook-level stop word list.
        stopword_list = stopwords_english

    # Build the set once: hashed membership instead of scanning the whole
    # stop word list for every token.
    stopword_set = set(stopword_list)

    cleaned_tweet_list = []

    for tweet_tokens in tweet_list2:
        # `in string.punctuation` is a substring test on a str, so besides
        # single punctuation characters it also drops the few multi-character
        # tokens that occur contiguously in that string (e.g. "()").
        cleaned_tweet = [
            word
            for word in tweet_tokens
            if word not in stopword_set and word not in string.punctuation
        ]
        cleaned_tweet_list.append(cleaned_tweet)

    return cleaned_tweet_list

In [39]:
def lemmatize_tweets(tweet_list):
    """Lemmatize every token of every tweet with NLTK's WordNet lemmatizer.

    ``nltk.download('wordnet')`` is invoked on every call before the
    lemmatizer is created.

    Args:
        tweet_list: Iterable of token lists, one per tweet.

    Returns:
        A list of token lists, in input order, with each token replaced by
        the result of ``WordNetLemmatizer.lemmatize``.
    """
    # Request the WordNet corpus through NLTK's downloader.
    nltk.download('wordnet')
    wordnet_lemmatizer = WordNetLemmatizer()

    return [
        [wordnet_lemmatizer.lemmatize(token) for token in tokens]
        for tokens in tweet_list
    ]


In [42]:
def stem_tweets(tweet_list):
    """Stem every token of every tweet with NLTK's Porter stemmer.

    Args:
        tweet_list: Iterable of token lists, one per tweet.

    Returns:
        A list of token lists, in input order, with each token replaced by
        the result of ``PorterStemmer.stem``.
    """
    # One stemmer instance is shared across all tweets.
    porter = PorterStemmer()

    return [
        [porter.stem(token) for token in tokens]
        for tokens in tweet_list
    ]

In [2]:
# Download the sample Twitter dataset through NLTK's downloader.
# The call is live (not commented out), so it runs every time this cell is executed.
nltk.download('twitter_samples')


[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/krcd58/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Load the tweet texts from the two files of the twitter_samples corpus:
# one list for the positive tweets and one for the negative tweets.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
# Report how many tweets each list holds.
print('Number of positive tweets: ', len(all_positive_tweets))
print('Number of negative tweets: ', len(all_negative_tweets))

# Report the container type and the type of a single entry
# (the sample entry is taken from the negative list).
print('\nThe type of all_positive_tweets is: ', type(all_positive_tweets))
print('The type of a tweet entry is: ', type(all_negative_tweets[0]))

Number of positive tweets:  5000
Number of negative tweets:  5000

The type of all_positive_tweets is:  <class 'list'>
The type of a tweet entry is:  <class 'str'>


In [None]:
# Load the English stop word list from NLTK.
# `remove_stopwords` reads this notebook-level name when it is called, so this
# cell must be executed before that function is used.
# NOTE(review): presumably the NLTK 'stopwords' corpus must already be
# downloaded — no download call for it is visible in this part of the notebook.
stopwords_english = stopwords.words('english') 

# Show the full stop word list.
print('Stop words\n--------------------\n')
print(stopwords_english)

# Show the punctuation characters that `remove_stopwords` also filters on.
print('\nPunctuation\n----------------------\n')
print(string.punctuation)

In [12]:
# Create the labels array: 1.0 for every positive tweet, 0.0 for every
# negative tweet. The sizes come from the loaded lists rather than a
# hard-coded count, so the labels always line up with the tweets.
positive_labels = np.ones(len(all_positive_tweets))
negative_labels = np.zeros(len(all_negative_tweets))

# Positive labels first, then negative.
tweet_labels = np.concatenate((positive_labels, negative_labels))
print(tweet_labels.shape)

(10000,)


In [13]:
# Concatenate the positive tweets followed by the negative tweets and store
# the result as a single NumPy array of tweet texts.
all_tweets = np.array(all_positive_tweets + all_negative_tweets)
print(type(all_tweets))

<class 'numpy.ndarray'>


In [15]:
# Length of every entry in all_tweets, measured with len() on each tweet string
# (i.e. a character count).
length = [len(i) for i in all_tweets]
# NOTE(review): the label says "Review" although the data are tweets; the
# printed text is left unchanged here.
print("The Average Review length is", np.mean(length))
# The standard deviation is rounded to the nearest integer for display.
print("The Standard Deviation is", round(np.std(length)))

The Average Review length is 68.5377
The Standard Deviation is 37
