# Sentiment Analysis using Logistic Regression

## Imports and loading dataset

In [1]:
import nltk
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
import random
import numpy as np

In [2]:
# Uncomment below to download the nltk twitter dataset
# nltk.download('twitter_samples')

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
print(type(positive_tweets))
print(type(positive_tweets[0]))

<class 'list'>
<class 'str'>


In [5]:
print(negative_tweets[7])

@f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln


# Preprocessing

Steps involved in preprocessing text data:

1. Lowercasing all words
2. Remove punctuations
3. Tokenizing into words
4. Remove stop words (After we have the tokenized words)
5. Stemming to reduce size of words dictionary

In [None]:
# Uncomment below to download stopwords
# nltk.download('stopwords')

In [6]:
import re            # Regular Expressions
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

### 1. Remove Hashtags and links inside the tweets

In [7]:
tweet = positive_tweets[311]
print(tweet)

@ForkH Hi, may you like play my newest gamejam game, i would be very happy about it :) http://t.co/8J5voDDHcs


In [8]:
tweet = re.sub(r'^RT[\s]+', '', tweet)                # Remove retweet text "RT"
tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)   # Remove https://, website links
tweet = re.sub(r'#', '', tweet)                      # Remove hashtags
print(tweet)

@ForkH Hi, may you like play my newest gamejam game, i would be very happy about it :) 


### 2. Tokenize the string

In [9]:
print(tweet)
# Use preserve_case = False to also lowercase all strings
tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True, strip_handles = True)
tweet_tokens = tokenizer.tokenize(tweet)
print(tweet_tokens)

@ForkH Hi, may you like play my newest gamejam game, i would be very happy about it :) 
['hi', ',', 'may', 'you', 'like', 'play', 'my', 'newest', 'gamejam', 'game', ',', 'i', 'would', 'be', 'very', 'happy', 'about', 'it', ':)']


### 3. Remove stopwords

In [10]:
english_stopwords = stopwords.words('english')
print(english_stopwords)
print('\n')
print(string.punctuation)    # Access punctuations using string.punctuation

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
print(tweet_tokens)

cleaned_tweet = []
for token in tweet_tokens:
    if token in english_stopwords or token in string.punctuation:
        continue
    else:
        cleaned_tweet.append(token)

print(cleaned_tweet)

['hi', ',', 'may', 'you', 'like', 'play', 'my', 'newest', 'gamejam', 'game', ',', 'i', 'would', 'be', 'very', 'happy', 'about', 'it', ':)']
['hi', 'may', 'like', 'play', 'newest', 'gamejam', 'game', 'would', 'happy', ':)']


### 4. Stemming
Stemming means converting a word to its most general form: Happy, Happier, Happiest, all essentially convey same meaning, so we can stem them all to "happi", reducing the dictionary size

In [12]:
print(cleaned_tweet)

stemmer = PorterStemmer()
stemmed_tweet = []

for token in cleaned_tweet:
    stemmed_tweet.append(stemmer.stem(token))

print(stemmed_tweet)

['hi', 'may', 'like', 'play', 'newest', 'gamejam', 'game', 'would', 'happy', ':)']
['hi', 'may', 'like', 'play', 'newest', 'gamejam', 'game', 'would', 'happi', ':)']


## Preprocessing Function
Combining all the above steps into a single function

In [13]:
english_stopwords = stopwords.words('english')
stemmer = PorterStemmer()

def preprocess_tweet(tweet):
    
    '''
        Input: Tweet, a string
        Output: A processed, tokenized (array) of words, after removing stopwords and stemming
    '''
    
    # Re operations
    tweet = re.sub(r'^RT[\s]+', '', tweet)               # Remove retweet text "RT"
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)   # Remove https://, website links
    tweet = re.sub(r'#', '', tweet)                      # Remove hashtags
    
    # Tokenize
    tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True, strip_handles = True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Remove Stopwords
    cleaned_tweet = []
    for token in tweet_tokens:
        if token in english_stopwords or token in string.punctuation:
            continue
        else:
            cleaned_tweet.append(token)
        
    # Stemming
    stemmed_tweet = []

    for token in cleaned_tweet:
        stemmed_tweet.append(stemmer.stem(token))
        
    return stemmed_tweet

In [17]:
for i in range(3):
    n = np.random.randint(0, 4999)
    pos_or_neg = np.random.randint(0, 2)
    
    if(pos_or_neg):
        print(positive_tweets[n])
        print(preprocess_tweet(positive_tweets[n]))
    else:
        print(negative_tweets[n])
        print(preprocess_tweet(negative_tweets[n]))
    print('\n')

Enjoy a cute baby panda! :) http://t.co/9m6RWHsQEr http://t.co/UO77pIgatL
['enjoy', 'cute', 'babi', 'panda', ':)']


@thenwchica thank you :) 🍰
['thank', ':)', '🍰']


See you on monday 2EMT!!! :)
['see', 'monday', '2emt', ':)']




# Building word frequencies

Goal: Build a dictionary of words (unique) and then see how many times this word appears in positive tweets and in negative tweets.

In [18]:
print(len(positive_tweets))
print(len(negative_tweets))

labels = np.append(np.ones(len(positive_tweets)), np.zeros(len(negative_tweets)))
print(labels)

5000
5000
[1. 1. 1. ... 0. 0. 0.]


In [19]:
def build_frequencies(tweets, y_list):
    
    '''
        Inputs: 
        Tweets: The entire list of tweets, each of which is a string
        y_list: Labels for tweets
        
        Output:
        The word frequency dictionary
    
    '''

    # y: Sentiment label of each tweet (Actual labels)
    y_list = np.squeeze(y_list).tolist()        # Squeezing to a list so that we can zip it.
    
    word_freqs = {}
    for y, tweet in zip(y_list, tweets):
        preprocessed_tweet = preprocess_tweet(tweet)
        for word in preprocessed_tweet:
            pair = (word, y)
            if(word_freqs.get(pair) is None):
                word_freqs[pair] = 1
            else:
                word_freqs[pair] += 1
                
    return word_freqs

In [20]:
all_tweets = positive_tweets + negative_tweets
freqs = build_frequencies(all_tweets, labels)

Now testing these on a few words:

In [21]:
test_words = ['happy', 'sad', 'furious', 'oh', 'no', 'well', 'magnificent', 'power', 'beautiful', 'house', 'shit', ':)', ':(']

for i in range(len(test_words)):
    test_words[i] = stemmer.stem(test_words[i])

for i in range(len(test_words)):
    word = test_words[i]
    positives = freqs.get((word, 1))
    negatives = freqs.get((word, 0))
    if(positives is None): positives = 0
    if(negatives is None): negatives = 0
    print(f"Word is {word}, positive frequencies: {positives}, negatives: {negatives}.")

Word is happi, positive frequencies: 211, negatives: 25.
Word is sad, positive frequencies: 5, negatives: 123.
Word is furiou, positive frequencies: 0, negatives: 0.
Word is oh, positive frequencies: 53, negatives: 92.
Word is no, positive frequencies: 0, negatives: 0.
Word is well, positive frequencies: 81, negatives: 56.
Word is magnific, positive frequencies: 2, negatives: 0.
Word is power, positive frequencies: 7, negatives: 6.
Word is beauti, positive frequencies: 50, negatives: 11.
Word is hous, positive frequencies: 7, negatives: 16.
Word is shit, positive frequencies: 13, negatives: 36.
Word is :), positive frequencies: 3568, negatives: 2.
Word is :(, positive frequencies: 1, negatives: 4571.


# Logistic Regression Model

## 1. Create Training and Test sets

In [22]:
train_pos = positive_tweets[:4000]
train_neg = negative_tweets[:4000]
test_pos  = positive_tweets[4000:]
test_neg  = negative_tweets[4000:]

X_train = train_pos + train_neg
X_test  = test_pos + test_neg

In [23]:
# It is important to make the shape: np.ones((len(train_pos), 1)), and axis = 0!
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [24]:
print("train_y.shape = " + str(y_train.shape))  # Len = 4000 * 2 
print("test_y.shape = " + str(y_test.shape))    # Len = 1000 * 2

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


Note that the shape is 8000, 1 and 2000, 1 meaning its a column vector, as expected, since the features matrix X has the feautures of all training examples as m rows stacked

In [25]:
freqs = build_frequencies(X_train, y_train)
# Build frequencies of all words avaialable in all tweets, now represented as X_train
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11338


## 2. Sigmoid

In [26]:
def sigmoid(z):
    # Use np.exp() to exponentiate an entire array
    return 1/(1 + np.exp(-z))

In [27]:
# verify that when the model predicts close to 1, but the actual label is 0, the loss is a large positive value
a1 = -1 * (1 - 0) * np.log(1 - 0.9999) # loss is about 9.2

# verify that when the model predicts close to 0 but the actual label is 1, the loss is a large positive value
a2 = -1 * np.log(0.0001) # loss is about 9.2

# verify that when the model predicts close to 1 and the actual label is 1, the loss is a small positive value
a3 = -1 * np.log(0.999) # loss is about 9.2

print(a1, a2, a3)

# What we Predict is the value INSIDE the log

9.210340371976294 9.210340371976182 0.0010005003335835344


## 3. Gradient Descent function

In [28]:
def gradient_descent(X, y, theta, alpha, num_iterations):
    
    '''
        Shapes:
        X      : m by 3, 3 columns for each training example. First column is 1, second column is sigma(pos), third sigma(neg)
        y      : m by 1, actual labels
        theta  : 3 by 1.  
        
        Outputs:
        J      : The cost function value after num_iterations
        theta  : The best model produced
    
    '''
    m = len(X) # Number of training examples: Each row represents one training example
    
    for i in range(0, num_iterations):
        z = np.dot(X, theta)
        
        h = sigmoid(z)
        
        J = (-1/m) * (np.dot(y.transpose(), np.log(h)) + np.dot((1 - y).transpose(), np.log(1 - h)))
        
        theta = theta - (alpha/m) * np.dot(X.transpose(), (h - y))
        
    J = float(J)
    
    return J, theta

In [29]:
# Testing Gradient descent
np.random.seed(1)
tempX = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
tempY = (np.random.rand(10, 1) > 0.35).astype(float)

print(tempX)
print(tempY)

[[1.00000000e+00 8.34044009e+02 1.44064899e+03]
 [1.00000000e+00 2.28749635e-01 6.04665145e+02]
 [1.00000000e+00 2.93511782e+02 1.84677190e+02]
 [1.00000000e+00 3.72520423e+02 6.91121454e+02]
 [1.00000000e+00 7.93534948e+02 1.07763347e+03]
 [1.00000000e+00 8.38389029e+02 1.37043900e+03]
 [1.00000000e+00 4.08904499e+02 1.75623487e+03]
 [1.00000000e+00 5.47751864e+01 1.34093502e+03]
 [1.00000000e+00 8.34609605e+02 1.11737966e+03]
 [1.00000000e+00 2.80773877e+02 3.96202978e+02]]
[[1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]]


In [30]:
tempJ, tempTheta = gradient_descent(tempX, tempY, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tempJ:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tempTheta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


## 4. Creating and extracting features from the text

For each tweet (training example), we need to make a 3x1 vector : [1, No. of positive words, No. of negative words]

In [31]:
# Extracts features from ONE tweet, and also hence processes it
def extract_features(tweet, freqs):
    
    '''
        Inputs:
        Tweet, a string (SINGLE tweet) (Not processed, just a string, we will process it from inside here)
        Freqs, the dictionary produced by calling build_frequencies()
        
        Output:
        x, the features for the given tweet, of the form [1, pos_counts, neg_counts]
    
    '''

    # Return x, the feature vector for a tweet
    tweet_processed = preprocess_tweet(tweet)
    
    x = np.zeros((1, 3))
    x[0, 0] = 1
    
    for word in tweet_processed:
        pos_counts = freqs.get((word, 1))
        if(pos_counts is None): pos_counts = 0
        x[0, 1] += pos_counts
        
        # increment the word count for the negative label 0
        neg_counts = freqs.get((word, 0))
        if(neg_counts is None): neg_counts = 0
        x[0, 2] += neg_counts
    
    assert(x.shape == (1, 3))
    return x

In [33]:
temp1 = extract_features(X_train[0], freqs)
print(temp1)

[[1.00e+00 3.02e+03 6.10e+01]]


In [35]:
temp2 = extract_features('Hello please extract features from this string :(', freqs)
print(temp2)

[[1.000e+00 1.340e+02 3.917e+03]]


## 5. Training the model on Training set
Todo: Stack the feature vectors for all training examples. This creates X
Call gradient descent on X, using y, to create an optimal theta

In [37]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
    X[i, :]= extract_features(X_train[i], freqs)
Y = y_train

In [38]:
J, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

In [39]:
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24215666.
The resulting vector of weights is [7e-08, 0.00052391, -0.00055517]


In [40]:
print(theta)

[[ 7.25196582e-08]
 [ 5.23910559e-04]
 [-5.55170175e-04]]


## 6. Testing the model on Test set

In [41]:
# Given the Tweet, Freqs table and the 3-numbered-model theta, predict if a tweet is postive or negative
def predict_tweet_sentiment(tweet, freqs, theta):
    
    '''
        Input: A tweet, (string, not processed)
        Freqs: The dict obtained from calling build_frequencies()
        Theta: The model created by training on the training set
    
    '''
    
    X = extract_features(tweet, freqs) # NOTE: Preprocessing is already being done in the extract_features function
    
    y_pred = sigmoid(np.dot(X, theta))
    
    return y_pred

In [42]:
# Testing on Test-Dataset:
def test_set_logistic_regression(X_test, y_test, freqs, theta):
    
    '''
        Input:
        X_test and y_test: X_test includes the test set tweets, not processed
        freqs and theta
        
        Output:
        An accuracy vector comparing our predictions, y_pred, to actual values, y_test
    
    '''

    y_pred = []
    for tweet in X_test:
        y_pred_cur = predict_tweet_sentiment(tweet, freqs, theta)
        if(y_pred_cur > 0.5):
            y_pred.append(1)
        else:
            y_pred.append(0)
    
    accuracy_vector = (y_pred == np.squeeze(y_test)).sum()/len(X_test)
    return accuracy_vector

In [43]:
test_accuracy = test_set_logistic_regression(X_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {test_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


### Prediction on any sample tweet

In [44]:
tweet = str(input("Enter your tweet..."))

prediction = predict_tweet_sentiment(tweet, freqs, theta)
if(prediction > 0.5):
    print("Positive tweet")
else:
    print("Negative tweet")

Enter your tweet...Hello :)
Positive tweet
