# Sentiment Analysis Using Logistic Regression from Scratch

Sentiment analysis is done using the twitter dataset i.e twitter_samples from nltk

In [1]:
import nltk
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

# Creating some helper function

process_tweet() -> cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.
 
build_freqs(): this counts how often a word in the 'corpus' (the entire set of tweets) was associated with a positive label '1' or a negative label '0', then builds the freqs dictionary, where each key is a (word,label) tuple, and the value is the count of its frequency within the corpus of tweets.

In [7]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 

In [8]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

In [9]:
## creating a function process_tweet that takes tweet as parameter and returns a list of clean tweets

def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  word not in string.punctuation): 
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)  # adding to list

    return tweets_clean

In [11]:
## Creating a function build_freqs that takes tweets and ys(y value) as parameter

def build_freqs(tweets, ys):
    # Convert np array to list since zip needs an iterable.
    # squeeze is necessary or the list ends up with one element.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1    
    return freqs

## Preparing the data 

The twitter_samples contains subsets of 5,000 positive tweets, 5,000 negative tweets, and the full set of 10,000 tweets.

In [12]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [15]:
## 20% for testing and 80% for training

# spliting the data into two pieces, one for training and one for testing (validation set) 
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [16]:
## Creating numpy array of positive and negative labels

## Also combining positive and negative labels

train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [17]:
# viewing the shape of train and test sets
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [18]:
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11346


## Logistic Regression

#### implementing sigmoid function

In [19]:
'''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
'''
def sigmoid(z): 
    # calculating the sigmoid of z
    denom = 1 + np.exp(-z)
    h = 1/denom    
    return h

In [20]:
## implementing gradient descent function 

'''
Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
            
Output:
        J: the final cost
        theta: your final weight vector
    
'''

def gradientDescent(x, y, theta, alpha, num_iters):
    # m is the number of rows in matrix x
    m = len(x)
    
    for i in range(0, num_iters):
        
        # z is the dot product of x and theta
        z = np.dot(x,theta)
        
        # h is the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function which is J i.e. loss
        J = -1/m * (np.matmul(np.transpose(y),np.log(h)) + np.matmul(np.transpose(1-y),np.log(1-h)))

        # updating the weights theta after finding loss
        theta = theta - alpha/m * (np.matmul(np.transpose(x),(h-y)))

    J = float(J)
    return J, theta

## Feature Extraction

* Given a list of tweets, we will extract the features and store them in a matrix. We will extract two features.
    * The first feature is the number of positive words in a tweet.
    * The second feature is the number of negative words in a tweet. 

In [22]:
'''
Input: 
    tweet: a list of words for one tweet
    freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
Output: 
    x: a feature vector of dimension (1,3)
'''

def extract_features(tweet, freqs):
    
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector i.e bias,positive,negative
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1
    
    # looping through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word,1),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word,0),0)

    return x

## Training the model 

To train the model:
* The features for all training examples are stacked into a matrix `X`. 
* Then calling gradientDescent function

In [23]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = train_y

# Applying gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24216529.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


## Predicting using Logistic Regression

Predicting whether a tweet is positive or negative.

* Given a tweet, process it, then extract the features.
* Applying the model's learned weights on the features to get the logits i.e. z.
* Applying the sigmoid to the logits to get the prediction (a value between 0 and 1).

In [24]:
'''
Input: 
    tweet: a string
    freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    theta: (3,1) vector of weights
Output: 
    y_pred: the probability of a tweet being positive or negative
'''

def predict_tweet(tweet, freqs, theta):
    # extracting the features of the tweet and storing it into x
    x = extract_features(tweet,freqs)
    
    # making the prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))
    
    return y_pred

In [27]:
# Checking the sentiment of our own tweet below
my_tweet = 'I am learning :)'
predict_tweet(my_tweet, freqs, theta)

array([[0.81636424]])

## Checking performance using test set

After training the model using the training set above, now we need to check how our model might perform on real,
unseen data, by testing it against the test set.

### Creating test_logistic_regression

* Given the test data and the weights of our trained model, calculate the accuracy of our logistic regression model. 
* Using `predict_tweet()` function to make predictions on each tweet in the test set.
* If the prediction is > 0.5, we set the model's classification `y_hat` to 1, otherwise set the model's classification `y_hat` to 0.
* A prediction is accurate when `y_hat` equals `test_y`. Finally Sum up all the instances when they are equal and divide by `m`.

In [29]:
"""
Input: 
    test_x: a list of tweets
    test_y: (m, 1) vector with the corresponding labels for the list of tweets
    freqs: a dictionary with the frequency of each pair (or tuple)
    theta: weight vector of dimension (3, 1)
Output: 
    accuracy: (# of tweets classified correctly) / (total # of tweets)
"""

def test_logistic_regression(test_x, test_y, freqs, theta):
    
    y_hat = []  # the list for storing predictions
    
    for tweet in test_x:
        
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1.0)  # append 1.0 to the list
        else:
            y_hat.append(0)    # append 0 to the list

    # Here y_hat is a list, but test_y is (m,1) array
    # converting both to one-dimensional arrays in order to compare them using the '==' operator
    test_y_1d=np.squeeze(test_y)
    y_hat_1d=np.asarray(y_hat)
    final_val=np.sum(test_y_1d==y_hat_1d)
    accuracy = final_val/len(y_hat)
  
    return accuracy

In [31]:
## Finally calculating our model's accuracy

tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950
