### Importing necessary libraries

In [9]:
import nltk
from nltk.corpus import stopwords, twitter_samples # this is our dataset for our model
import string
import re
import pickle # this is used to load/store data from disk
import numpy as np

### Data preprocessing
Here, the <b>stemmer.stem()</b> function reduces each word to its stem, for example: learning -> learn, running -> run, ruins -> ruin, completed -> complet. Note that a Porter stem is not always a dictionary word (e.g. happy -> happi).

In [13]:
def processing_tweets(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order: strip stock tickers such as ``$GE``, a leading ``RT``
    retweet marker, hyperlinks and ``#`` signs (the hashtag word itself is
    kept); tokenize with NLTK's ``TweetTokenizer`` (lower-cased, @handles
    removed, elongated words shortened); drop English stopwords and
    punctuation tokens; Porter-stem whatever remains.

    Args:
        tweet: Raw tweet text.

    Returns:
        list of str: The stemmed tokens, in their original order.
    """
    stemmer = nltk.PorterStemmer()
    # A set gives O(1) membership tests; the corpus reader returns a list.
    stopwords_english = set(stopwords.words('english'))
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Match only up to the next whitespace so that words following a link on
    # the same line are kept instead of being deleted together with the URL.
    tweet = re.sub(r'https?://\S*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        # `in string.punctuation` is a substring test, so single punctuation
        # marks are dropped while emoticons such as ':)' are kept.
        if word in stopwords_english or word in string.punctuation:
            continue
        tweets_clean.append(stemmer.stem(word))

    return tweets_clean
  

Let us now create a function that maps each (word, sentiment class) pair, where the class is positive or negative, to the number of times that word occurs in tweets of that class.

In [14]:
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each label.

    Args:
        tweets: Sequence of raw tweet strings.
        ys: Labels aligned with ``tweets`` (0 or 1). May be a plain list or
            an array; any array shape is flattened to one label per tweet.

    Returns:
        dict: Maps ``(word, label)`` to the number of occurrences of ``word``
        (after ``processing_tweets``) in tweets carrying that label.
    """
    # np.ravel always yields a 1-D sequence. np.squeeze would turn a single
    # label into a bare scalar, which zip() cannot iterate over.
    yslist = np.ravel(ys).tolist()
    freqs = {}

    for y, tweet in zip(yslist, tweets):
        for word in processing_tweets(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

In [15]:
# Let us take a look at an example to demonstrate the use of this function.
# processing_tweets() reads the NLTK stopword list, so make sure the corpus is
# present before this first call; otherwise a fresh environment fails here.
nltk.download('stopwords', quiet=True)

tweets = ['I am happy', 'I am sad', 'I am tired', 'I am frustrated']
ys = [1, 0, 0, 0]
print(build_freqs(tweets, ys))

{('happi', 1): 1, ('sad', 0): 1, ('tire', 0): 1, ('frustrat', 0): 1}


### Preparing the dataset for our model

In [16]:
# Fetch the NLTK corpora this notebook reads: the labelled tweet samples and
# the English stopword list.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
# Load the raw text of every positive and every negative sample tweet.
positive_tweets, negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [18]:
# Splitting our data into train and test sets: the first 4000 tweets of each
# class are used for training, the remainder of each class for testing.

train_pos, test_pos = positive_tweets[:4000], positive_tweets[4000:]
train_neg, test_neg = negative_tweets[:4000], negative_tweets[4000:]

# Positive tweets come first in both combined lists.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [20]:
# Labels as (m, 1) column vectors: ones for the positive tweets followed by
# zeros for the negative tweets, matching the order of train_x / test_x.
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [21]:
# (word, label) -> count table, built from the training tweets only.
freqs = build_freqs(train_x, train_y)

In [23]:
# Confirm the frequency table is a plain dictionary.
type(freqs)

dict

In [29]:
# Show one training tweet (index 21) before and after preprocessing, to see
# what processing_tweets() removes and how the remaining words are stemmed.

print('The original tweet is as follows:', train_x[21])
print('This is the tweet after we have applied preprocessing to it:', processing_tweets(train_x[21]))

The original tweet is as follows: @rossbreadmore I've heard the Four Seasons is pretty dope. Penthouse, obvs #Gobigorgohome
Have fun y'all :)
This is the tweet after we have applied preprocessing to it: ["i'v", 'heard', 'four', 'season', 'pretti', 'dope', 'penthous', 'obv', 'gobigorgohom', 'fun', "y'all", ':)']


### Building the Logistic Regression Model

Let us first create a Sigmoid function

In [30]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise.

    Args:
        z: A scalar or array-like of numbers.

    Returns:
        The logistic value(s) of ``z``, as a NumPy scalar or array.
    """
    exp_of_negated = np.exp(np.negative(z))
    return 1 / (1 + exp_of_negated)

Now we will create the cost function along with the gradient descent

In [31]:
def gradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: Feature matrix with one row per example (``x.shape[0]`` examples).
        y: Labels for the rows of ``x``; this notebook passes an (m, 1) column.
        theta: Initial weight vector; this notebook passes a (3, 1) column.
        alpha: Learning rate.
        num_iters: Number of update steps to perform.

    Returns:
        tuple: ``(cost, theta)``. ``cost`` is a Python float holding the
        cross-entropy cost evaluated just before the last update (or at the
        initial ``theta`` when ``num_iters`` is 0). ``theta`` holds the
        weights after all updates.
    """
    m = x.shape[0]

    def _cross_entropy(h):
        # Mean negative log-likelihood of the labels under predictions h.
        return -1. / m * (np.dot(y.transpose(), np.log(h)) + np.dot((1 - y).transpose(), np.log(1 - h)))

    cost = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        cost = _cross_entropy(h)
        theta = theta - (alpha / m) * np.dot(x.transpose(), (h - y))

    if cost is None:
        # No update step ran (num_iters == 0): report the cost of the initial
        # weights instead of failing on an unbound variable.
        cost = _cross_entropy(sigmoid(np.dot(x, theta)))

    # Squeeze to 0-d first: calling float() directly on a (1, 1) array is
    # deprecated in recent NumPy releases.
    return float(np.squeeze(cost)), theta

Now we shall build a feature extractor function

In [32]:
def extract_features(tweet, freqs):
    """Turn one tweet into a (1, 3) feature row.

    The row is ``[bias, positive_total, negative_total]``: a constant 1,
    then the summed ``freqs`` counts of the tweet's processed words under
    label 1.0, then the summed counts under label 0.0. A word that occurs
    several times in the tweet contributes once per occurrence.

    Args:
        tweet: Raw tweet text.
        freqs: Dict mapping ``(word, label)`` to a count, as built by
            ``build_freqs``.

    Returns:
        numpy.ndarray: Array of shape (1, 3).
    """
    tokens = processing_tweets(tweet)
    positive_total = sum(freqs.get((token, 1.0), 0) for token in tokens)
    negative_total = sum(freqs.get((token, 0.0), 0) for token in tokens)

    features = np.zeros((1, 3))
    features[0] = (1, positive_total, negative_total)

    assert features.shape == (1, 3)
    return features

### Training and evaluating our Logistic Regression model

In [33]:
# Design matrix: one (bias, positive count, negative count) row per training tweet.
X = np.zeros((len(train_x), 3))
for i, tweet in enumerate(train_x):
    X[i, :] = extract_features(tweet, freqs)

Y = train_y

# Start from all-zero weights; learning rate 1e-9, 1500 iterations.
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

We will now create a function for predicting the sentiment

In [34]:
def predict_tweet(tweet, freqs, theta):
    """Score a tweet with the logistic-regression model.

    Args:
        tweet: Raw tweet text.
        freqs: ``(word, label) -> count`` dictionary used for the features.
        theta: Weight vector applied to the feature row.

    Returns:
        The sigmoid of ``features . theta``; values above 0.5 are treated as
        positive by the callers in this notebook.
    """
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))

In [35]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Measure classification accuracy on a labelled set of tweets.

    Each tweet is predicted as 1 when its score exceeds 0.5 and as 0
    otherwise; the predictions are compared with the squeezed ``test_y``.

    Args:
        test_x: List of raw tweet strings.
        test_y: Labels aligned with ``test_x``.
        freqs: ``(word, label) -> count`` dictionary used for the features.
        theta: Trained weight vector.

    Returns:
        The fraction of tweets whose predicted label equals the true label.
    """
    predicted_labels = [
        1 if predict_tweet(tweet, freqs, theta) > 0.5 else 0
        for tweet in test_x
    ]
    true_labels = np.squeeze(test_y)
    n_correct = (predicted_labels == true_labels).sum()
    return n_correct / len(test_x)

In [36]:
# Accuracy of the trained weights on the held-out test tweets.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


Let us now predict the outcome with our own input given as a tweet

In [37]:
def pre(sentence):
    """Label the sentiment of a sentence using the trained model.

    Relies on the notebook-level ``freqs`` dictionary and ``theta`` weights.

    Args:
        sentence: Raw text to classify.

    Returns:
        str: ``'Positive sentiment'`` when the predicted probability is above
        0.5, ``'Neutral sentiment'`` when it is exactly 0.5, and
        ``'Negative sentiment'`` otherwise.
    """
    # predict_tweet returns a one-element array; reduce it to a plain float.
    yhat = float(np.squeeze(predict_tweet(sentence, freqs, theta)))
    if yhat > 0.5:
        return 'Positive sentiment'
    # 0.5 is the decision boundary, i.e. the model favours neither class.
    # A probability of 0 would mean the most negative prediction possible,
    # not a neutral one.
    if yhat == 0.5:
        return 'Neutral sentiment'
    return 'Negative sentiment'

In [41]:
# Try the classifier on a hand-written sentence.
my_tweet = 'I am having a great day today'

res = pre(my_tweet)
print(res)

Positive sentiment
