<a href="https://colab.research.google.com/github/LogeswaranSR/Sentiment-Analysis-of-Tweets/blob/main/Sentiment_Analysis_Logistic_Regression_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Packages

In [1]:
import nltk

nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [9]:
import numpy as np
import pandas as pd
import re
import string

from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Initialization

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

tweets = positive_tweets + negative_tweets
labels = np.append(np.ones((len(positive_tweets))), np.zeros((len(negative_tweets))))

In [4]:
tokenizer = TweetTokenizer(preserve_case=False,
                           strip_handles=True,
                           reduce_len=True)
stemmer = PorterStemmer()
stopwords_english = stopwords.words('english')
punctuations = string.punctuation

## Defining Functions

In [5]:
def process_tweet(tweets):
  processed_tweets = []

  for tweet in tweets:
    ## Removing all tweet symbols, hyperlinks, and hash symbols
    tweet = re.sub(r"^RT[\s]+", '', tweet)
    tweet = re.sub(r'https?://[^\s\n\r]+','', tweet)
    tweet = re.sub(r"#", '', tweet)

    ## Removing Stopwords and punctuations and stemming each token
    tweet_tokens = []
    for token in tokenizer.tokenize(tweet):
      if token not in stopwords_english and token not in punctuations:
        stem_word = stemmer.stem(token)
        tweet_tokens.append(stem_word)
    processed_tweets.append(tweet_tokens)

  return processed_tweets

In [6]:
def build_freqs(processed_tweets, labels):
  labels_list = np.squeeze(labels).tolist()
  freqs = {}

  for y, tokens in zip(labels_list, processed_tweets):
    for t in tokens:
      pair = (t, y)
      if pair not in freqs:
        freqs[pair]=0
      freqs[pair]+=1

  return freqs

In [7]:
def extract_features(tweets, freqs):
  x = np.zeros((len(tweets), 3))
  i=0
  for tweet in tweets:
    x[i, 0] = 1
    for t in tweet:
      x[i, 1]+=freqs[(t, 1.0)] if (t, 1.0) in freqs else 0
      x[i, 2]+=freqs[(t, 0.0)] if (t, 0.0) in freqs else 0
    i+=1
  return x

## Data Preprocessing

In [8]:
processed_tweets = process_tweet(tweets)
freqs = build_freqs(processed_tweets, labels)
X = extract_features(processed_tweets, freqs)

array([1.000e+00, 3.887e+03, 7.200e+01])

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, labels,
                                                    test_size=0.2,
                                                    random_state=145,
                                                    shuffle=True)

In [11]:
print("Training Data Size:", X_train.shape[0])
print("Test Data Size:", X_test.shape[0])

Training Data Size: 8000
Test Data Size: 2000


## Logistic Regression

In [12]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [21]:
score = model.score(X_test, Y_test)
print("Accuracy: %.2f"%(score*100), '%')

Accuracy: 99.50 %


In [23]:
yhat = model.predict(X_test)
yhat

array([1., 0., 1., ..., 1., 1., 1.])

## Inference

In [24]:
def predict(tweet, model):
  processed_tweet = process_tweet([tweet])
  x = extract_features(processed_tweet, freqs)
  pred = model.predict(x)
  if(pred[0]):
    print("Positive")
  else:
    print("Negative")

In [26]:
predict("I am sad right now", model)

Negative


## Saving the Model

In [27]:
import pickle

with open('LogisticRegressionModel.pkl', 'wb') as file:
  pickle.dump(model, file)