<a href="https://colab.research.google.com/github/MuneneMutuma/Sentiment-Analysis/blob/main/Sentiment_Analysis_Using_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Using NLTK

## Setup: Installation and Imports

### Install nltk

In [None]:
!pip install nltk



### Import nltk and other libraries

#### Import and Download nltk builtin modules



In [None]:
import nltk
# Corpora and models used later in this notebook. The last two ids are the
# resource names that recent NLTK releases look up for `word_tokenize` and
# `pos_tag`; the install above is unpinned, so fetch them as well to keep a
# fresh run working whichever NLTK version ends up installed.
NLTK_RESOURCES = (
    'twitter_samples',
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'stopwords',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#### Import other nltk libraries and helper libraries



In [None]:
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import word_tokenize

from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

import re, string, random

## Getting Data From Twitter

Here we load positive and negative tweets, which are already classified. We also load a set of sample tweets for testing.

In [None]:
# Raw tweet texts from the bundled corpus: the two pre-classified files plus
# one additional sample file.
positive_tweets, negative_tweets, text = (
    twitter_samples.strings(fileid)
    for fileid in (
        'positive_tweets.json',
        'negative_tweets.json',
        'tweets.20150430-223406.json',
    )
)

## Normalizing Data and Lemmatization

Normalization is the process of converting data to a canonical form. For example, "read", "reading" and "reads" all come from the root "read", so it is useful to normalize them to that form.

In [None]:
def lemmatize_sentence(tokens):
    """Return the WordNet lemma of every token in ``tokens``.

    Each token is POS-tagged first. Tags starting with ``NN`` are lemmatized
    as nouns, tags starting with ``VB`` as verbs, and everything else as
    adjectives.

    Args:
        tokens: Sequence of word tokens making up one sentence.

    Returns:
        A list with one lemmatized token per input token, in order.
    """
    def wordnet_pos(tag):
        # Map a tagger tag onto the part-of-speech code passed to the lemmatizer.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

## Remove Noise From Data

Noise is any part of the text that does not add meaning to it, such as commas and semicolons.

Stop words are also removed. These are words such as "is", "the" and "a" that are very common in a language; they would carry disproportionate weight in prediction while adding no useful signal.

In [None]:
# Compiled once at definition time. Raw strings avoid the invalid "\(" / "\)"
# string escapes, and splitting the pattern into adjacent literals (instead of
# a backslash line-continuation inside the string) keeps source indentation
# out of the expression.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&=#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_PATTERN = re.compile(r'(@[A-Za-z0-9_]+)')


def remove_noise(tweet_tokens, stop_words = ()):
    """Normalize one tokenized tweet and drop tokens that carry no signal.

    For every token: strip URLs and @mentions, lemmatize it according to its
    POS tag (``NN*`` -> noun, ``VB*`` -> verb, anything else -> adjective),
    then keep its lower-cased form unless it is empty, punctuation, or a stop
    word.

    Args:
        tweet_tokens: Sequence of string tokens for a single tweet.
        stop_words: Iterable of lower-case words to discard. Defaults to
            none.

    Returns:
        List of cleaned, lower-cased tokens in their original order.
    """
    # Set membership is constant-time; callers may pass a plain list.
    stop_words = frozenset(stop_words)
    # One lemmatizer per call is enough; it does not depend on the token.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = _URL_PATTERN.sub('', token)
        token = _MENTION_PATTERN.sub('', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith("VB"):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test against a str, so a
        # multi-character token is dropped only if it appears verbatim in it.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [None]:
# English stop words, handed to remove_noise so they are filtered out.
stop_words = stopwords.words('english')

# Pre-tokenized versions of the two pre-classified tweet files.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Clean every tweet: one list of normalized tokens per tweet.
positive_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in negative_tweet_tokens
]

## Determine Word Density

In [None]:
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        Each token of each inner iterable, flattened.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

# Materialize the tokens: a bare generator is exhausted by its first consumer,
# so re-running a later cell on it would silently see no words.
all_pos_words = list(get_all_words(positive_cleaned_tokens_list))

In [None]:
# Frequency distribution over the cleaned tokens of the positive tweets.
freq_dist_pos = FreqDist(all_pos_words)

## Preparing Data for Model

### Convert Tokens to Dictionary

In [None]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily turn each token list into a ``{token: True}`` feature dict.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        One dict per tweet mapping every distinct token to ``True``.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield {token: True for token in tweet_tokens}

# Materialized as lists so the cell that consumes them can be re-run; a
# generator would be empty on the second pass. (The "poistive" spelling is
# kept because the name is referenced verbatim further down.)
poistive_tokens_for_model = list(get_tweets_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_tweets_for_model(negative_cleaned_tokens_list))

### Splitting Dataset for Training and Testing Model

In [None]:
# Tunables for the split. A private Random instance is seeded so the shuffle,
# and with it the train/test split, is reproducible across runs without
# touching the global `random` state.
SHUFFLE_SEED = 42
TRAIN_SIZE = 7000

# Attach the class label to every feature dict.
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in poistive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# Shuffle so both classes are mixed before slicing into train and test parts.
random.Random(SHUFFLE_SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

### Train Classifier

In [None]:
# Train NLTK's Naive Bayes classifier on the (feature dict, label) pairs.
classifier = NaiveBayesClassifier.train(train_data)

In [None]:
# Accuracy on the held-out part of the dataset.
print("Accuracy is:", classify.accuracy(classifier, test_data))
# show_most_informative_features prints its table itself and returns None, so
# wrapping it in print() only adds a stray "None" line to the output.
classifier.show_most_informative_features(10)

Accuracy is: 0.9946666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2041.9 : 1.0
                      :) = True           Positi : Negati =   1001.0 : 1.0
                follower = True           Positi : Negati =     37.1 : 1.0
                  arrive = True           Positi : Negati =     34.4 : 1.0
                     sad = True           Negati : Positi =     24.8 : 1.0
                     bam = True           Positi : Negati =     22.1 : 1.0
                     x15 = True           Negati : Positi =     16.0 : 1.0
                      aw = True           Negati : Positi =     14.7 : 1.0
               goodnight = True           Positi : Negati =     14.0 : 1.0
                followed = True           Negati : Positi =     13.9 : 1.0
None


## Testing

In [None]:
def test_sentiment(custom_tweet):
    """Print the label the trained classifier assigns to ``custom_tweet``.

    The text is tokenized and then cleaned with the same stop-word list that
    was used when cleaning the training tweets, so features are built the
    same way at training and prediction time.

    Args:
        custom_tweet: Raw text to classify.
    """
    custom_tokens = remove_noise(word_tokenize(custom_tweet), stop_words)
    features = {token: True for token in custom_tokens}
    print(classifier.classify(features))

In [None]:
# Plainly positive wording.
test_sentiment("I am very happy today!")

Positive


In [None]:
# Idiomatic phrase with no explicit sentiment word.
test_sentiment("Broke a sweat there")

Negative


In [None]:
# Plainly negative wording.
test_sentiment("That was awfully bad!")

Negative


In [None]:
# Longer sentence mixing uncertainty ("not sure") with hope.
test_sentiment("I am not sure about the rain today, but I hope it doesn't rain")

Positive


In [None]:
# Short sentence whose only sentiment cue is the adjective "bad".
test_sentiment("I had a bad score")

Negative


In [None]:
# Slang praise ("aced it").
test_sentiment("That car aced it")

Positive
