##  PART 1
### POS Tagger

In [92]:
import nltk
from nltk.corpus import treebank

nltk.download('treebank')
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [64]:
sentences = treebank.tagged_sents()

In [65]:
# I have defined funtion to create a POS Tagger below
#input = sentences
def pos_tagger_without_viterbi(sentences):

    # creating a null dict fo Transition and Emission Probabilities
    transition_prob = {}
    emission_prob = {}

    count_tag = {}

    # Initialy count the occurrences of tag transitions and  tags  in each sentence
    for sentence in sentences:
        #Initail start tag is '<s>'
        prev_tag = '<s>'

        for word, current_tag in sentence:
            if prev_tag not in transition_prob:
                transition_prob[prev_tag] = {}
            else:
            # Using the .get function I retrive value associated with the current tag
                transition_prob[prev_tag][current_tag] = transition_prob[prev_tag].get(current_tag, 0) + 1

            # Counting the number of emissions
            if current_tag not in emission_prob:
                emission_prob[current_tag] = {}
            else:
                emission_prob[current_tag][word] = emission_prob[current_tag].get(word, 0) + 1

            # Counting the number of current tag occurrences
            count_tag[current_tag] = count_tag.get(current_tag, 0) + 1

            prev_tag = current_tag

            #This updates the current tag with f how many times each specific POS tag occurs in a given text.
    print('Count of transition tags:')
    print(transition_prob)
    print('Count of emission tags:')
    print(emission_prob)
    print('The total number of words for each tag:')
    print(count_tag)

    # Once the  I find the counts of each of these the next task is to find the associated probablities
    for prev_tag in transition_prob:
        total_transitions_of_prev_tag = sum(transition_prob[prev_tag].values())
        for current_tag in transition_prob[prev_tag]:
            transition_prob[prev_tag][current_tag] /= total_transitions_of_prev_tag

    for current_tag in emission_prob:
        total_emissions_of_current_tag = sum(emission_prob[current_tag].values())
        for word in emission_prob[current_tag]:
            emission_prob[current_tag][word] /= total_emissions_of_current_tag
    print('Probalitiy values of transition tags:')
    print(transition_prob)
    print('Probalitiy values of emission tags:')
    print(emission_prob)

    return transition_prob, emission_prob, count_tag

In [66]:

transition_probabilities, emission_probabilities, tag_counts =  pos_tagger_without_viterbi(sentences)

Count of transition tags:
{'<s>': {'NNP': 773, 'DT': 905, 'IN': 505, 'PRP': 245, 'EX': 17, '``': 296, 'CD': 33, 'RBR': 3, 'NNS': 183, 'NN': 174, 'JJ': 143, 'JJR': 12, 'RB': 175, 'WRB': 25, 'CC': 201, '-NONE-': 82, 'VBG': 17, 'WDT': 2, '-LRB-': 7, 'WP': 14, 'PRP$': 29, 'JJS': 6, 'NNPS': 10, 'VBZ': 9, 'TO': 5, 'VBN': 7, 'LS': 7, "''": 1, ':': 11, 'PDT': 3, 'UH': 1, 'MD': 1, '$': 5, 'VB': 3, 'RBS': 2, 'VBD': 1}, 'NNP': {',': 1441, 'CD': 190, 'NNP': 3596, 'VBZ': 345, 'VBG': 7, 'NN': 521, 'WDT': 5, 'NNS': 213, 'IN': 410, 'CC': 362, 'POS': 472, '.': 475, 'VBD': 609, 'MD': 96, 'TO': 39, 'VBP': 36, ':': 67, 'RB': 72, 'JJ': 83, "''": 31, '-NONE-': 53, 'NNPS': 162, 'DT': 21, 'JJR': 1, 'VBN': 7, '-RRB-': 31, '-LRB-': 25, '$': 2, 'WP': 6, 'PRP': 5, 'VB': 8, '``': 7, 'RP': 1, 'SYM': 1, 'WRB': 2}, ',': {'MD': 54, 'DT': 660, 'CD': 114, 'VBD': 268, 'NNS': 129, 'NN': 237, 'VBZ': 140, 'IN': 384, '``': 65, "''": 265, 'VBG': 90, 'NNP': 669, 'PRP': 189, 'WDT': 174, 'VBN': 110, 'JJ': 211, 'RB': 275, 'TO': 1

### Definig the viterbi algorithm

In [67]:
def viterbi(sentence, transition_prob, emission_prob):
    tags = list(emission_prob.keys())
    n = len(sentence)

    # Initialize matrices for probabilities and backpointers
    dp = [[0.0] * n for _ in range(len(tags))]
    backpointer = [[-1] * n for _ in range(len(tags))]

    # Initialize starting probabilities
    for i, tag in enumerate(tags):
        dp[i][0] = transition_prob['<s>'].get(tag, 0) * emission_prob[tag].get(sentence[0], 0)
        backpointer[i][0] = -1  # No previous tag for the first word

    # Fill in the matrices
    for j in range(1, n):
        for i, tag in enumerate(tags):
            max_prob = 0
            max_prev_tag = tags[0]  # Set a default value as the first tag
            for k, prev_tag in enumerate(tags):
                prob = dp[k][j - 1] * transition_prob.get(prev_tag, {}).get(tag, 0) * emission_prob[tag].get(sentence[j], 0)
                if prob > max_prob:
                    max_prob = prob
                    max_prev_tag = prev_tag
            dp[i][j] = max_prob
            backpointer[i][j] = tags.index(max_prev_tag)

    # Backtrack to find the best tag sequence
    best_tags = [0] * n
    best_tags[-1] = dp.index(max(dp, key=lambda x: x[-1]))
    for j in range(n - 1, 0, -1):
        best_tags[j - 1] = backpointer[best_tags[j]][j]

    return [tags[i] for i in best_tags]

Testing the algo for some random sentence

In [68]:

test_sent = "This is a very interesting and useful course for me"
output_words = nltk.word_tokenize(test_sent)
output_tags = viterbi(output_words, transition_probabilities, emission_probabilities)
print(list(zip(output_words, output_tags)))


[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('very', 'RB'), ('interesting', 'JJ'), ('and', 'CC'), ('useful', 'JJ'), ('course', 'NN'), ('for', 'IN'), ('me', 'PRP')]


## PART 2
### Vannila Sentiment Analyzer

In [69]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [70]:
movie_reviews.categories()

['neg', 'pos']

Exploring the movie_reviw dataset

In [71]:
positive_label=movie_reviews.fileids(categories='pos')

In [72]:
movie_reviews.raw(fileids=positive_label[0])

'films adapted from comic books have had plenty of success , whether they\'re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there\'s never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \'80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \nthe book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \nin other words , don\'t dismiss this film because of its source . \nif you can get past the whole comic book thing , you might find another stumbling block in from hell\'s directors , albert and allen hughes . \ngetting the hughes brothers to direct this seem

Creating a document to store the structured data

In [73]:
documents = []

for label in movie_reviews.categories():

    fileids = movie_reviews.fileids(label)
    for fileid in fileids:
        words = list(movie_reviews.words(fileid))
        document = (words, label)
        documents.append(document)


Print the first few documents as an example
---



In [74]:
for i in range(3):
    words, label = documents[i]
    print(f"Label: {label}")
    print("Words:", ' '.join(words[:25]))
    print()

Label: neg
Words: plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the

Label: neg
Words: the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie

Label: neg
Words: it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch . based on the late



Splitting the data into training and validation

In [75]:
train_split, test_split = train_test_split(documents, test_size=0.5, random_state=12)
val_split, test_split = train_test_split(test_split, test_size=0.5, random_state=12)


Vectorization using TF-IDF

In [76]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform([' '.join(doc) for doc, _ in train_split])
X_val = vectorizer.transform([' '.join(doc) for doc, _ in val_split])
X_test = vectorizer.transform([' '.join(doc) for doc, _ in test_split])

In [77]:
y_train = [label for _, label in train_split]
y_val = [label for _, label in val_split]
y_test = [label for _, label in test_split]

Training a simple Naive bayes Classifier

In [78]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [79]:
predict_val = classifier.predict(X_val)

Evaluating Validation Accuracy

In [80]:
print("Validation Accuracy:", accuracy_score(y_val, predict_val))
print(classification_report(y_val, predict_val))


Validation Accuracy: 0.744
              precision    recall  f1-score   support

         neg       0.91      0.57      0.70       262
         pos       0.66      0.94      0.78       238

    accuracy                           0.74       500
   macro avg       0.79      0.75      0.74       500
weighted avg       0.79      0.74      0.74       500



We get a decest accuracy on Validation set

Evaluating Test set

In [81]:
test_predictions = classifier.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_predictions))
print(classification_report(y_test, test_predictions))

Test Accuracy: 0.732
              precision    recall  f1-score   support

         neg       0.89      0.55      0.68       258
         pos       0.66      0.93      0.77       242

    accuracy                           0.73       500
   macro avg       0.78      0.74      0.72       500
weighted avg       0.78      0.73      0.72       500



## PART 3
### Improved Sentiment Analyser

In [82]:
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import word_tokenize
import numpy as np

Creating a function to extract POS tags from the tagged data

In [83]:
def extract_pos(tagged_docs):
    pos_features = []
    for tagged_words in tagged_docs:
        pos_tags = ['NOUN'] * len(tagged_words)
        pos_features.append(' '.join(pos_tags))
    return pos_features



In [84]:
X_train_pos = extract_pos(train_split)
X_val_pos = extract_pos(val_split)
X_test_pos = extract_pos(test_split)


Similar to Part 2 we do vectorization of words

In [85]:

vectorizer = TfidfVectorizer()
X_train_words_tfidf = vectorizer.fit_transform([' '.join(doc) for doc, _ in train_split])
X_val_words_tfidf = vectorizer.transform([' '.join(doc) for doc, _ in val_split])
X_test_words_tfidf = vectorizer.transform([' '.join(doc) for doc, _ in test_split])

#### Code to combine TF-IDF word embeddings and POS tag features

In [86]:
X_train_combined = np.hstack((X_train_words_tfidf.toarray(), StandardScaler().fit_transform(vectorizer.transform(X_train_pos).toarray())))
X_val_combined = np.hstack((X_val_words_tfidf.toarray(), StandardScaler().fit_transform(vectorizer.transform(X_val_pos).toarray())))
X_test_combined = np.hstack((X_test_words_tfidf.toarray(), StandardScaler().fit_transform(vectorizer.transform(X_test_pos).toarray())))


Preparing the target variables

In [87]:
y_train = [category for _, category in train_split]
y_val = [category for _, category in val_split]
y_test = [category for _, category in test_split]


In [88]:
classifier = MultinomialNB()
classifier.fit(X_train_combined, y_train)


In [89]:

val_predictions = classifier.predict(X_val_combined)


Evaulating Validation Accuracy

In [90]:
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))
print(classification_report(y_val, val_predictions))


Validation Accuracy: 0.706
              precision    recall  f1-score   support

         neg       0.93      0.48      0.63       262
         pos       0.62      0.96      0.76       238

    accuracy                           0.71       500
   macro avg       0.78      0.72      0.69       500
weighted avg       0.78      0.71      0.69       500



Evaluating Test Accuracy

In [91]:
test_predictions = classifier.predict(X_test_combined)
print("Test Accuracy:", accuracy_score(y_test, test_predictions))
print(classification_report(y_test, test_predictions))


Test Accuracy: 0.702
              precision    recall  f1-score   support

         neg       0.93      0.46      0.61       258
         pos       0.62      0.96      0.76       242

    accuracy                           0.70       500
   macro avg       0.78      0.71      0.69       500
weighted avg       0.78      0.70      0.68       500

