In [12]:
import nltk
import re
from nltk.probability import FreqDist, LidstoneProbDist, ConditionalFreqDist, ConditionalProbDist
from nltk import ngrams
from collections import Counter
import copy
import pandas as pd
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends
# from nltk.lm import MLEProbDist, LaplaceProbDist

In [13]:
#Importing Text File
with open('Text Corpus.txt', 'r') as f:
    text = f.read()
    
print(text)

FileNotFoundError: [Errno 2] No such file or directory: 'Text Corpus.txt'

In [3]:
#Preprocessing Function
def preprocess_text(text):
    """
    Lowercases all words and removes non-alphanumeric characters.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = nltk.word_tokenize(text)
    return words

# Q1

# 1.1

In [4]:
#1.1 Unsmoothed Unigram Language Model
def train_unigram(text):
    """
    Unigram Training, Will riturn Dictionary with word freqs
    """
    words = preprocess_text(text)
    frequencies = nltk.FreqDist(words)
    return frequencies

def predict_unigram(text, model):
    """
    Using the trained unigram (Dictionary) will predict the probability
    WIll return float
    """
    words = preprocess_text(text)
    probability = 1.0
    for word in words:
        if word in model:
            probability *= model.freq(word)
        else:
            probability *= 0.0
    return probability

# Output

model = train_unigram(text)

words = text.split(" ")
words.remove("<s>")
words.remove("</s>")
words.remove("</s>\n<s>")
words.remove("</s>\n<s>")

uniqueWords = set(words)

print("Unsmoothed Unigram Language Model Probability: ")
for word in uniqueWords:
    probability = predict_unigram(word, model)
    print('P('+word+'):', probability)

Unsmoothed Unigram Language Model Probability: 
P(read): 0.14285714285714285
P(He): 0.09523809523809523
P(I): 0.047619047619047616
P(my): 0.047619047619047616
P(different): 0.047619047619047616
P(book): 0.14285714285714285
P(a): 0.14285714285714285
P(Danielle): 0.047619047619047616


# 1.2

In [24]:
#1.2 Smoothed Unigram Language Model
def train_unigram(text, gamma=0.1):
    """
    Unigram Training, Will riturn Dictionary with word freqs
    """
    words = preprocess_text(text)
    n = len(words)
    frequencies = FreqDist(words)
    vocabulary_size = len(frequencies)
    model = {}
    for word, freq in frequencies.items():
        model[word] = LidstoneProbDist(frequencies, gamma, vocabulary_size)
    return model

def predict_unigram(text, model):
    """
    Using the trained unigram (Dictionary) will predict the probability
    WIll return float
    """
    words = preprocess_text(text)
    probability = 1.0
    for word in words:
        if word in model:
            probability *= model[word].prob(word)
        else:
            probability *= model[word].prob('<UNK>')
    return probability


# Output
model = train_unigram(text, gamma=0.1)

words = text.split(" ")
words.remove("<s>")
words.remove("</s>")
words.remove("</s>\n<s>")
words.remove("</s>\n<s>")

uniqueWords = set(words)

print("Unsmoothed Unigram Language Model Probability: ")
for word in uniqueWords:
    probability = predict_unigram(word, model)
    print('P('+word+'):', probability)


Unsmoothed Unigram Language Model Probability: 
P(a): 0.14155251141552513
P(book): 0.14155251141552513
P(I): 0.05022831050228311
P(different): 0.05022831050228311
P(my): 0.05022831050228311
P(read): 0.14155251141552513
P(Danielle): 0.05022831050228311
P(He): 0.09589041095890412


# Q2

# 2.1

In [30]:
#2.1 Unsmoothed Bigram Language Model
#PreProcessing
text = text.lower()
text = re.sub(r'[^a-z0-9\s]', '', text)
sentences = nltk.word_tokenize(text)

# Generate bigrams
bigrams = list(ngrams(sentences, 2))

# Count frequency of bigram
frequencies = Counter(bigrams)

# Calculate probabilities
probabilities = {}
for bigram, frequency in frequencies.items():
    first_word = bigram[0]
    probability = frequency / sentences.count(first_word)
    probabilities[bigram] = probability

# Print the bigram probabilities
df = pd.DataFrame.from_dict(probabilities, orient='index', columns=['Probability'])

# Add separate columns for the first and second words of the bigram
df[['Word 1', 'Word 2']] = pd.DataFrame(df.index.tolist(), index=df.index)

# Print the resulting DataFrame
print(df)

                   Probability     Word 1     Word 2
(s, he)               0.333333          s         he
(he, read)            1.000000         he       read
(read, a)             1.000000       read          a
(a, book)             0.666667          a       book
(book, s)             0.666667       book          s
(s, s)                0.333333          s          s
(s, i)                0.166667          s          i
(i, read)             1.000000          i       read
(a, different)        0.333333          a  different
(different, book)     1.000000  different       book
(book, my)            0.333333       book         my
(my, danielle)        1.000000         my   danielle
(danielle, s)         1.000000   danielle          s


# 2.2

In [6]:
#Q2.2 Smoothed Bigram Languange Model
#PreProcessing
import nltk
from nltk.util import ngrams
from nltk.probability import LidstoneProbDist, FreqDist

sentences = [
    ['s', 'he', 'read', 'a', 'book', 's'],
    ['i', 'read', 'a', 'different', 'book', 's'],
    ['s', 'he', 'read', 'a', 'book', 'my', 'danielle', 's']
]

# Collect all bigrams from the sentences
all_bigrams = []
for sent in sentences:
    bigrams = ngrams(sent, 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
    all_bigrams += list(bigrams)

# Calculate the frequency distribution of bigrams
bigram_freq = FreqDist(all_bigrams)

# Calculate the vocabulary size
vocab_size = len(set([word for sent in sentences for word in sent]))

# Define the smoothing parameter
gamma = 0.5

# Define the smoothed probability distribution
smoothed_prob = LidstoneProbDist(bigram_freq, gamma, vocab_size)

# Calculate the probabilities of each bigram
bigram_prob = {}
for bigram in bigram_freq:
    prob = smoothed_prob.prob(bigram[1], bigram[0])
    bigram_prob[bigram] = prob

# Print the probabilities of each bigram
for bigram, prob in bigram_prob.items():
    print(f'{bigram}: {prob}')



ValueError: 
The number of bins in a Lidstone distribution (9) must be greater than or equal to
the number of bins in the FreqDist used to create it (14).

# Q3

In [14]:
#Q3 Sentence Probability

def train_unigram(text, gamma=0.1):
    """
    Trains an unsmoothed unigram language model on the input text.
    gamma is the smoothing parameter and defaults to 0.1.
    Returns a dictionary with unigram probabilities.
    """
    words = preprocess_text(text)
    frequencies = FreqDist(words)
    total = len(words)
    vocab_size = len(set(words))
    model = {}
    for word in frequencies:
        model[word] = frequencies[word]/total
    return model

def train_bigram(text, gamma=0.1):
    """
    Trains a smoothed bigram language model on the input text.
    gamma is the smoothing parameter and defaults to 0.1.
    Returns a dictionary with bigram probabilities.
    """
    words = preprocess_text(text)
    bigrams = list(ngrams(words, 2))
    frequencies = FreqDist(bigrams)
    total = len(bigrams)
    vocab_size = len(set(words))
    model = {}
    for bigram in frequencies:
        model[bigram] = LidstoneProbDist(frequencies, gamma).prob(bigram) 
    return model

def sentence_probability_unigram(sentence, model):
    """
    Calculates the probability of the input sentence using the trained unigram model.
    Returns the probability as a float.
    """
    words = preprocess_text(sentence)
    probability = 1.0
    for word in words:
        if word in model:
            probability *= model[word]
        else:
            probability *= 0.0
    return probability

def sentence_probability_bigram(sentence, model):
    """
    Calculates the probability of the input sentence using the trained bigram model.
    Returns the probability as a float.
    """
    words = preprocess_text(sentence)
    bigrams = list(ngrams(words, 2))
    probability = 1.0
    for bigram in bigrams:
        if bigram in model:
            probability *= model[bigram]
        else:
            probability *= 0.0
    return probability

# Output

unigram_model = train_unigram(text)
bigram_model = train_bigram(text)

sentences = text.split("\n")

uniqueWords = set(words)

print("Unigram and Bigram Language Model Sentence Probability: ")
for sentence in sentences:
    unigram_probability = sentence_probability_unigram(sentence, unigram_model)
    bigram_probability = sentence_probability_bigram(sentence, bigram_model)
    print(f"Sentence: {sentence}\nUnigram probability: {unigram_probability}\nBigram probability: {bigram_probability}\n")

Unigram and Bigram Language Model Sentence Probability: 
Sentence: s he read a book s
Unigram probability: 2.266629267283756e-05
Bigram probability: 1.3751208137469954e-05

Sentence: s i read a different book s
Unigram probability: 5.396736350675608e-07
Bigram probability: 1.0206435810159401e-07

Sentence: s he read a book my danielle s
Unigram probability: 5.139748905405342e-08
Bigram probability: 1.9210577005037543e-08



In [18]:
#PreProcessing
text = text.lower()
text = re.sub(r'[^a-z0-9\s]', '', text)
sentences = nltk.word_tokenize(text)

# Generate bigrams from the input sentence
bigrams = list(ngrams(sentences, 2))

# Count the frequency of each bigram
frequencies = Counter(bigrams)

# Calculate the bigram probabilities
probabilities = {}
for bigram, frequency in frequencies.items():
    first_word = bigram[0]
    probability = frequency / sentences.count(first_word)
    probabilities[bigram] = probability

# Print the bigram probabilities
df = pd.DataFrame.from_dict(probabilities, orient='index', columns=['Probability'])

# Add separate columns for the first and second words of the bigram
df[['Word 1', 'Word 2']] = pd.DataFrame(df.index.tolist(), index=df.index)

# Print the resulting DataFrame
print(df)

                   Probability     Word 1     Word 2
(s, he)               0.333333          s         he
(he, read)            1.000000         he       read
(read, a)             1.000000       read          a
(a, book)             0.666667          a       book
(book, s)             0.666667       book          s
(s, s)                0.333333          s          s
(s, i)                0.166667          s          i
(i, read)             1.000000          i       read
(a, different)        0.333333          a  different
(different, book)     1.000000  different       book
(book, my)            0.333333       book         my
(my, danielle)        1.000000         my   danielle
(danielle, s)         1.000000   danielle          s


# Q4

# 4.1

In [1]:
# Import library
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Owen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
#read data set
path = ("Musical_Instruments_Reviews.csv")
data = pd.read_csv(path)
df=pd.DataFrame(data)

In [3]:
#Labeling prediced sentiments
for index, row in df.iterrows():
  sent=sia.polarity_scores(row['Reviews'])
  if (sent['compound'] == 0):
      sentiment="Neutral"
  else:
      if (sent['compound'] > -0):
            sentiment="Positive"
      else:
            sentiment="Negative"
  df.at[index,'Sentiment'] = sentiment

In [4]:
#export predicted resulting data set
df.to_csv("Musical_Instruments_Reviews_Result.csv",index=False)

In [5]:
print(sia.polarity_scores(row['Reviews']))

{'neg': 0.085, 'neu': 0.727, 'pos': 0.188, 'compound': 0.905}


# 4.2

In [6]:
#import library
import nltk
nltk.download('punkt')
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
import collections
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#Read data set from Q4.1
path = ("Musical_Instruments_Reviews_Result.csv")
newdata = pd.read_csv(path)
df=pd.DataFrame(newdata)

In [8]:
#Amount of reviews

#Number of Positive
print(len(df[df['Sentiment']=='Positive']))

#Number of Negative
print(len(df[df['Sentiment']=='Negative']))

#Number of Neutral
print(len(df[df['Sentiment']=='Neutral']))

9070
996
188


In [9]:
#Data pre processing

def word_feats(words):
    return dict([(word, True) for word in words])

#create word tokenizer features for Positive, negative, and neutral reviews
neg_data = df[df['Sentiment'] == 'Negative']
pos_data = df[df['Sentiment'] == 'Positive']
neut_data=df[df['Sentiment'] == 'Neutral']
pos_feats = [(word_feats(word_tokenize(review)), 'pos') for review in pos_data['Reviews']]
neg_feats = [(word_feats(word_tokenize(review)), 'neg') for review in neg_data['Reviews']]
neut_feats = [(word_feats(word_tokenize(review)), 'neut') for review in neut_data['Reviews']]

#Creating training and test data set
negcutoff = int(len(neg_feats) * 3 / 4)
poscutoff = int(len(pos_feats) * 3 / 4)
neutcutoff = int(len(neut_feats) * 3 / 4)
trainfeats = neg_feats[:negcutoff] + pos_feats[:poscutoff]+neut_feats[:neutcutoff]
testfeats = neg_feats[negcutoff:] + pos_feats[poscutoff:]+neut_feats[neutcutoff:]

In [10]:
#Creating NaiveBayes Model
classifier = NaiveBayesClassifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

In [11]:
#Performance Measures

#Accuracy
print("\nModel Accuracy: ",nltk.classify.accuracy(classifier, testfeats))

#Precision, Recall, F1
print ("\nPositive Precision: ",nltk.scores.precision(refsets['pos'], testsets['pos']))
print ('Positive recall:', nltk.scores.recall(refsets['pos'], testsets['pos']))
print ('Positive F1 Score:', nltk.scores.f_measure(refsets['pos'], testsets['pos']))


Model Accuracy:  0.8794851794071763

Positive Precision:  0.8933012434817489
Positive recall: 0.9819223985890653
Positive F1 Score: 0.9355177483721908
