In [16]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import glob, os, operator, re
import string
from itertools import islice
from collections import Counter
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\awant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
stop_words = set(stopwords.words('english'))
print(len(stop_words))

179


In [18]:
neg_text = []
pos_text = []

directory_list = ['neg', 'pos']
for folder_name in directory_list:
    read_files = glob.glob('./movie_review_data/'+folder_name+'/*.txt')
    for f in read_files:
        with open(f, "r+", encoding="utf-8") as infile:
            text = infile.read()
            infile.close()
            
            if folder_name == 'neg':
                neg_text.append(text)
            elif folder_name == 'pos':
                pos_text.append(text)

In [19]:
def preprocess_text(text):
    text = re.compile(r'<[^>]+>').sub(' ', str(text))
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().split()
    return text

def find_unigrams(text):
    return Counter(text)

def find_bigrams(text):
    return Counter(zip(text, islice(text, 1, None)))

def process_ngrams(ngrams, mode):
    keys = ngrams.keys()
    pngrams = ngrams.copy()
    
    for key in list(keys):
        for stopword in stop_words:
            if mode == 'uni':
                if stopword == key or ngrams[key] < 100:
                    del pngrams[key]
            elif mode == 'bi':
                 if stopword in list(key) or ngrams[key] < 50:
                    del pngrams[key]
    return pngrams

In [20]:
negative = preprocess_text(neg_text)
positive = preprocess_text(pos_text)

neg_uni = find_unigrams(negative)
neg_bi = find_bigrams(negative)

pos_uni = find_unigrams(positive)
pos_bi = find_bigrams(positive)

In [21]:
pneg_uni = process_ngrams(neg_uni,'uni')
pneg_bi = process_ngrams(neg_bi,'bi')
ppos_uni = process_ngrams(pos_uni,'uni')
ppos_bi = process_ngrams(pos_bi,'bi')

In [24]:
uni_vocab = list(set(pneg_uni + ppos_uni))
bi_vocab = list(set(pneg_bi + ppos_bi))

In [41]:
def find_pmi(neg, pos, vocab):
    neg_pmi = {}
    pos_pmi = {}
    
    n = sum(neg.values()) #total tokens in negative text
    p = sum(pos.values()) #total tokens in positive text
    total = n+p
    
    P_neg = n/total #probability of negative class #not used for now
    P_pos = p/total #probability of positive class #not used for now
    
    for word in vocab:
        neg_count = pos_count = 0
        if word in neg.keys():
            neg_count = neg[word]
        if word in pos.keys():
            pos_count = pos[word]

        P_neg_word = neg_count/total #P(word,neg)
        P_pos_word = pos_count/total #P(word,pos)

        P_word = (neg_count+pos_count)/total
        try:
            neg_pmi[word] = math.log(P_neg_word/(P_word * P_neg))
        except:
            neg_pmi[word] = 0
        try:
            pos_pmi[word] = math.log(P_pos_word/(P_word * P_pos))
        except: 
            pos_pmi[word] = 0

    return neg_pmi, pos_pmi

In [42]:
neg_uni_pmi, pos_uni_pmi = find_pmi(neg_uni, pos_uni, uni_vocab)

print("Negative unigram PMI")
print(dict(sorted(neg_uni_pmi.items(), key=lambda x: x[1], reverse=True)[:20]))
print("\nPositive unigram PMI")
print(dict(sorted(pos_uni_pmi.items(), key=lambda x: x[1], reverse=True)[:20]))

Negative unigram PMI
{'boll': 0.6990501447864198, 'seagal': 0.6845875647331474, 'mstk': 0.6765879727521168, 'unwatchable': 0.6687934760151752, 'incoherent': 0.6540508408730659, 'unfunny': 0.6416110301570299, 'waste': 0.6384418026770593, 'blah': 0.6323602703848789, 'horrid': 0.6246552240104736, 'pointless': 0.6229866870450217, 'drivel': 0.6213711488092838, 'atrocious': 0.61940586812433, 'redeeming': 0.6168002787204079, 'prom': 0.614234491764741, 'lousy': 0.6134593156013486, 'worst': 0.6099491551514659, 'laughable': 0.605797205376077, 'awful': 0.6046985385299606, 'poorly': 0.6045921439729667, 'remotely': 0.6009447442380652}

Positive unigram PMI
{'edie': 0.6795874359385078, 'paulie': 0.6696371050853397, 'felix': 0.6411211551107117, 'matthau': 0.6263429214196956, 'victoria': 0.6157359639519752, 'flawless': 0.6123352089293405, 'mildred': 0.6119287874646929, 'astaire': 0.5942275869873511, 'superbly': 0.5859019518611848, 'perfection': 0.5798880757154198, 'wonderfully': 0.5629829508015687, 'c

In [43]:
neg_bi_pmi, pos_bi_pmi = find_pmi(neg_bi, pos_bi, bi_vocab)

print("Negative bigram PMI")
print(dict(sorted(neg_bi_pmi.items(), key=lambda x: x[1], reverse=True)[:20]))
print("\nPositive bigram PMI")
print(dict(sorted(pos_bi_pmi.items(), key=lambda x: x[1], reverse=True)[:20]))

Negative bigram PMI
{('prom', 'night'): 0.7068933270733797, ('even', 'worth'): 0.7068933270733797, ('terrible', 'movie'): 0.7068933270733797, ('worst', 'films'): 0.6978434915534619, ('uwe', 'boll'): 0.6944708070748229, ('total', 'waste'): 0.6897988937140799, ('worst', 'movies'): 0.6877539868626824, ('movie', 'sucks'): 0.6874752412162782, ('badly', 'written'): 0.6870906997772003, ('terrible', 'film'): 0.6870906997772003, ('bad', 'bad'): 0.6833628296631858, ('worst', 'film'): 0.6818293584101638, ('awful', 'movie'): 0.6765879775780509, ('worst', 'movie'): 0.6733706350347364, ('money', 'back'): 0.66661942793544, ('dont', 'waste'): 0.6665652816864079, ('complete', 'waste'): 0.664333712654584, ('power', 'rangers'): 0.6581031629039479, ('poor', 'acting'): 0.657296385934008, ('horrible', 'movie'): 0.6556000326858293}

Positive bigram PMI
{('red', 'sox'): 0.6795874312425673, ('gunga', 'din'): 0.6795874312425672, ('rob', 'roy'): 0.6676112401958515, ('midnight', 'cowboy'): 0.6633269103707867, ('n

In [44]:
print("PMI of good and bad in Negative Class")
print(neg_uni_pmi['good'])
print(neg_uni_pmi['bad'])

print("\nPMI of good and bad in Positive Class")
print(pos_uni_pmi['good'])
print(pos_uni_pmi['bad'])

PMI of good and bad in Negative Class
-0.003467875940748813
0.4790195370818304

PMI of good and bad in Positive Class
0.0033629582298617816
-0.9111501666679825
