In [1]:
import nltk
import math
import pandas as pd
from nltk.corpus import movie_reviews
from nltk.util import bigrams, trigrams
from nltk.probability import FreqDist
from nltk.corpus import stopwords


In [2]:
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\exerc\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\exerc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\exerc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load corpus
words = [w.lower() for w in movie_reviews.words()]

# Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))
# words = [w for w in words if w.isalpha() and w not in stop_words]
words = [w for w in words if w.isalpha()]

N = len(words)
print("Total tokens:", N)


Total tokens: 1329753


In [4]:
bigram_list = list(bigrams(words))
trigram_list = list(trigrams(words))

bigram_fd = FreqDist(bigram_list)
trigram_fd = FreqDist(trigram_list)

word_fd = FreqDist(words)


In [5]:
def t_test_bigram(w1, w2):
    O = bigram_fd[(w1, w2)]
    if O == 0:
        return 0

    E = (word_fd[w1] * word_fd[w2]) / N
    return (O - E) / math.sqrt(O)


In [6]:
def statistical_t_test_bigram(w1, w2):
    O = bigram_fd[(w1, w2)]
    N_bigram = N - 1

    if N_bigram <= 1:
        return 0

    # Sample mean
    x_bar = O / N_bigram

    # Expected mean under independence
    mu = (word_fd[w1] / N) * (word_fd[w2] / N)

    # Sample variance (Bernoulli)
    s2 = x_bar * (1 - x_bar)

    if s2 == 0:
        return 0

    # Standard error
    se = math.sqrt(s2 / N_bigram)

    # t-statistic
    t_stat = (x_bar - mu) / se

    return t_stat


In [7]:
def t_test_trigram(w1, w2, w3):
    O = trigram_fd[(w1, w2, w3)]
    if O == 0:
        return 0

    E = (word_fd[w1] * word_fd[w2] * word_fd[w3]) / (N**2)
    return (O - E) / math.sqrt(O)


In [8]:
def chi_square_bigram(w1, w2):
    O11 = bigram_fd[(w1, w2)]
    O12 = word_fd[w1] - O11
    O21 = word_fd[w2] - O11
    O22 = N - (O11 + O12 + O21)

    total = O11 + O12 + O21 + O22

    E11 = (word_fd[w1] * word_fd[w2]) / total
    E12 = (word_fd[w1] * (total - word_fd[w2])) / total
    E21 = ((total - word_fd[w1]) * word_fd[w2]) / total
    E22 = ((total - word_fd[w1]) * (total - word_fd[w2])) / total

    chi2 = ((O11 - E11)**2 / E11) + ((O12 - E12)**2 / E12) + \
           ((O21 - E21)**2 / E21) + ((O22 - E22)**2 / E22)

    return chi2


In [9]:
test_bigrams = [
    ("highly", "recommended"),
    ("worth", "watching"),
    ("special", "effects"),
    ("great", "acting"),
    ("predictable", "plot"),
    ("cinematic", "banana"),
    ("plot", "twist"),
    ("plot", "refrigerator")
]


results_t = []

for w1, w2 in test_bigrams:
    results_t.append({
        "Bigram": f"{w1} {w2}",
        "Frequency": bigram_fd[(w1, w2)],
        "t-score": round(t_test_bigram(w1, w2), 3)
    })

pd.DataFrame(results_t)


Unnamed: 0,Bigram,Frequency,t-score
0,highly recommended,4,1.999
1,worth watching,25,4.976
2,special effects,387,19.658
3,great acting,13,3.439
4,predictable plot,9,2.931
5,cinematic banana,0,0.0
6,plot twist,22,4.666
7,plot refrigerator,0,0.0


In [10]:
for w1, w2 in test_bigrams:
    results_t.append({
        "Bigram": f"{w1} {w2}",
        "Frequency": bigram_fd[(w1, w2)],
        "t-score": round(statistical_t_test_bigram(w1, w2), 3)
    })

pd.DataFrame(results_t)

Unnamed: 0,Bigram,Frequency,t-score
0,highly recommended,4,1.999
1,worth watching,25,4.976
2,special effects,387,19.658
3,great acting,13,3.439
4,predictable plot,9,2.931
5,cinematic banana,0,0.0
6,plot twist,22,4.666
7,plot refrigerator,0,0.0
8,highly recommended,4,1.999
9,worth watching,25,4.976


In [11]:
results_chi = []

for w1, w2 in test_bigrams:
    results_chi.append({
        "Bigram": f"{w1} {w2}",
        "Frequency": bigram_fd[(w1, w2)],
        "Chi-Square": round(chi_square_bigram(w1, w2), 3)
    })

pd.DataFrame(results_chi)


Unnamed: 0,Bigram,Frequency,Chi-Square
0,highly recommended,4,8226.726
1,worth watching,25,5237.932
2,special effects,387,534326.665
3,great acting,13,256.619
4,predictable plot,9,373.836
5,cinematic banana,0,0.001
6,plot twist,22,4131.529
7,plot refrigerator,0,0.003


In [12]:
test_trigrams = [
    ("as", "a", "result"),
    ("one", "of", "the"),
    ("green", "ideas", "sleep")
]

results_tri = []

for w1, w2, w3 in test_trigrams:
    results_tri.append({
        "Trigram": f"{w1} {w2} {w3}",
        "Frequency": trigram_fd[(w1, w2, w3)],
        "t-score": round(t_test_trigram(w1, w2, w3), 3)
    })

pd.DataFrame(results_tri)


Unnamed: 0,Trigram,Frequency,t-score
0,as a result,81,8.994
1,one of the,1026,31.761
2,green ideas sleep,0,0.0
