In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pandas as pd
import numpy as np
import re

In [3]:
df = pd.read_csv('imdb_master.csv', index_col=False, encoding='ISO-8859-1')
# Remove the first column
df = df.drop(df.columns[0], axis=1)
# Remove the rows with label == 'unsup'
df = df[df.label != 'unsup']

# split the data into train and test by the type column
train_df = df[df.type == 'train']
test_df = df[df.type == 'test']

X_train = train_df['review']
y_train = train_df['label']

X_test = test_df['review']
y_test = test_df['label']

In [4]:
# simpel preprocessing
#Process the review column line by line to do text preprocessing
def process_review(review):
    # remove the punctuations and numbers
    review = re.sub(r'[^A-Za-z]+', ' ', review)
    # convert the review to lower case
    review = review.lower()
    # remove the stopwords
    stop_words = set(stopwords.words('english'))
    # tokenize the words
    word_tokens = word_tokenize(review)
    filtered_review = [w for w in word_tokens if not w in stop_words]
    
    return filtered_review

# process the train and test reviews
X_train = X_train.apply(process_review)
X_test = X_test.apply(process_review)

In [7]:
X_train_lst = X_train.tolist()
X_test_lst = X_test.tolist()

#  我也不太明白，但是要把数据转换成这个格式才能用
X_train_lst = [' '.join(x) for x in X_train_lst]
X_test_lst = [' '.join(x) for x in X_test_lst]

X_train_lst[0]

'story man unnatural feelings pig starts opening scene terrific example absurd comedy formal orchestra audience turned insane violent mob crazy chantings singers unfortunately stays absurd whole time general narrative eventually making putting even era turned cryptic dialogue would make shakespeare seem easy third grader technical level better might think good cinematography future great vilmos zsigmond future stars sally kirkland frederic forrest seen briefly'

In [8]:
corpus = X_train_lst 

In [9]:
from collections import defaultdict

word_freqs = defaultdict(int)

for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(word_freqs)



In [10]:
len(word_freqs)

77679

In [11]:
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print(alphabet)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ġ']


In [12]:
vocab = ["<|endoftext|>"] + alphabet.copy()

In [13]:
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [14]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

In [15]:
pair_freqs = compute_pair_freqs(splits)

In [16]:
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

('i', 'n') 361837


In [17]:
merges = {("i", "n"): "in"}
vocab.append("in")

In [18]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [19]:
vocab_size = 10000

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [21]:
print(merges)



In [22]:
# sort the vocab by the len of the word
vocab.sort(key=len, reverse=True)
print(vocab)



In [23]:
len(vocab)

10000

In [24]:
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])

In [34]:
test_sample = X_test_lst[0]
#test_sample = "I love this movie. It is so good."
test_sample

'mr costner dragged movie far longer necessary aside terrific sea rescue sequences care characters us ghosts closet costner character realized early forgotten much later time care character really care cocky overconfident ashton kutcher problem comes kid thinks better anyone else around shows signs cluttered closet obstacle appears winning costner finally well past half way point stinker costner tells us kutcher ghosts told kutcher driven best prior inkling foreshadowing magic could keep turning hour'

In [36]:
test_tokens = tokenize(test_sample)
print(test_tokens)

['m', 'r', 'Ġcost', 'ner', 'Ġdragged', 'Ġmovie', 'Ġfar', 'Ġlonger', 'Ġnecessary', 'Ġaside', 'Ġterrific', 'Ġsea', 'Ġrescue', 'Ġsequences', 'Ġcare', 'Ġcharacters', 'Ġus', 'Ġghosts', 'Ġcloset', 'Ġcost', 'ner', 'Ġcharacter', 'Ġrealized', 'Ġearly', 'Ġforgotten', 'Ġmuch', 'Ġlater', 'Ġtime', 'Ġcare', 'Ġcharacter', 'Ġreally', 'Ġcare', 'Ġcock', 'y', 'Ġover', 'con', 'f', 'ident', 'Ġash', 'ton', 'Ġk', 'ut', 'cher', 'Ġproblem', 'Ġcomes', 'Ġkid', 'Ġthinks', 'Ġbetter', 'Ġanyone', 'Ġelse', 'Ġaround', 'Ġshows', 'Ġsigns', 'Ġcl', 'utter', 'ed', 'Ġcloset', 'Ġobst', 'acle', 'Ġappears', 'Ġwinning', 'Ġcost', 'ner', 'Ġfinally', 'Ġwell', 'Ġpast', 'Ġhalf', 'Ġway', 'Ġpoint', 'Ġstinker', 'Ġcost', 'ner', 'Ġtells', 'Ġus', 'Ġk', 'ut', 'cher', 'Ġghosts', 'Ġtold', 'Ġk', 'ut', 'cher', 'Ġdriven', 'Ġbest', 'Ġprior', 'Ġin', 'k', 'ling', 'Ġfo', 'resh', 'ad', 'owing', 'Ġmagic', 'Ġcould', 'Ġkeep', 'Ġturning', 'Ġhour']


In [37]:
print('Before the tokenization, the length of the review is ',len(test_sample))
print('After the tokenization, the length of the review is ',len(test_tokens))

Before the tokenization, the length of the review is  504
After the tokenization, the length of the review is  97


In [39]:
# tokenize the X_train_lst and X_test_lst reviews
# warning: this will take a while
X_train_tokens = [tokenize(review) for review in X_train_lst]
X_test_tokens = [tokenize(review) for review in X_test_lst]


In [47]:
# update the X_train into X_train_tokens
for i in range(len(X_train_tokens)):
    words = X_train_tokens[i]
    X_train.iloc[i] = words

X_train.head()

25000    [story, Ġman, Ġun, natural, Ġfeelings, Ġpig, Ġ...
25001    [air, port, Ġstarts, Ġbrand, Ġnew, Ġlux, ury, ...
25002    [film, Ġlacked, Ġsomething, Ġput, Ġfinger, Ġfi...
25003    [s, or, ry, Ġeveryone, Ġknow, Ġsupposed, Ġart,...
25004    [l, ittle, Ġparents, Ġtook, Ġalong, Ġtheater, ...
Name: review, dtype: object

In [48]:
# create a vocabulary
vocab_train = {}
for review in X_train:
    for word in review:
        if word in vocab_train:
            vocab_train[word] += 1
        else:
            vocab_train[word] = 1

# sort the bag by the value
vocab_train = sorted(vocab_train.items(), key=lambda x: x[1], reverse=True)
# remove the words with frequency less than 2
vocab_train = [x for x in vocab_train if x[1] >= 2]
# # add the <UNK> token
# vocab.append(('<UNK>', 0))
vocab_train[:10]

[('Ġbr', 103823),
 ('Ġmovie', 42857),
 ('Ġfilm', 39687),
 ('Ġone', 26227),
 ('Ġlike', 20168),
 ('s', 18045),
 ('Ġgood', 15371),
 ('ed', 15270),
 ('ing', 14544),
 ('Ġtime', 12718)]

In [50]:
len(vocab_train)

9918

In [51]:
# create naive bayes model
def naive_bayes_model(X_train, y_train, vocab, k=1):
    # the key is the class and the value is the prior probability
    prior_prob = {}
    prior_prob['pos'] = y_train.value_counts()['pos'] / len(y_train)
    prior_prob['neg'] = y_train.value_counts()['neg'] / len(y_train)
    # the key is the class and the value is the conditional probability
    cond_prob = {}
    cond_prob['pos'] = {}
    cond_prob['neg'] = {}
    # calculate the conditional probability of each word given each class
    for word in vocab:
        # calculate the conditional probability of the word given the class 'pos'
        word_given_pos = X_train[y_train == 'pos'].apply(lambda x: x.count(word[0]))
        cond_prob['pos'][word[0]] = (word_given_pos.sum() + k) / (len(X_train[y_train == 'pos']) + k * len(vocab))
        # calculate the conditional probability of the word given the class 'neg'
        word_given_neg = X_train[y_train == 'neg'].apply(lambda x: x.count(word[0]))
        cond_prob['neg'][word[0]] = (word_given_neg.sum() + k) / (len(X_train[y_train == 'neg']) + k * len(vocab))

    return prior_prob, cond_prob

In [52]:
prior_prob, cond_prob = naive_bayes_model(X_train, y_train, vocab_train, k=1)

In [53]:
# predict the class of the test dataset
def get_y_pred(X_test, prior_prob, cond_prob, vocab):
    y_pred = []
    for review in X_test:
        # calculate the posterior probability of each class given the review
        pos_prob = np.log(prior_prob['pos'])
        neg_prob = np.log(prior_prob['neg'])
        for word in review:
            # if the word appear in the 'pos' class
            if word in cond_prob['pos']:
                pos_prob += np.log(cond_prob['pos'][word])
            if word in cond_prob['neg']:
                neg_prob += np.log(cond_prob['neg'][word]) 
        # predict the class of the review
        if pos_prob > neg_prob:
            y_pred.append('pos')
        else:
            y_pred.append('neg')
    return y_pred

In [54]:
y_pred = get_y_pred(X_test, prior_prob, cond_prob, vocab_train)

In [55]:
# calculate the accuracy of the model on the test dataset from the predicted class
accuracy = (y_pred == y_test).sum() / len(y_test)
accuracy

0.623

In [56]:
# calculate the confusion matrix
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,neg,pos
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
neg,4130,8370
pos,1055,11445


In [57]:
# build F1 score method
def f1_score(y_true, y_pred):
    # get the true positive, false positive, false negative
    tp = ((y_true == 'pos') & (y_pred == 'pos')).sum()
    fp = ((y_true == 'neg') & (y_pred == 'pos')).sum()
    fn = ((y_true == 'pos') & (y_pred == 'neg')).sum()
    # calculate the precision and recall
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # calculate the F1 score
    f1 = 2 * precision * recall / (precision + recall)
    return f1

In [58]:
# convert the class to a numpy array
y_pred0 = np.array(y_pred)
y_test0 = np.array(y_test)

f1 = f1_score(y_test0, y_pred0)
f1

0.7083397802877921