In [35]:
import json
import os
import re
import math

In [25]:
with open("features.txt") as w:
    features = json.load(w)

In [26]:
a = set([k for k,v in features['fn']])
b = set([k for k,v in features['fp']])
c = set([k for k,v in features['ft']])
d = set([k for k,v in features['fd']])

In [27]:
neg_dict = {
        "can't": "can not",
        "couldn't": "could not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "would't": "would not",
        "shouldn't": "should not",
        "isn't": "is not",
        "aren't": "are not",
    }

In [47]:
def compute_feature_utility(docs, t, _class, feature_of_class):
    # Ntc --> t: document contain t, c: doc in class c
    # Ex: N10 -> doc that contains t (et = 1) and doc is not in class c (ec = 0)
    N00 = 0
    N11 = 0
    N01 = 0
    N10 = 0
    for c in _class:
        for doc in docs[c]:
            if c == feature_of_class:
                if t in doc:
                    N11 += 1
                else:
                    N01 += 1
            else:
                if t in doc:
                    N10 += 1
                else:
                    N00 += 1
    if N00 == 0:
        N00 += 1
    if N01 == 0:
        N01 += 1
    if N10 == 0:
        N10 += 1
    if N11 == 0:
        N11 += 1
    N = N00 + N01 + N10 + N11
    U = N11 * math.log2((N * N11) / ((N10 + N11) * (N01 + N10))) / N
    U += N01 * math.log2((N * N01) / ((N00 + N01) * (N10 + N11))) / N
    U += N10 * math.log2((N * N10) / ((N10 + N11) * (N00 + N10))) / N
    U += N00 * math.log2((N * N00) / ((N00 + N01) * (N00 + N01))) / N
    return U


def select_feature(docs, k, feature_of_class):
    nb_class = docs.keys()
    vocab = set([])
    L = []
    for _class in docs:
        for doc in docs[_class]:
            vocab.update(doc)
    for t in vocab:
        U = compute_feature_utility(docs, t, nb_class, feature_of_class)
        L.append((U, t))
    L.sort(reverse=True)
    return L

In [119]:
def tokenize(doc):
    # filter stopwords later
    stop_words = {}
    token_list = [x.strip() for x in re.sub(r"[^a-zA-Z ]", "", doc).split(" ") if len(x) > 0]
    return token_list

def read_file(class_folder):
    docs = []
    for fold in class_folder:
        if fold.is_dir():
            folder = os.scandir(fold.path)
            for training_file in folder:
                if ".txt" not in training_file.path or "README" in training_file.path:
                    continue
                with open(training_file.path, "r", encoding="utf-8") as file_reader:
                    doc = ""
                    for line in file_reader:
                        doc += line.lower()
                    sub_doc = multiple_replace(neg_dict, doc)
                    docs.append(sub_doc)
    return docs


def multiple_replace(dict, text):
    # Create a regular expression  from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)

In [124]:
def remove_rare_feature(docs):
    count_token = {}
    for i, doc in enumerate(docs):
        token_set = tokenize(doc)
        for t in token_set:
            if t not in count_token:
                count_token[t] = 0
            count_token[t] += 1

    count_token = dict(filter(lambda elem: elem[1] >= 5, count_token.items()))
    sorted_x = sorted(count_token.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_x

In [125]:
path_to_train = './train_data'
neg_dec = os.scandir(os.path.join(path_to_train, "negative_polarity/deceptive_from_MTurk"))
neg_tr = os.scandir(os.path.join(path_to_train, "negative_polarity/truthful_from_Web"))
pos_dec = os.scandir(os.path.join(path_to_train, "positive_polarity/deceptive_from_MTurk"))
pos_tr = os.scandir(os.path.join(path_to_train, "positive_polarity/truthful_from_TripAdvisor"))

neg_dec_docs = read_file(neg_dec)
neg_tr_docs = read_file(neg_tr)
pos_dec_docs = read_file(pos_dec)
pos_tr_docs = read_file(pos_tr)

neg_dec_docs_set = remove_rare_feature(neg_dec_docs)
neg_tr_docs_set = remove_rare_feature(neg_tr_docs)
pos_dec_docs_set = remove_rare_feature(pos_dec_docs)
pos_tr_docs_set = remove_rare_feature(pos_tr_docs)

In [131]:
def count_t(count_dict, docs):
    for t,n in docs:
        if t not in count_dict:
            count_dict[t] = 0
        count_dict[t] += n


neg_count = {}
count_t(neg_count, neg_dec_docs_set + neg_tr_docs_set)
pos_count = {}
count_t(pos_count, pos_dec_docs_set + pos_tr_docs_set)

{'the': 3987,
 'and': 2259,
 'i': 1417,
 'to': 1486,
 'a': 1670,
 'was': 1472,
 'hotel': 930,
 'in': 929,
 'my': 563,
 'of': 823,
 'chicago': 512,
 'at': 581,
 'we': 792,
 'is': 671,
 'for': 648,
 'it': 559,
 'room': 577,
 'with': 470,
 'stay': 400,
 'this': 449,
 'were': 453,
 'very': 452,
 'that': 362,
 'you': 323,
 'had': 335,
 'have': 292,
 'our': 335,
 'great': 391,
 'not': 355,
 'staff': 285,
 'all': 262,
 'as': 252,
 'they': 232,
 'there': 255,
 'are': 236,
 'from': 310,
 'so': 236,
 'rooms': 216,
 'on': 352,
 'would': 229,
 'stayed': 213,
 'be': 184,
 'service': 177,
 'will': 154,
 'me': 141,
 'time': 139,
 'when': 149,
 'but': 248,
 'comfortable': 161,
 'one': 148,
 'clean': 173,
 'just': 153,
 'an': 152,
 'definitely': 110,
 'could': 136,
 'friendly': 147,
 'place': 107,
 'like': 131,
 'recommend': 119,
 'again': 143,
 'city': 119,
 'beautiful': 100,
 'nice': 156,
 'out': 128,
 'if': 129,
 'area': 102,
 'wonderful': 108,
 'which': 112,
 'back': 99,
 'really': 111,
 'by': 112,

In [138]:
NP = sum([n for _,n in pos_count.items()])
NN = sum([n for _,n in neg_count.items()])
count_diff = {}
for t,n in pos_count.items():
    neg_n = neg_count.get(t, 1.0)
    count_diff[t] = math.log(n/NP) - math.log(neg_n/NN)
sorted_pos = sorted(count_diff.items(), key=lambda kv: kv[1], reverse=True)
sorted_pos

[('perfect', 4.8542380787924655),
 ('spacious', 4.766130811282199),
 ('enjoyed', 4.654905176171974),
 ('fantastic', 4.347420476424014),
 ('navy', 4.124276925109804),
 ('atmosphere', 3.8693846754810144),
 ('elegant', 3.7312343370001972),
 ('fabulous', 3.7312343370001972),
 ('flat', 3.69349400901735),
 ('dining', 3.6542732958640682),
 ('delicious', 3.6542732958640682),
 ('gym', 3.6542732958640682),
 ('pump', 3.6134513013438134),
 ('professional', 3.5264399243541833),
 ('awesome', 3.5264399243541833),
 ('convenient', 3.5264399243541833),
 ('gorgeous', 3.4799199087192907),
 ('favorite', 3.4311297445498585),
 ('incredible', 3.4311297445498585),
 ('buffet', 3.4311297445498585),
 ('future', 3.3798364501623084),
 ('reasonable', 3.3257692288920326),
 ('polite', 3.3257692288920326),
 ('workout', 3.3257692288920326),
 ('cozy', 3.3257692288920326),
 ('minute', 3.3257692288920326),
 ('helped', 3.2686108150520843),
 ('feeling', 3.2686108150520843),
 ('attractions', 3.2686108150520843),
 ('pet', 3.14

In [140]:
count_diff_neg = {}
for t,n in neg_count.items():
    pos_n = pos_count.get(t, 1.0)
    count_diff_neg[t] = -math.log(pos_n/NP) + math.log(n/NN)
sorted_neg = sorted(count_diff_neg.items(), key=lambda kv: kv[1], reverse=True)
sorted_neg

[('finally', 4.255950411233275),
 ('rude', 4.041939343482338),
 ('disappointed', 3.882090642540442),
 ('someone', 3.6421399729098507),
 ('manager', 3.6421399729098507),
 ('dirty', 3.625045539550551),
 ('worst', 3.5348944425562534),
 ('clerk', 3.496428161728457),
 ('later', 3.456422827114759),
 ('smelled', 3.4147501307141903),
 ('cold', 3.3487921629223925),
 ('smell', 3.325802644697694),
 ('toilet', 3.325802644697694),
 ('waiting', 3.27817459570844),
 ('cleaned', 3.228164175133778),
 ('sheets', 3.228164175133778),
 ('carpet', 3.228164175133778),
 ('smoke', 3.2021886887305175),
 ('walls', 3.175520441648356),
 ('card', 3.175520441648356),
 ('unfortunately', 3.148121467460242),
 ('slow', 3.148121467460242),
 ('either', 3.119950590493545),
 ('terrible', 3.0909630536202934),
 ('poor', 3.0909630536202934),
 ('wrong', 3.0611100904706117),
 ('loud', 3.030338431803858),
 ('tried', 3.030338431803858),
 ('despite', 3.030338431803858),
 ('charged', 3.030338431803858),
 ('rather', 2.998589733489278)