In [35]:
import json
import os
import re
import math

In [25]:
with open("features.txt") as w:
    features = json.load(w)

In [26]:
a = set([k for k,v in features['fn']])
b = set([k for k,v in features['fp']])
c = set([k for k,v in features['ft']])
d = set([k for k,v in features['fd']])

In [27]:
neg_dict = {
        "can't": "can not",
        "couldn't": "could not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "would't": "would not",
        "shouldn't": "should not",
        "isn't": "is not",
        "aren't": "are not",
    }

In [47]:
def compute_feature_utility(docs, t, _class, feature_of_class):
    # Ntc --> t: document contain t, c: doc in class c
    # Ex: N10 -> doc that contains t (et = 1) and doc is not in class c (ec = 0)
    N00 = 0
    N11 = 0
    N01 = 0
    N10 = 0
    for c in _class:
        for doc in docs[c]:
            if c == feature_of_class:
                if t in doc:
                    N11 += 1
                else:
                    N01 += 1
            else:
                if t in doc:
                    N10 += 1
                else:
                    N00 += 1
    if N00 == 0:
        N00 += 1
    if N01 == 0:
        N01 += 1
    if N10 == 0:
        N10 += 1
    if N11 == 0:
        N11 += 1
    N = N00 + N01 + N10 + N11
    U = N11 * math.log2((N * N11) / ((N10 + N11) * (N01 + N10))) / N
    U += N01 * math.log2((N * N01) / ((N00 + N01) * (N10 + N11))) / N
    U += N10 * math.log2((N * N10) / ((N10 + N11) * (N00 + N10))) / N
    U += N00 * math.log2((N * N00) / ((N00 + N01) * (N00 + N01))) / N
    return U


def select_feature(docs, k, feature_of_class):
    nb_class = docs.keys()
    vocab = set([])
    L = []
    for _class in docs:
        for doc in docs[_class]:
            vocab.update(doc)
    for t in vocab:
        U = compute_feature_utility(docs, t, nb_class, feature_of_class)
        L.append((U, t))
    L.sort(reverse=True)
    return L

In [48]:
def tokenize(doc):
    # filter stopwords later
    stop_words = {}
    token_list = [x.strip() for x in re.sub(r"[^a-zA-Z ]", "", doc).split(" ") if len(x) > 0]
    tokens = set(token_list)
    for token in token_list:
        if token.lower() in stop_words and token in tokens:
            tokens.remove(token)
    return tokens

def read_file(class_folder):
    docs = []
    for fold in class_folder:
        if fold.is_dir():
            folder = os.scandir(fold.path)
            for training_file in folder:
                if ".txt" not in training_file.path or "README" in training_file.path:
                    continue
                with open(training_file.path, "r", encoding="utf-8") as file_reader:
                    doc = ""
                    for line in file_reader:
                        doc += line.lower()
                    sub_doc = multiple_replace(neg_dict, doc)
                    docs.append(sub_doc)
    return docs


def multiple_replace(dict, text):
    # Create a regular expression  from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)

In [60]:
def remove_rare_feature(docs):
    count_token = {}
    for i, doc in enumerate(docs):
        token_set = tokenize(doc)
        for t in token_set:
            if t not in count_token:
                count_token[t] = 0
            count_token[t] += 1

    count_token = dict(filter(lambda elem: elem[1] >= 5, count_token.items()))
    sorted_x = sorted(count_token.items(), key=lambda kv: kv[1], reverse=True)
    return set([k for k, _ in sorted_x])

In [61]:
path_to_train = './train_data'
neg_dec = os.scandir(os.path.join(path_to_train, "negative_polarity/deceptive_from_MTurk"))
neg_tr = os.scandir(os.path.join(path_to_train, "negative_polarity/truthful_from_Web"))
pos_dec = os.scandir(os.path.join(path_to_train, "positive_polarity/deceptive_from_MTurk"))
pos_tr = os.scandir(os.path.join(path_to_train, "positive_polarity/truthful_from_TripAdvisor"))

neg_dec_docs = read_file(neg_dec)
neg_tr_docs = read_file(neg_tr)
pos_dec_docs = read_file(pos_dec)
pos_tr_docs = read_file(pos_tr)

neg_dec_docs_set = remove_rare_feature(neg_dec_docs)
neg_tr_docs_set = remove_rare_feature(neg_tr_docs)
pos_dec_docs_set = remove_rare_feature(pos_dec_docs)
pos_tr_docs_set = remove_rare_feature(pos_tr_docs)

neg_dec_tokenized_docs = []
for doc in neg_dec_docs:
    neg_dec_tokenized_docs.append(tokenize(doc).intersection(neg_dec_docs_set))

neg_tr_tokenized_docs = []
for doc in neg_tr_docs:
    neg_tr_tokenized_docs.append(tokenize(doc).intersection(neg_tr_docs_set))

pos_dec_tokenized_docs = []
for doc in pos_dec_docs:
    pos_dec_tokenized_docs.append(tokenize(doc).intersection(pos_dec_docs_set))

pos_tr_tokenized_docs = []
for doc in pos_tr_docs:
    pos_tr_tokenized_docs.append(tokenize(doc).intersection(pos_tr_docs_set))

In [95]:
ft = select_feature({"truthful": neg_tr_tokenized_docs + pos_tr_tokenized_docs, "deceptive": neg_dec_tokenized_docs + pos_dec_tokenized_docs}, 10000, "truthful")
fd = select_feature({"truthful": neg_tr_tokenized_docs + pos_tr_tokenized_docs, "deceptive": neg_dec_tokenized_docs + pos_dec_tokenized_docs}, 10000, "deceptive")
fp = select_feature({"positive": pos_dec_tokenized_docs + pos_tr_tokenized_docs, "negative": neg_dec_tokenized_docs + neg_tr_tokenized_docs}, 10000, "positive")
fn = select_feature({"positive": pos_dec_tokenized_docs + pos_tr_tokenized_docs, "negative": neg_dec_tokenized_docs + neg_tr_tokenized_docs}, 10000, "negative")

In [106]:
ft_set = set([v for k,v in ft if k > 1.5])
len(ft_set)

791

In [107]:
fd_set = set([v for k,v in fd if k > 1.5])
len(fd_set)

787

In [108]:
fp_set = set([v for k,v in fp if k > 1.5])
len(fp_set)

800

In [109]:
fn_set = set([v for k,v in fn if k > 1.5])
len(fn_set)

783

In [118]:
a

{'staying',
 'stopped',
 'screen',
 'second',
 'daughter',
 'cut',
 'myself',
 'following',
 'matter',
 'toilet',
 'etc',
 'then',
 'money',
 'come',
 'flat',
 'entering',
 'waiting',
 'window',
 'arguing',
 'upon',
 'traffic',
 'millennium',
 'spot',
 'anyway',
 'attention',
 'ive',
 'recommendations',
 'frequent',
 'called',
 'the',
 'walls',
 'changed',
 'priced',
 'work',
 'door',
 'eyes',
 'complaining',
 'shame',
 'condescending',
 'everything',
 'everywhere',
 'acknowledge',
 'bug',
 'sure',
 'missing',
 'main',
 'stale',
 'beware',
 'occupied',
 'turns',
 'items',
 'definitely',
 'speed',
 'requests',
 'replaced',
 'chain',
 'ignored',
 'opinion',
 'special',
 'difficult',
 'questions',
 'accommodate',
 'hardly',
 'white',
 'sat',
 'color',
 'visited',
 'sounds',
 'recently',
 'policy',
 'great',
 'went',
 'met',
 'carpets',
 'he',
 'bathrooms',
 'swim',
 'past',
 'problems',
 'view',
 'try',
 'add',
 'nothing',
 'never',
 'th',
 'avoid',
 'able',
 'might',
 'arrival',
 'realiz