In [None]:
import pickle
import json
import csv
import pickle
import re
import os, sys
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize as st
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from senticnet.senticnet import SenticNet

In [None]:
class LemmaTokenizer(object):
    """Callable tokenizer: split a text into word tokens and lemmatize each one."""

    def __init__(self):
        # NOTE(review): the vectorizers unpickled later in this cell presumably
        # hold instances of this class; keep the class and attribute names
        # stable so those pickles still resolve — confirm before renaming.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the WordNet lemma of every token of `articles`, in token order."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))

# Every input/output file name is derived from `data_type`.
data_type = "reddit"
input_text_file = "text_clean_" + data_type + ".pkl"
input_com_file = "count_clean_" + data_type + ".pkl"
input_tf_file = "tfidf_vectorizer_" + data_type + ".pkl"
input_count_file = "count_vectorizer_" + data_type + ".pkl"
output_feature_filename = "features_" + data_type + ".csv"

# SECURITY: pickle.load can execute arbitrary code stored in the file.
# Only run this cell on pickle files that come from a trusted source.
with open(input_text_file, "rb") as f:
    text_final = pickle.load(f)
with open(input_com_file, "rb") as f:
    num_comments_final = pickle.load(f)
# The feature loop pairs text_final[i] with num_comments_final[i].
assert len(text_final) == len(num_comments_final), \
    "text and comment-count lists must have the same length"
print("Text List: ", len(text_final))
with open(input_tf_file, "rb") as f:
    tfidf_transformer = pickle.load(f)
with open(input_count_file, "rb") as f:
    count_vectorizer = pickle.load(f)

# Assuming a scikit-learn vectorizer: get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out().
# Prefer the new method and fall back for objects that only have the old one.
# list(...) keeps `feature_names` an indexable list of str-compatible items.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()

In [None]:
def generate_lex_tag_score_dict(path="expandedLexicon.txt"):
    """Load the hate-intensity lexicon into a ``{(word, tag): score}`` dict.

    Every non-blank line must hold exactly two tab-separated fields: a
    ``<word parts joined by "_">_<tag>`` entry and a numeric score. Lines are
    stripped and lower-cased before parsing; the text after the last
    underscore is the tag, and underscores inside the word become spaces.

    Args:
        path: Lexicon file to read. Defaults to the file name this notebook
            has always used, so existing zero-argument calls are unaffected.

    Returns:
        dict mapping ``(word, tag)`` tuples to float scores. When a key occurs
        on several lines, the last one wins.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields, or
            if its score cannot be parsed as a float.
    """
    lexicon_tag_score = {}
    with open(path, "r") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip().lower()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no entry; skip them instead of failing.
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                # Explicit check instead of `assert`, which is stripped under -O.
                raise ValueError(
                    "{}:{}: expected 2 tab-separated fields, got {}".format(
                        path, line_no, len(fields)))
            word_pair, score = fields
            # rpartition gives ("", "", entry) when there is no underscore,
            # i.e. an empty word and the whole entry as the tag.
            word, _, tag = word_pair.rpartition("_")
            lexicon_tag_score[(word.replace("_", " "), tag)] = float(score)
    return lexicon_tag_score

# English stop-word list from NLTK (not referenced by the code in this cell).
stop_words = list(stopwords.words("english"))
# (word, tag) -> score lookup consumed by get_hate_lexicon_score().
lexicon_tag_score = generate_lex_tag_score_dict()
# SenticNet instance plus the keys of its `data` attribute; polarity() uses
# the keys for membership checks before asking for a polarity value.
sn = SenticNet()
sn_words = vars(sn)["data"].keys()
# Maps the POS tags returned by nltk.pos_tag (Penn Treebank tag set) to the
# tag names used as the second element of the lexicon keys. Tags that are
# missing here are ignored by get_hate_lexicon_score().
tag_map = {
    # adjectives
    "JJ": "adj",
    "JJR": "adj",
    "JJS": "adj",
    # nouns
    "NN": "noun",
    "NNS": "noun",
    "NNP": "noun",
    "NNPS": "noun",
    # verbs
    "VB": "verb",
    "VBG": "verb",
    "VBD": "verb",
    "VBN": "verb",
    "VBP": "verb",
    # Was misspelled "VPZ", which is not a Penn Treebank tag, so
    # third-person-singular present verbs ("hates", "is") were never scored.
    "VBZ": "verb",
}

### Referenced from https://github.com/LCS2-IIITD/ChatterNet/tree/master/Baselines/CasPred
def get_hate_lexicon_score(text):
    """Sum the hate-intensity lexicon scores of the words in `text`.

    The text is lower-cased, tokenized and POS-tagged. A token contributes its
    lexicon score only if its POS tag has an entry in `tag_map` and the
    resulting (word, tag) pair exists in `lexicon_tag_score`; every other
    token contributes nothing.
    """
    tokens = nltk.word_tokenize(text.lower())
    sent_score = 0
    for word, pos in nltk.pos_tag(tokens):
        mapped = tag_map.get(pos)
        if mapped is None:
            # POS tag outside the adjective/noun/verb groups of tag_map.
            continue
        sent_score += lexicon_tag_score.get((word, mapped), 0)
    return sent_score


def complexity(c):
    """Complexity score of a post computed from its term-count vector `c`.

    With ``n`` the number of non-zero entries of `c`, returns the mean over
    those entries of ``c[i] * (log(n) - log(1 + c[i]))``, or 0.0 when `c` has
    no non-zero entry.
    """
    present = np.nonzero(c)[0]
    n_present = len(present)
    if not n_present:
        return 0.0
    # log(n) is the same for every term, so compute it once.
    log_n = np.log(n_present)
    total = 0
    for i in present:
        total += c[i] * (log_n - np.log(1 + c[i]))
    return total / n_present


def num_sen(text):
    """Return the number of sentences NLTK's sentence tokenizer finds in `text`."""
    sentences = st(text)
    return len(sentences)


def num_word(text, c):
    """Average number of counted word tokens per sentence.

    Args:
        text: The post text; used only to count its sentences.
        c: Term-count vector of the post; its sum is the token total.

    Returns:
        ``sum(c) / num_sen(text)``, or 0.0 when no sentence is found.
    """
    # Tokenize into sentences once and reuse the count; the previous version
    # called num_sen() a second time for the division.
    ns = num_sen(text)
    if not ns:
        return 0.0
    return np.sum(c) / ns


def readability(text, c):
    """Readability score: share of long terms plus tokens per sentence.

    Computes ``100 * h / sum(c) + num_word(text, c)``, where ``h`` is the
    number of vocabulary entries present in the post (non-zero in `c`) whose
    name is longer than six characters. Returns 0.0 when `c` has no non-zero
    entry.
    """
    present = np.nonzero(c)[0]
    if len(present) == 0:
        return 0.0
    # NOTE(review): each long term is counted once regardless of c[i], while
    # the denominator is the total token count — confirm this is intended.
    h_word = sum(1 for i in present if len(feature_names[i]) > 6)
    long_share = 100 * float(h_word) / np.sum(c)
    return long_share + num_word(text, c)


def informative(text):
    """Informativeness of the post: the sum of its TF-IDF vector entries."""
    counts = count_vectorizer.transform([text])
    tfidf_vector = tfidf_transformer.transform(counts).toarray()[0]
    return np.sum(tfidf_vector)


def polarity(c):
    """Sentiment of the post: count-weighted sum of SenticNet polarity values.

    Only vocabulary entries that are non-zero in `c` and present in
    `sn_words` contribute; returns 0 when none do.
    """
    score = 0
    for i in np.nonzero(c)[0]:
        term = feature_names[i]
        if term not in sn_words:
            continue
        score += float(sn.polarity_value(term)) * c[i]
    return score


def count_url(s):
    """Number of external links in the post, counted as ``@url`` occurrences.

    NOTE(review): ``@url`` is presumably the token the cleaning step puts in
    place of each link — verify against the preprocessing code.
    """
    matches = re.finditer(r"@url", s)
    return sum(1 for _ in matches)


# Build one feature row per post. Module-level names are kept unchanged
# because later cells may rely on them.
feature_list = []
total_len = len(text_final)
for count in range(total_len):
    cleaned_text = text_final[count]
    num_com = num_comments_final[count]
    # Progress indicator every 100 posts.
    if not count % 100:
        print(count)
    cvector = count_vectorizer.transform([cleaned_text]).toarray()[0]
    # Column order of the output row; the comment count is the last column.
    thread_feature = [
        complexity(cvector),
        readability(cleaned_text, cvector),
        num_sen(cleaned_text),
        num_word(cleaned_text, cvector),
        informative(cleaned_text),
        polarity(cvector),
        count_url(cleaned_text),
        get_hate_lexicon_score(cleaned_text),
        num_com,
    ]
    feature_list.append(thread_feature)

In [None]:
# Write one CSV row per post (no header row). The csv module requires the
# file to be opened with newline="" so the writer controls line endings;
# without it, Windows output gets "\r\r\n" endings that read back as blank rows.
with open(output_feature_filename, "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerows(feature_list)