In [None]:
import pickle
from os import path

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from utils import read_file, preprocess, stem

from lda import guidedlda

In [None]:
# Load processed data: raw -> cleaning -> stemming.
# A pickle cache is used so read_file/preprocess only run when no cache exists.
processed_datafile = 'processed.pkl'
if path.exists(processed_datafile):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook itself.
    with open(processed_datafile, 'rb') as f:
        data, y = pickle.load(f)
else:
    data, y = read_file('../hatespeech', True)
    data = [preprocess(text) for text in data]
    # Write to the same path that is checked above, so the cache location
    # is defined in exactly one place.
    with open(processed_datafile, 'wb') as f:
        pickle.dump((data, y), f)

In [None]:
# Build bag-of-words count features (the document-term matrix) for the corpus.
count_vectorizer = CountVectorizer(input='content', encoding='ascii',
                                   decode_error='ignore',
                                   strip_accents='ascii',
                                   stop_words='english', min_df=2)
count_weights = count_vectorizer.fit_transform(data)

# vocabulary_ already maps each term to its column index in the matrix.
# Enumerating the dict would number terms in dict iteration order instead,
# which is not the column order, so the mapping is taken as-is.
vocabulary = count_vectorizer.vocabulary_
word2id = {term: int(col) for term, col in vocabulary.items()}

# Document-term matrix: reuse the matrix fitted above rather than fitting the
# vectorizer a second time on the same data.
TermCountsDoc = count_weights

# Terms[j] is the term counted in column j of TermCountsDoc.
# (Wrapping dict.keys() directly in np.array gives a 0-d object array on
# Python 3, so an explicitly ordered list is built instead.)
Terms = np.array(sorted(vocabulary, key=vocabulary.get))

In [None]:
# Keywords with which to guide the LDA: one list per topic. The position of a
# list inside `keywords` is used as the label of that topic.
keywords = [ ['time', 'love', 'good', 'great', 'happy'], 
           ['new', 'free', 'video', 'check', 'win'], 
           ['fucked', 'bitch', 'pussy', 'ass', 'ugly'], 
           ['hate', 'racist', 'muslims', 'retarded', 'isis'] ]


# Stemmed form of every keyword, keyed by topic label (0, 1, 2, ...).
# enumerate replaces a hand-maintained counter; the resulting dict is the same.
keyword_indices = {
    class_label: [stem(word) for word in topic_words]
    for class_label, topic_words in enumerate(keywords)
}

In [None]:
# Create the (not yet fitted) guided LDA model.
model = guidedlda.GuidedLDA(
    n_topics=4,
    n_iter=50,
    random_state=7,
    refresh=20,
)

In [None]:
# Build the seed mapping {vocabulary word id: topic id} for guided LDA.
#
# The corpus is described as stemmed when it is loaded, so each keyword is
# looked up in its stemmed form first (keyword_indices) and in its raw form
# as a fallback -- presumably only one of the two exists in the vocabulary;
# TODO confirm against utils.preprocess.
seed_topics = {}
for t_id, raw_words in enumerate(keywords):
    for raw_word, stemmed_word in zip(raw_words, keyword_indices[t_id]):
        word_id = word2id.get(stemmed_word, word2id.get(raw_word))
        if word_id is None:
            # The word is not in the fitted vocabulary, so it cannot be
            # seeded; skip it instead of failing with a KeyError.
            print('seed word %r not found in vocabulary; skipped' % raw_word)
            continue
        seed_topics[word_id] = t_id

In [None]:
# Guided LDA: fit on the document-term matrix with the seed mapping.
model.fit(
    TermCountsDoc,
    seed_topics=seed_topics,
    seed_confidence=0.2,
)

# Unguided (plain) LDA alternative, kept for reference:
# model.fit(TermCountsDoc)

In [None]:
# Document-topic probability distributions, obtained by running the fitted
# model over the same document-term matrix it was trained on.
# NOTE(review): presumably one row per document and one column per topic --
# confirm against the guidedlda transform() documentation.
doc_topic = model.transform(TermCountsDoc)

In [None]:
# Assign each document the label with the highest probability:
# argmax along axis 1 picks, per row of doc_topic, the index of the largest value.
y_pred = np.argmax(doc_topic, axis=1)

In [None]:
# Compare the predicted labels with the true labels and show the report.
report = classification_report(y, y_pred)
print(report)