In [27]:
import pandas as pd
#import re

posts = pd.read_csv('../data/fb_election_data_28112017.csv', encoding = 'utf-8')
#candidates = pd.read_csv('../data/BTW17 Facebook.csv', encoding = 'utf-8')

# cleansing
posts = posts[posts['message'].notnull()]

In [28]:
posts_party = pd.read_csv('../data/party_data.csv', encoding = 'utf-8')

conditions = [
    (posts_party['from_name'] == 'AfD'),
    (posts_party['from_name'] == 'DIE LINKE'),
    (posts_party['from_name'] == 'BÜNDNIS 90/DIE GRÜNEN'),
    (posts_party['from_name'] == 'CSU (Christlich-Soziale Union)'),
    (posts_party['from_name'] == 'CDU'),
    (posts_party['from_name'] == 'FDP')]
     
choices = ['AfD Party', 'DIE LINKE Party', 'GRÜNE Party', 'CSU Party', 'CDU Party', 'FDP Party']

import numpy as np
posts_party['Partei_ABK'] = np.select(conditions, choices, default='SPD Party')


#posts_party.sample(50)
posts = posts.append(posts_party)

#posts[posts['from_name'] == 'AfD']

posts['message'] = str(posts['message'])

In [29]:
posts.shape

(177510, 34)

In [30]:
# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.str.lower()
    # removes urls
    norm_text = norm_text.str.replace(r'^https?:\/\/.*[\r\n]*', '', n=-1)
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':', '\n']:
        norm_text = norm_text.str.replace(char, ' ' + char + ' ')
    # removes hashtags
    norm_text= norm_text.str.replace('#', '')
    return norm_text

# to do: remove hashtags. #fdp and fdp should count as the same word.

In [31]:
# applies normalization fct to posts
posts['message'] = normalize_text(posts['message'])

In [32]:
posts['message'].sample(10)

37147     1       gerade einmal 9 . 000 anträge auf die ...
183537    1       gerade einmal 9 . 000 anträge auf die ...
131447    1       gerade einmal 9 . 000 anträge auf die ...
103055    1       gerade einmal 9 . 000 anträge auf die ...
388       1       gerade einmal 9 . 000 anträge auf die ...
101824    1       gerade einmal 9 . 000 anträge auf die ...
135341    1       gerade einmal 9 . 000 anträge auf die ...
60238     1       gerade einmal 9 . 000 anträge auf die ...
3301      1       gerade einmal 9 . 000 anträge auf die ...
10343     1       gerade einmal 9 . 000 anträge auf die ...
Name: message, dtype: object

In [33]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

MessageDoc = namedtuple('MessageDoc', 'words tags split')

alldocs = []  # Will hold all docs in original order
for line_no, line in posts.iterrows():
    #import pdb; pdb.set_trace()
    message = line.message
    tokens = gensim.utils.to_unicode(message).split()
    words = tokens[1:]
    tags = [line_no, line.from_name, line.Partei_ABK] # 'tags = [tokens[0]]' would also work at extra memory cost
    split = ['train', 'test', 'extra', 'extra'][line_no//200000]  # 25k train, 25k test, 25k extra
    alldocs.append(MessageDoc(words, tags, split))
        
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

177510 docs: 177510 train-sentiment, 0 test-sentiment


In [34]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=5, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=5, workers=cores),
    # PV-DM w/ average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=5, workers=cores),
]

# Speed up setup by sharing results of the 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM w/ concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

#print(simple_models[0].wv.vocab.keys()[0:4])

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dm/c,d100,n5,w5,mc5,s0.001,t4)
Doc2Vec(dbow,d100,n5,mc5,s0.001,t4)
Doc2Vec(dm/m,d100,n5,w10,mc5,s0.001,t4)


In [35]:
simple_models[0].wv.vocab.keys()

dict_keys(['gerade', 'einmal', '9', '.', '000', 'anträge', 'auf', 'die', 'kaufprämie', '2', '"', 'liberalen', 'stehen', 'für', 'einen', 'individualisti', '3', 'wir', 'sind', 'bereit', '2017', '-', 'machen', 'deutschl', '4', 'kann', 'ein', 'jahr', 'besser', 'beginnen', 'als', 'mit', 'einem', 'gu', '5', 'platz', '12', 'im', 'diesjährigen', 'landkreis-ranking', 'vo', '6', 'in', 'zeiten', ',', 'denen', 'welt', 'aus', 'den', 'fugen', 'zu', '7', 'fällt', 'euch', 'spontan', 'etwas', 'das', 'sich', '83%', '8', 'afd', 'missbraucht', 'sophie', 'scholl', 'widerstan', '10', 'legalisierung', 'von', 'cannabis', 'nur', '11', '–', 'anders', 'lässt', 'stimmu', 'es', 'ist', 'sonntagmittag', 'du', 'kannst', 'zuhause', 'beim', 'b', '13', 'extrablatt', '!', '\\n\\ndie', 'hohenloherzei', '15', 'warum', 'freiedemokraten', 'mehr', 'andere', 'an', '16', 'viele', 'grüße', 'heute', 'pferdemarkt', 'dörzbac', '17', 'wäre', 'nicht', 'so', 'verdammt', 'viel', 'wahres', 'dran', 'man', '18', 'geht', 'bach', 'runter', 

Le and Mikolov notes that combining a paragraph vector from Distributed Bag of Words (DBOW) and Distributed Memory (DM) improves performance. We will follow, pairing the models together for evaluation. Here, we concatenate the paragraph vectors obtained from each model.

In [36]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

In [37]:
import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

  from pandas.core import datetools


In [38]:
from collections import defaultdict
best_error = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [40]:
simple_models[1].train(doc_list, total_examples=len(doc_list), epochs=10, start_alpha = 0.025, end_alpha = 0.001)

818639423

In [48]:
simple_models[0].train(doc_list, total_examples=len(doc_list), epochs=10, start_alpha = 0.025, end_alpha = 0.001)
simple_models[2].train(doc_list, total_examples=len(doc_list), epochs=10, start_alpha = 0.025, end_alpha = 0.001)

818640913

In [None]:
from random import shuffle
import datetime

import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

#alpha, min_alpha, passes = (0.025, 0.001, 50)
#alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())


for name, train_model in models_by_name.items():
    # Train
    duration = 'na'
    train_model.alpha, train_model.min_alpha = 0.025, 0.0001
    with elapsed_timer() as elapsed:
        logging.info('.. Train model')
        train_model.train(doc_list, total_examples=len(doc_list), epochs=10)
        duration = '%.1f' % elapsed()
        print(duration)
            
#print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
#alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2017-12-26 19:28:26.706122
32.2
1479.5


In [None]:
doc_id = np.random.randint(simple_models[0].docvecs.count)
print('for doc %d...' % doc_id)

most_similar_doc_ids = ()

for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
    doc_id_similar = (model.docvecs.most_similar([inferred_docvec], topn=1)[0])
    most_similar_doc_ids = most_similar_doc_ids + doc_id_similar[0]

In [202]:
simple_models[0].docvecs.doctags

{'Valentin Abel': Doctag(offset=0, word_count=4389, doc_count=93),
 'FDP': Doctag(offset=1, word_count=874602, doc_count=21293),
 'Dr. Michael von Abercron': Doctag(offset=2, word_count=2185, doc_count=75),
 'CDU': Doctag(offset=3, word_count=1951936, doc_count=39242),
 'Grigorios Aggelidis': Doctag(offset=4, word_count=2790, doc_count=47),
 'Diyar Agu': Doctag(offset=5, word_count=5009, doc_count=54),
 'DIE LINKE': Doctag(offset=6, word_count=1175047, doc_count=16024),
 'Gökay Akbulut DIE LINKE': Doctag(offset=7, word_count=942, doc_count=15),
 'Rolf Albach': Doctag(offset=8, word_count=4234, doc_count=122),
 'Stephan Albani MdB': Doctag(offset=9, word_count=38352, doc_count=516),
 'Katrin Albsteiger': Doctag(offset=10, word_count=4693, doc_count=95),
 'CSU': Doctag(offset=11, word_count=328147, doc_count=7627),
 'Daniel Alff': Doctag(offset=12, word_count=879, doc_count=16),
 'SPD': Doctag(offset=13, word_count=2767848, doc_count=58986),
 'Renata Alt': Doctag(offset=14, word_count=68

In [195]:
test = model.infer_vector(alldocs[1].words)
test
simple_models[0].wv.most_similar([test], topn=3)[0][0]

'jägerschaft'

In [49]:
simple_models[0].docvecs.most_similar([simple_models[0].docvecs['FDP']])

[('FDP', 0.9999999403953552),
 (153338, 0.5073044896125793),
 (62574, 0.4324100911617279),
 (1763, 0.41831353306770325),
 (101428, 0.4064629375934601),
 (98805, 0.40588831901550293),
 (18808, 0.40113359689712524),
 (78090, 0.3990175127983093),
 (150253, 0.38809144496917725),
 (141652, 0.3870908319950104)]

In [56]:
for party in ['SPD Party', 'CDU Party', 'DIE LINKE Party', 'AfD Party', 'CSU Party', 'GRÜNE Party', 'FDP Party']:
    sim = simple_models[1].docvecs.similarity('Cem Özdemir', party)
    print('Similarity', sim, 'with', party)

Similarity 0.460102853564 with SPD Party
Similarity 0.526825530011 with CDU Party
Similarity 0.330052668587 with DIE LINKE Party
Similarity 0.576673563388 with AfD Party
Similarity 0.266448052596 with CSU Party
Similarity 0.358092376017 with GRÜNE Party
Similarity 0.481324476382 with FDP Party


In [209]:
simple_models[0].docvecs['FDP']

array([  1.56814302e-03,   4.24118480e-03,  -4.90804575e-03,
         1.32245745e-03,  -1.38298399e-03,  -3.26056872e-03,
         5.83835703e-04,   1.39327825e-03,   6.31585252e-04,
         2.14937562e-03,   3.07568698e-03,   2.83767050e-03,
         6.94432238e-04,  -1.44751905e-03,  -2.36905972e-03,
        -2.74715433e-03,  -2.47987220e-03,   2.51617003e-03,
        -4.63978533e-04,   4.18596994e-03,  -2.86688632e-03,
        -1.56258070e-03,  -3.26186768e-03,  -1.29306444e-03,
         4.88492474e-03,   1.94621796e-03,   1.88163726e-03,
         1.86278543e-03,   2.57399236e-03,   4.94546397e-03,
         3.91571317e-03,  -2.97088700e-04,   1.99098559e-03,
         1.41985714e-03,   1.19426717e-04,   5.39211265e-04,
        -3.05896648e-03,  -4.95848758e-03,  -3.69621278e-03,
        -4.52898256e-03,   3.68376868e-03,  -3.38195311e-03,
        -3.30458628e-03,  -7.62081065e-04,   1.04186265e-03,
         3.76356230e-03,  -3.64764337e-03,   3.20070377e-03,
        -2.40207091e-03,

Missing:
- words similiar/associated with label
    - politican
    - party
- label most similar to 
    - word or 
    - sentence
    
https://github.com/RaRe-Technologies/gensim/issues/1397
    
Open:
- rewriting training that it does not loop; searching for best handling of doc2vec training
https://stackoverflow.com/questions/47890052/improving-gensim-doc2vec-results
https://groups.google.com/forum/#!topic/gensim/2iT8p8m-wS0

For best practice and further ideas:
- https://stackoverflow.com/questions/tagged/doc2vec (last questions tagged with doc2vec)
- Clustering?
- Example for infer_vector (sentences, words)
- n_similarity for two docsets since it contains more than one doc
- visualisation:
    - https://jlorince.github.io/viz-tutorial/
- pretrained word vectors?

In [None]:
for party in ['SPD', 'CDU', 'DIE LINKE', 'AfD', 'CSU', 'GRÜNE', 'FDP']:
    sim = simple_models[2].docvecs.n_similarity('Martin Schulz', party)
    print('Similarity', sim, 'with', party)