In [1]:
import pandas as pd
import numpy as np
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
import logging

FORMAT = '%(asctime)s %(levelname)s %(message)s'
DATEFORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.WARNING,
                    format=FORMAT,
                    datefmt=DATEFORMAT)



In [11]:
data = pd.read_pickle('data_clean_4cols.pickle')

In [15]:
candidate_data = pd.DataFrame()
candidate_data = (data
                  .drop(['id', 'message'], axis=1)
                  .drop_duplicates('from_name')
                  .set_index('from_name'))

In [33]:
# join all messages by the same candidate
candidate_data['messages'] = data.groupby('from_name')['message'].apply(' '.join)

In [None]:
#now create a list that contains the name of all the text file in your data #folder
docLabels = []
docLabels = [f for f in listdir(“PATH TO YOU DOCUMENT FOLDER”) if 
 f.endswith(‘.txt’)]
#create a list data that stores the content of all text files in order of their names in docLabels
data = []
for doc in docLabels:
  data.append(open(‘PATH TO YOU DOCUMENT FOLDER’ + doc).read())

In [19]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('german'))
#This function does all cleaning of data using two objects above
def nlp_clean(data):
    new_data = []
    for d in data:
        new_str = d.lower()
        dlist = tokenizer.tokenize(new_str)
        dlist = list(set(dlist).difference(stopword_set))
        new_data.append(dlist)
    return new_data

In [23]:
sample = candidate_data.sample(n=100)

In [25]:
cleaned = nlp_clean(sample['messages'])

In [26]:
cleaned[0]

['interview',
 'hart',
 'aobpt',
 'redaktionelle',
 'fdp',
 'zeit',
 'bescheiden',
 'gmbh',
 'gebracht',
 'gemacht',
 'nächsten',
 'wirtschafts',
 's',
 'https',
 'sache',
 'hase',
 'renommiertesten',
 'tv',
 'liberalismus',
 'rechtsstaat',
 'neue',
 'bundestagswahlkampf',
 'fokus',
 'nachbereitung',
 'vorangeht',
 'trendwenden',
 'lamentieren',
 'eingeladen',
 'zeilen',
 'bundes',
 'unmittelbar',
 'robert',
 'bildung',
 'freitag',
 'sollen',
 'zumindest',
 'manager',
 'dabei',
 'investieren',
 'sicher',
 'verbotspredigern',
 'zurück',
 'biesok',
 'meinung',
 'zurückgeben',
 'industrie',
 'wichtig',
 'fraktion',
 'julis',
 'einsetzten',
 'bürgern',
 'heute',
 'debatte',
 'wem',
 'informiert',
 'interessierten',
 'planerisch',
 'umsetzung',
 'bestehenden',
 'pointiert',
 'diskutieren',
 'meilenstein',
 'europaparlaments',
 'steuern',
 'hervorragend',
 'vormittäglichen',
 'nehmen',
 'darf',
 'kommentare',
 'mitglied',
 'freidemokraten',
 'regionalen',
 'veranstaltung',
 'gern',
 'wohin',

In [31]:
class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = list(labels_list)
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
              yield gensim.models.doc2vec.LabeledSentence(doc, [self.labels_list[idx]])

In [34]:
#iterator returned over all documents
labeled_docs = LabeledLineSentence(candidate_data['messages'], candidate_data.index)

In [51]:
model = gensim.models.Doc2Vec(size=300, min_count=0, alpha=0.025, min_alpha=0.025)
model.build_vocab(labeled_docs)
#training of model
for epoch in range(2):
    logger.info('iteration ' + str(epoch + 1))
    model.train(labeled_docs, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
    model.train(labeled_docs, total_examples=model.corpus_count, epochs=model.iter)
#saving the created model
model.save('doc2vec.model')
logger.info('model saved')

2017-12-10 23:59:35 INFO collecting all words and their counts
2017-12-10 23:59:35 INFO PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-12-10 23:59:47 INFO collected 1279 word types and 1001 unique tags from a corpus of 1001 examples and 58322224 words
2017-12-10 23:59:47 INFO Loading a fresh vocabulary
2017-12-10 23:59:47 INFO min_count=0 retains 1279 unique words (100% of original 1279, drops 0)
2017-12-10 23:59:47 INFO min_count=0 leaves 58322224 word corpus (100% of original 58322224, drops 0)
2017-12-10 23:59:47 INFO deleting the raw counts dictionary of 1279 items
2017-12-10 23:59:47 INFO sample=0.001 downsamples 39 most-common words
2017-12-10 23:59:47 INFO downsampling leaves estimated 14675888 word corpus (25.2% of prior 58322224)
2017-12-10 23:59:47 INFO estimated required memory for 1279 words and 300 dimensions: 5110500 bytes
2017-12-10 23:59:47 INFO resetting layer weights
2017-12-10 23:59:47 INFO iteration 1
2017-12-10 23:59:47 INFO training mo

2017-12-11 00:01:03 INFO PROGRESS: at 80.26% examples, 401704 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:01:04 INFO PROGRESS: at 81.32% examples, 401959 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:01:05 INFO PROGRESS: at 82.42% examples, 401760 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:01:06 INFO PROGRESS: at 83.52% examples, 401414 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:01:07 INFO PROGRESS: at 84.40% examples, 400042 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:01:08 INFO PROGRESS: at 85.41% examples, 399431 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:01:09 INFO PROGRESS: at 86.29% examples, 398497 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:01:10 INFO PROGRESS: at 86.93% examples, 396513 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:01:11 INFO PROGRESS: at 87.95% examples, 396352 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:01:12 INFO PROGRESS: at 88.91% examples, 395847 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:01:13 INFO PROGRESS: at 89.97% examp

2017-12-11 00:02:27 INFO PROGRESS: at 66.91% examples, 401643 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:02:28 INFO PROGRESS: at 67.69% examples, 399638 words/s, in_qsize 4, out_qsize 1
2017-12-11 00:02:29 INFO PROGRESS: at 68.61% examples, 398719 words/s, in_qsize 4, out_qsize 1
2017-12-11 00:02:30 INFO PROGRESS: at 69.63% examples, 398443 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:02:31 INFO PROGRESS: at 70.53% examples, 397605 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:02:32 INFO PROGRESS: at 71.33% examples, 395796 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:02:33 INFO PROGRESS: at 72.01% examples, 393520 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:02:34 INFO PROGRESS: at 72.91% examples, 392987 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:02:35 INFO PROGRESS: at 73.79% examples, 392414 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:02:36 INFO PROGRESS: at 74.75% examples, 392539 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:02:37 INFO PROGRESS: at 75.78% examp

2017-12-11 00:03:51 INFO PROGRESS: at 46.47% examples, 412083 words/s, in_qsize 6, out_qsize 2
2017-12-11 00:03:52 INFO PROGRESS: at 47.35% examples, 410302 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:03:53 INFO PROGRESS: at 48.61% examples, 411802 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:03:54 INFO PROGRESS: at 49.31% examples, 408538 words/s, in_qsize 6, out_qsize 1
2017-12-11 00:03:55 INFO PROGRESS: at 50.09% examples, 406009 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:03:56 INFO PROGRESS: at 51.43% examples, 407543 words/s, in_qsize 6, out_qsize 1
2017-12-11 00:03:57 INFO PROGRESS: at 52.49% examples, 407810 words/s, in_qsize 6, out_qsize 1
2017-12-11 00:03:58 INFO PROGRESS: at 53.57% examples, 408624 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:03:59 INFO PROGRESS: at 54.79% examples, 410595 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:04:00 INFO PROGRESS: at 55.52% examples, 407939 words/s, in_qsize 6, out_qsize 1
2017-12-11 00:04:01 INFO PROGRESS: at 56.58% examp

2017-12-11 00:05:13 INFO PROGRESS: at 33.23% examples, 352816 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:05:14 INFO PROGRESS: at 34.21% examples, 354154 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:05:15 INFO PROGRESS: at 35.24% examples, 355377 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:05:16 INFO PROGRESS: at 36.28% examples, 355810 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:05:17 INFO PROGRESS: at 37.00% examples, 354165 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:05:18 INFO PROGRESS: at 37.86% examples, 354166 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:05:19 INFO PROGRESS: at 38.48% examples, 351656 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:05:20 INFO PROGRESS: at 39.52% examples, 351733 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:05:21 INFO PROGRESS: at 40.44% examples, 351249 words/s, in_qsize 6, out_qsize 0
2017-12-11 00:05:22 INFO PROGRESS: at 41.26% examples, 350440 words/s, in_qsize 5, out_qsize 0
2017-12-11 00:05:23 INFO PROGRESS: at 42.00% examp