In [2]:
import pandas as pd
import numpy as np
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
import logging
import multiprocessing
import os
from collections import namedtuple

# Layout of each log record and of its timestamp.
FORMAT = '%(asctime)s %(levelname)s %(message)s'
DATEFORMAT = '%Y-%m-%d %H:%M:%S'
# The root logger stays at WARNING so that INFO chatter from imported
# libraries is not printed into the notebook.
logging.basicConfig(level=logging.WARNING,
                    format=FORMAT,
                    datefmt=DATEFORMAT)
logger = logging.getLogger(__name__)
# Without an explicit level this logger inherits WARNING from the root and
# every logger.info(...) call in the notebook is silently discarded.
logger.setLevel(logging.INFO)



In [3]:
# Resolve the project folders relative to the notebook's working directory:
# the project root is the parent of the current directory, and the data and
# model folders live directly below it.
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, os.pardir))
data_dir, models_dir = (os.path.join(parent_dir, folder) for folder in ('data', 'models'))

# Echo the resolved locations so a wrong start directory is spotted early.
for label, location in (('working directory: ', working_dir),
                        ('data directory:    ', data_dir),
                        ('models directory:  ', models_dir)):
    print(label, location)

working directory:  C:\Users\Joni\Dropbox (Privat)\CODE\InformationSystemsWS1718\notebooks
data directory:     C:\Users\Joni\Dropbox (Privat)\CODE\InformationSystemsWS1718\data
models directory:   C:\Users\Joni\Dropbox (Privat)\CODE\InformationSystemsWS1718\models


In [4]:
# Load the cleaned post table from the data directory.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that were produced by this project.
pickle_path = os.path.join(data_dir, 'data_clean_4cols.pickle')
data = pd.read_pickle(pickle_path)
data.head()

Unnamed: 0,id,from_name,message,Partei_ABK
1,1586699104972808_1625886087720776,Valentin Abel,Gerade einmal 9.000 Anträge auf die Kaufprämie...,FDP
2,1586699104972808_1626084567700928,Valentin Abel,"""Die Liberalen stehen für einen individualisti...",FDP
3,1586699104972808_1627233684252683,Valentin Abel,Wir sind bereit für 2017 - machen wir Deutschl...,FDP
4,1586699104972808_1627428260899892,Valentin Abel,Kann ein Jahr besser beginnen als mit einem gu...,FDP
5,1586699104972808_1628833480759370,Valentin Abel,Platz 12 im diesjährigen #Landkreis-Ranking vo...,FDP


In [6]:
# Build a per-author lookup table: drop the post-level columns, keep one row
# per `from_name` (the first one encountered) and index the result by name.
author_rows = data.drop(['id', 'message'], axis=1)
candidate_data = author_rows.drop_duplicates(subset='from_name').set_index('from_name')
candidate_data.head()

Unnamed: 0_level_0,Partei_ABK
from_name,Unnamed: 1_level_1
Valentin Abel,FDP
Dr. Michael von Abercron,CDU
Grigorios Aggelidis,FDP
Diyar Agu,DIE LINKE
Gökay Akbulut DIE LINKE,DIE LINKE


In [7]:
# Work on the full dataset; use data.sample(n=10000) instead for a quick run
# on a random subset.
sample = data

In [8]:
# Tokenizer that keeps runs of word characters and drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
# NOTE: the German stop-word set is built here but is not applied to the
# tokens below.
stopword_set = set(stopwords.words('german'))
# One training document: its tokens, the tags it contributes to, and a split label.
MessageDoc = namedtuple('MessageDoc', 'words tags split')

# The split label is chosen by bucketing the row's index label in blocks of
# SPLIT_BUCKET_SIZE: bucket 0 -> 'train', bucket 1 -> 'test', later -> 'extra'.
SPLIT_BUCKET_SIZE = 200000
SPLIT_NAMES = ['train', 'test', 'extra', 'extra']

alldocs = []  # Will hold all docs in original order
# Iterate the needed columns directly; this avoids building a Series per row
# as DataFrame.iterrows() does, which is slow on large frames.
for line_no, raw_message, from_name, party_abk in zip(sample.index,
                                                      sample['message'],
                                                      sample['from_name'],
                                                      sample['Partei_ABK']):
    words = tokenizer.tokenize(raw_message.lower())
    # Every message is tagged with its own row label, its author and the
    # author's party, so a vector is learned for each of the three.
    tags = [line_no, from_name, party_abk]
    # Clamp to the last bucket so very large index labels cannot raise IndexError.
    bucket = min(line_no // SPLIT_BUCKET_SIZE, len(SPLIT_NAMES) - 1)
    split = SPLIT_NAMES[bucket]
    alldocs.append(MessageDoc(words, tags, split))

In [9]:
# Sanity check: number of MessageDoc entries built from `sample`.
len(alldocs)

177307

In [10]:
# Number of CPU cores reported for this machine.
cores = multiprocessing.cpu_count()
# Refuse to continue unless gensim reports a FAST_VERSION above -1.
fast_version = gensim.models.doc2vec.FAST_VERSION
assert fast_version > -1, "This will be painfully slow otherwise"

In [11]:
# Doc2Vec with dm=0, 100-dimensional vectors, ignoring words seen fewer than
# 5 times. `workers=cores` uses the core count computed in the previous cell,
# which was otherwise never passed on (gensim would use its default).
model = gensim.models.Doc2Vec(dm=0, size=100, min_count=5, workers=cores)
# Scan the corpus once to build the vocabulary and register all document tags.
model.build_vocab(alldocs)
# Two passes over the corpus with the learning rate going from 0.025 to 0.001.
model.train(alldocs, total_examples=model.corpus_count, epochs=2, start_alpha=0.025, end_alpha=0.001)

14314662

In [None]:
# Create the target folder first so the save cannot fail on a missing directory.
os.makedirs(models_dir, exist_ok=True)
model.save(os.path.join(models_dir, 'doc2vec_c.model'))
# This message is only printed when `logger` is enabled for INFO. The root
# logger is configured at WARNING, so unless `logger` has its own level set to
# INFO (logger.setLevel(logging.INFO)) the record is discarded silently.
logger.info('model saved')

In [13]:
# Short aliases for the trained word vectors and document (tag) vectors.
word_vecs, doc_vecs = model.wv, model.docvecs

In [17]:
# Tags of the party-level document vectors each candidate is compared with.
# NOTE(review): assumes these exact tags exist in model.docvecs - confirm
# against the tags assigned when the documents were built.
party_tags = ['SPD Party', 'CDU Party', 'DIE LINKE Party', 'AfD Party',
              'CSU Party', 'GRÜNE Party', 'FDP Party']
for party in party_tags:
    def similarity_to_party(candidate, party_tag=party):
        """Return the docvec similarity between `candidate` and `party_tag`."""
        return model.docvecs.similarity(candidate, party_tag)

    # One new column per party tag, filled from the candidate names in the index.
    candidate_data[party] = candidate_data.index.map(similarity_to_party)

In [24]:
# Preview the first rows of candidate_data.
candidate_data.head()

Unnamed: 0_level_0,Partei_ABK,SPD Party,CDU Party,DIE LINKE Party,AfD Party,CSU Party,GRÜNE Party,FDP Party,most similar
from_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Valentin Abel,FDP,0.374163,0.366248,0.377242,0.226058,0.384117,0.488097,0.813412,FDP Party
Dr. Michael von Abercron,CDU,0.490099,0.722058,0.373639,0.245865,0.387747,0.457792,0.418815,CDU Party
Grigorios Aggelidis,FDP,0.348931,0.356978,0.351473,0.270891,0.288008,0.400288,0.530926,FDP Party
Diyar Agu,DIE LINKE,0.43967,0.435868,0.792577,0.199063,0.142829,0.38199,0.403513,DIE LINKE Party
Gökay Akbulut DIE LINKE,DIE LINKE,0.537304,0.47593,0.679563,0.292301,0.461141,0.631102,0.444224,DIE LINKE Party


In [23]:
# Take the argmax over the similarity columns only. Selecting them by name
# (instead of "everything after the first column") keeps this cell re-runnable:
# once 'most similar' exists it must not be fed back into idxmax.
party_columns = [column for column in candidate_data.columns
                 if str(column).endswith(' Party')]
candidate_data['most similar'] = candidate_data[party_columns].idxmax(axis=1)

In [25]:
# Contingency table: rows are the candidates' actual party, columns the party
# tag whose vector is most similar to the candidate's vector.
most_similar_candidates = pd.crosstab(index=candidate_data['Partei_ABK'],
                                      columns=candidate_data['most similar'])

In [26]:
# Display the cross-tabulation of actual party vs. most similar party tag.
most_similar_candidates

most similar,AfD Party,CDU Party,CSU Party,DIE LINKE Party,FDP Party,GRÜNE Party,SPD Party
Partei_ABK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AfD,119,0,1,1,1,1,1
CDU,2,185,7,2,1,8,4
CSU,0,1,45,0,0,0,0
DIE LINKE,1,0,0,110,0,0,0
FDP,2,0,0,1,164,6,0
GRÜNE,0,0,0,1,0,93,1
SPD,0,3,15,11,2,8,211
