In [144]:
import pandas as pd
import numpy as np
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
import logging
import multiprocessing
import os
from collections import namedtuple

FORMAT = '%(asctime)s %(levelname)s %(message)s'
DATEFORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.WARNING,
                    format=FORMAT,
                    datefmt=DATEFORMAT)
logger = logging.getLogger(__name__)

In [145]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(parent_dir, 'data')
models_dir = os.path.join(parent_dir, 'models')
print('working directory: ', os.getcwd())
print('data directory:    ', data_dir, )
print('models directory:  ', models_dir)

working directory:  /Volumes/Datahouse/Users/Stipe/Documents/Studium/Master VWL/5 WS 2017/Seminar Information Systems/InformationSystemsWS1718/notebooks
data directory:     /Volumes/Datahouse/Users/Stipe/Documents/Studium/Master VWL/5 WS 2017/Seminar Information Systems/InformationSystemsWS1718/data
models directory:   /Volumes/Datahouse/Users/Stipe/Documents/Studium/Master VWL/5 WS 2017/Seminar Information Systems/InformationSystemsWS1718/models


In [146]:
data = pd.read_pickle(os.path.join(data_dir, 'data_clean_4cols.pickle'))
data

Unnamed: 0,id,from_name,message,Partei_ABK
0,1586699104972808_1625886087720776,Valentin Abel,Gerade einmal 9.000 Anträge auf die Kaufprämie...,FDP
1,1586699104972808_1626084567700928,Valentin Abel,"""Die Liberalen stehen für einen individualisti...",FDP
2,1586699104972808_1627233684252683,Valentin Abel,Wir sind bereit für 2017 - machen wir Deutschl...,FDP
3,1586699104972808_1627428260899892,Valentin Abel,Kann ein Jahr besser beginnen als mit einem gu...,FDP
4,1586699104972808_1628833480759370,Valentin Abel,Platz 12 im diesjährigen #Landkreis-Ranking vo...,FDP
5,1586699104972808_1629446607364724,Valentin Abel,"In Zeiten, in denen die Welt aus den Fugen zu ...",FDP
6,1586699104972808_1629559267353458,Valentin Abel,"Fällt euch spontan etwas ein, auf das sich 83%...",FDP
7,1586699104972808_1630578820584836,Valentin Abel,"Die #AfD missbraucht Sophie #Scholl, Widerstan...",FDP
8,1586699104972808_1632129530429765,Valentin Abel,Die #Legalisierung von #Cannabis kann nur ein ...,FDP
9,1586699104972808_1636145450028173,Valentin Abel,Bereit für 2017 – anders lässt sich die Stimmu...,FDP


In [147]:
candidate_data = (data.drop(['id', 'message'], axis=1)
                      .drop_duplicates('from_name')
#                      .set_index('from_name')
                 )
candidate_data.tail(10)

Unnamed: 0,from_name,Partei_ABK
173042,Dr. Daniela De Ridder,SPD
173559,Björn Simon,CDU
173902,Waldemar Westermayer,CDU
173963,AfD Party,AfD
174481,CDU Party,CDU
174974,SPD Party,SPD
175488,CSU Party,CSU
176078,GRÜNE Party,GRÜNE
176425,FDP Party,FDP
176969,DIE LINKE Party,DIE LINKE


In [150]:
for char in ['SPD', 'spd', 'FDP', 'fdp', 'CDU' 'cdu', 'AfD' 'afd', 'AFD', 'Grüne', 'GRÜNE', 'Die Grünen', 'GRÜNEN', 'Linke', 'LINKE', 'CSU', 'csu', 'Die Linke', 'DIE LINKE',]:
        data.message = data.message.str.replace(char, '')

In [162]:
any(data.message.str.count('FDP')>0)

True

In [163]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('german'))
MessageDoc = namedtuple('MessageDoc', 'words tags split')
alldocs = []  # Will hold all doacs in original order
for line_no, line in data.iterrows():
    #import pdb; pdb.set_trace()
    message = line.message.lower()
    words = tokenizer.tokenize(message)
    tags = [str(line_no), line['from_name'], line['Partei_ABK']] # line_no needs to be converted as string to be included in tags 
    split = ['train', 'test', 'extra', 'extra'][line_no//200000]  # 25k train, 25k test, 25k extra
    alldocs.append(MessageDoc(words, tags, split))

In [164]:
tags

['177306', 'DIE LINKE Party', 'DIE LINKE']

In [165]:
len(alldocs)

177307

In [166]:
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

In [167]:
model = gensim.models.Doc2Vec(dm=0, size=100, min_count=5)
model.build_vocab(alldocs)
model.train(alldocs, total_examples=model.corpus_count, epochs=2, start_alpha=0.025, end_alpha=0.001)

14211384

In [168]:
# saving the model doesn't work. Why?
#model.save(os.path.join(models_dir, 'doc2vec_c.model'))
#logger.info('model saved')

In [169]:
word_vecs = model.wv
doc_vecs = model.docvecs

In [170]:
#doc_vecs.doctags['FDP']
#word_vecs.vocab

In [171]:
candidate_data.tail(10)

Unnamed: 0,from_name,Partei_ABK
173042,Dr. Daniela De Ridder,SPD
173559,Björn Simon,CDU
173902,Waldemar Westermayer,CDU
173963,AfD Party,AfD
174481,CDU Party,CDU
174974,SPD Party,SPD
175488,CSU Party,CSU
176078,GRÜNE Party,GRÜNE
176425,FDP Party,FDP
176969,DIE LINKE Party,DIE LINKE


In [172]:
# calculate similarity for all candidates and parties
for party in ['SPD Party', 'CDU Party', 'DIE LINKE Party', 'AfD Party', 'CSU Party', 'GRÜNE Party', 'FDP Party']:
    candidate_data[party] = candidate_data['from_name'].map(lambda candidate: model.docvecs.similarity(candidate, party))
    
# make a new column holding which party is most similar
candidate_data['most similar'] = candidate_data.loc[:,'SPD Party':].idxmax(axis=1)
candidate_data.head()

Unnamed: 0,from_name,Partei_ABK,SPD Party,CDU Party,DIE LINKE Party,AfD Party,CSU Party,GRÜNE Party,FDP Party,most similar
0,Valentin Abel,FDP,0.396917,0.381115,0.341265,0.156636,0.37012,0.448217,0.77501,FDP Party
93,Dr. Michael von Abercron,CDU,0.490198,0.743707,0.401907,0.242113,0.394536,0.437399,0.424381,CDU Party
168,Grigorios Aggelidis,FDP,0.441186,0.411392,0.415657,0.315556,0.390019,0.429746,0.644765,FDP Party
215,Diyar Agu,DIE LINKE,0.469892,0.544633,0.782156,0.221293,0.25582,0.405342,0.452726,DIE LINKE Party
269,Gökay Akbulut DIE LINKE,DIE LINKE,0.560917,0.473694,0.671338,0.341996,0.498135,0.646848,0.460541,DIE LINKE Party


In [173]:
candidate_data.shape

(1008, 10)

In [174]:
most_similar_candidates = pd.crosstab(candidate_data['Partei_ABK'], candidate_data['most similar'])

In [175]:
most_similar_candidates

most similar,AfD Party,CDU Party,CSU Party,DIE LINKE Party,FDP Party,GRÜNE Party,SPD Party
Partei_ABK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AfD,122,0,1,0,0,1,0
CDU,1,184,6,2,2,8,6
CSU,0,0,46,0,0,0,0
DIE LINKE,0,0,0,110,0,0,1
FDP,2,1,0,5,158,4,3
GRÜNE,1,0,0,7,0,86,1
SPD,0,2,19,10,0,9,210


In [176]:
#sums = pd.crosstab(candidate_data['Partei_ABK'], candidate_data['most similar'])

In [177]:
model.docvecs.count

178322

In [178]:
# filter out the candidate data
mask = [tag in candidate_data['from_name'].values for tag in model.docvecs.offset2doctag]
candidate_vecs = model.docvecs.doctag_syn0[mask]

In [179]:
len(candidate_vecs)

1008

In [180]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

X = candidate_vecs
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

[Colors in Matplotlib](https://matplotlib.org/api/colors_api.html)

[Color names from xkcd](https://xkcd.com/color/rgb/)

In [181]:
party_colors = {'AfD': 'rgb(0, 0, 51)',
                'DIE LINKE': 'rgb(204, 0, 102)',
                'GRÜNE': 'rgb(0, 153, 0)',
                'CSU': 'rgb(102, 178, 255)',
                'CDU': 'rgb(0, 0, 0)',
                'FDP': 'rgb(255, 255, 51)',
                'SPD': 'rgb(255, 0, 0)'}
candidate_data['color'] = candidate_data['Partei_ABK'].map(party_colors)

In [182]:
#plt.figure(num=None, figsize=(10, 8))  # set the figure size
#plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=candidate_data['color'])
#plt.show()

In [183]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

In [184]:
trace = go.Scatter(x=X_tsne[:, 0], y=X_tsne[:, 1],
                   mode='markers', 
                   marker=dict(color=candidate_data['color'], 
                               colorscale=cmap,
                               showscale=False,
                               line=dict(color='black', width=1)),
                  text=candidate_data['from_name'])
#data.append(trace)
#titles.append("t-SNE (%.2g sec)" % (t1 - t0))

In [185]:
layout=dict(margin=dict(l=10, r=10,
                        t=30, b=10)
           )

fig = go.Figure(data=[trace], layout=layout)

In [186]:
py.iplot(fig)