# Set up

In [1]:
import pandas as pd
import numpy as np
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
import logging
import multiprocessing
import os
from collections import namedtuple
import seaborn as sns

FORMAT = '%(asctime)s %(levelname)s %(message)s'
DATEFORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.WARNING,
                    format=FORMAT,
                    datefmt=DATEFORMAT)
logger = logging.getLogger(__name__)



In [2]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(parent_dir, 'data')
models_dir = os.path.join(parent_dir, 'models')
print('working directory: ', os.getcwd())
print('data directory:    ', data_dir, )
print('models directory:  ', models_dir)

working directory:  C:\Users\Joni\Dropbox (Privat)\CODE\InformationSystemsWS1718\notebooks
data directory:     C:\Users\Joni\Dropbox (Privat)\CODE\InformationSystemsWS1718\data
models directory:   C:\Users\Joni\Dropbox (Privat)\CODE\InformationSystemsWS1718\models


# Data preparation

In [None]:
data = pd.read_pickle(os.path.join(data_dir, 'data_clean_4cols.pickle'))
data

In [4]:
candidate_data = (data.drop(['id', 'message'], axis=1)
                      .drop_duplicates('from_name')
#                      .set_index('from_name')
                 )
candidate_data.tail(10)

Unnamed: 0,from_name,Partei_ABK
173042,Dr. Daniela De Ridder,SPD
173559,Björn Simon,CDU
173902,Waldemar Westermayer,CDU
173963,AfD Party,AfD
174481,CDU Party,CDU
174974,SPD Party,SPD
175488,CSU Party,CSU
176078,GRÜNE Party,GRÜNE
176425,FDP Party,FDP
176969,DIE LINKE Party,DIE LINKE


In [5]:
for char in ['SPD', 'spd', 'FDP', 'fdp', 'CDU' 'cdu', 'AfD' 'afd', 'AFD', 'Grüne', 'GRÜNE', 'Die Grünen', 'GRÜNEN', 'Linke', 'LINKE', 'CSU', 'csu', 'Die Linke', 'DIE LINKE',]:
        data.message = data.message.str.replace(char, '')

In [6]:
any(data.message.str.count('FDP')>0)

False

In [7]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('german'))
MessageDoc = namedtuple('MessageDoc', 'words tags split')
alldocs = []  # Will hold all doacs in original order
for line_no, line in data.iterrows():
    #import pdb; pdb.set_trace()
    message = line.message.lower()
    words = tokenizer.tokenize(message)
    tags = [str(line_no), line['from_name'], line['Partei_ABK']] # line_no needs to be converted as string to be included in tags 
    split = ['train', 'test', 'extra', 'extra'][line_no//200000]  # 25k train, 25k test, 25k extra
    alldocs.append(MessageDoc(words, tags, split))

In [9]:
len(alldocs)

177307

# Model training

In [10]:
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

In [11]:
model = gensim.models.Doc2Vec(dm=0, size=100, min_count=5)
model.build_vocab(alldocs)
model.train(alldocs, total_examples=model.corpus_count, epochs=2, start_alpha=0.025, end_alpha=0.001)

14213685

In [12]:
# saving the model doesn't work. Why?
#model.save(os.path.join(models_dir, 'doc2vec_c.model'))
#logger.info('model saved')

# Assessing similarity between candidate and parties

In [13]:
word_vecs = model.wv
doc_vecs = model.docvecs

In [14]:
#doc_vecs.doctags['FDP']
#word_vecs.vocab

In [15]:
candidate_data.tail(10)

Unnamed: 0,from_name,Partei_ABK
173042,Dr. Daniela De Ridder,SPD
173559,Björn Simon,CDU
173902,Waldemar Westermayer,CDU
173963,AfD Party,AfD
174481,CDU Party,CDU
174974,SPD Party,SPD
175488,CSU Party,CSU
176078,GRÜNE Party,GRÜNE
176425,FDP Party,FDP
176969,DIE LINKE Party,DIE LINKE


In [16]:
# calculate similarity for all candidates and parties
for party in ['SPD Party', 'CDU Party', 'DIE LINKE Party', 'AfD Party', 'CSU Party', 'GRÜNE Party', 'FDP Party']:
    candidate_data[party] = candidate_data['from_name'].map(lambda candidate: model.docvecs.similarity(candidate, party))
    
# make a new column holding which party is most similar
candidate_data['most similar'] = candidate_data.loc[:,'SPD Party':].idxmax(axis=1)
candidate_data.head()

Unnamed: 0,from_name,Partei_ABK,SPD Party,CDU Party,DIE LINKE Party,AfD Party,CSU Party,GRÜNE Party,FDP Party,most similar
0,Valentin Abel,FDP,0.378756,0.359881,0.350436,0.213246,0.391339,0.451685,0.810069,FDP Party
93,Dr. Michael von Abercron,CDU,0.454485,0.712649,0.353469,0.30043,0.412321,0.429839,0.404117,CDU Party
168,Grigorios Aggelidis,FDP,0.43042,0.373238,0.376819,0.307115,0.386823,0.418811,0.623445,FDP Party
215,Diyar Agu,DIE LINKE,0.416359,0.490433,0.785106,0.235333,0.217756,0.375802,0.477791,DIE LINKE Party
269,Gökay Akbulut DIE LINKE,DIE LINKE,0.55421,0.439245,0.662877,0.322512,0.43006,0.615543,0.49411,DIE LINKE Party


In [17]:
candidate_data.shape

(1008, 10)

In [18]:
most_similar_candidates = pd.crosstab(candidate_data['Partei_ABK'], candidate_data['most similar'])

In [19]:
most_similar_candidates

most similar,AfD Party,CDU Party,CSU Party,DIE LINKE Party,FDP Party,GRÜNE Party,SPD Party
Partei_ABK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AfD,121,0,1,1,0,1,0
CDU,5,179,9,4,2,6,4
CSU,0,0,46,0,0,0,0
DIE LINKE,0,0,0,111,0,0,0
FDP,3,0,1,3,162,3,1
GRÜNE,0,0,0,6,1,87,1
SPD,2,3,18,13,1,7,206


In [59]:
# calculate average similarity of party candidates
candidate_data2 = (candidate_data
                   .set_index(['Partei_ABK', 'from_name'])
                   .drop(columns='most similar')
                   .rename_axis('party_similarity', axis='columns')
                   .sort_index()
                  )

candidate_data3 = candidate_data2.stack().reset_index()

In [None]:
candidate_data2.head()

In [None]:
candidate_data3.head()

In [100]:
average_similarity = pd.pivot_table(data=candidate_data3,
                                    index='Partei_ABK',
                                    columns='party_similarity',
                                    values=0,
                                    aggfunc='mean'
                                    ).round(decimals=2)

In [99]:
import seaborn as sns
# inserting columns of zeros and ones to align the styling in the next step
# (otherwise, the colours would not mean the same in each row)
average_similarity_styled = average_similarity.copy()
average_similarity_styled['0'] = 0.0
average_similarity_styled['1'] = 1.0
cm = sns.light_palette("blue", as_cmap=True)
average_similarity_styled = average_similarity_styled.style.background_gradient(cmap=cm, axis=1)
average_similarity_styled

party_similarity,AfD Party,CDU Party,CSU Party,DIE LINKE Party,FDP Party,GRÜNE Party,SPD Party,0,1
Partei_ABK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AfD,0.65,0.31,0.37,0.35,0.32,0.38,0.33,0,1
CDU,0.24,0.49,0.33,0.23,0.27,0.3,0.3,0,1
CSU,0.26,0.37,0.73,0.26,0.33,0.32,0.29,0,1
DIE LINKE,0.26,0.27,0.24,0.7,0.31,0.39,0.37,0,1
FDP,0.3,0.4,0.39,0.4,0.68,0.4,0.38,0,1
GRÜNE,0.24,0.3,0.31,0.47,0.34,0.68,0.38,0,1
SPD,0.22,0.29,0.28,0.34,0.25,0.34,0.52,0,1


In [96]:
# computing the mean of diagonal elements
average_diagonal = np.trace(average_similarity) / 7
average_diagonal

0.6357142857142859

In [97]:
# computing the mean of off-diagonal elements
average_off_diagonal = (average_similarity.values.sum() - average_diagonal * 7) / 42
average_off_diagonal

0.31976190476190469

In [98]:
#the difference can be seen as a performance metric of the model 
average_diff = average_diagonal - average_off_diagonal
average_diff

0.31595238095238121

# Vizualisation of candiate vectors

In [21]:
model.docvecs.count

178322

In [101]:
# filter out the candidate data
mask = [tag in candidate_data['from_name'].values for tag in model.docvecs.offset2doctag]  # this is a "list comprehension"
candidate_vecs = model.docvecs.doctag_syn0[mask]

In [102]:
len(candidate_vecs)

1008

In [103]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

X = candidate_vecs
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

[Colors in Matplotlib](https://matplotlib.org/api/colors_api.html)

[Color names from xkcd](https://xkcd.com/color/rgb/)

In [104]:
party_colors = {'AfD': 'rgb(0, 0, 153)',
                'DIE LINKE': 'rgb(204, 0, 102)',
                'GRÜNE': 'rgb(0, 153, 0)',
                'CSU': 'rgb(102, 178, 255)',
                'CDU': 'rgb(0, 0, 0)',
                'FDP': 'rgb(255, 255, 51)',
                'SPD': 'rgb(255, 0, 0)'}
candidate_data['color'] = candidate_data['Partei_ABK'].map(party_colors)

In [26]:
#plt.figure(num=None, figsize=(10, 8))  # set the figure size
#plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=candidate_data['color'])
#plt.show()

In [105]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

In [110]:
trace = go.Scatter(x=X_tsne[:, 0], y=X_tsne[:, 1],
                   mode='markers', 
                   marker=dict(color=candidate_data['color'], 
                               colorscale='cmap',
                               showscale=False,
                               line=dict(color='black', width=1)),
                   text=candidate_data['from_name'])
#data.append(trace)
#titles.append("t-SNE (%.2g sec)" % (t1 - t0))

In [107]:
layout=dict(margin=dict(l=10, r=10,
                        t=30, b=10)
           )

fig = go.Figure(data=[trace], layout=layout)

In [None]:
py.iplot(fig)
# I guess I need a plotly account to execute this