# Set up

In [None]:
import pandas as pd
import numpy as np
import os
from random import randint
from nltk import RegexpTokenizer
from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import stopwords
import string
import scipy.stats

In [None]:
# Locate the project folders: `data` and `models` both live in the parent of
# the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir, models_dir = (os.path.join(parent_dir, 'data'),
                        os.path.join(parent_dir, 'models'))
print('working directory: ', os.getcwd())
print('data directory:    ', data_dir)
print('models directory:  ', models_dir)

In [None]:
# Widen pandas' column display so that long posts are shown in full instead
# of being truncated.  The fully-qualified option name is used because an
# abbreviated name stops resolving as soon as more than one option matches it.
pd.set_option('display.max_colwidth', 2000)

# Prepare data 

In [None]:
# read the data from disk
# NOTE(review): unpickling can execute code stored in the file, so this
# pickle must come from a trusted source.
data = pd.read_pickle(os.path.join(data_dir, 'data_clean_4cols_2.pickle'))

In [None]:
# Build one row per candidate (indexed by `from_name`): the party
# abbreviation plus all of the candidate's messages joined by single spaces.
candidate_data = (data[['Partei_ABK', 'from_name']].drop_duplicates('from_name')
                                                   .set_index('from_name'))
# The groupby result is indexed by `from_name` as well, so the assignment
# aligns the joined messages with the right candidate.
candidate_data['messages'] = data.groupby('from_name')['message'].apply(' '.join)
candidate_data.head(2)

In [None]:
# Keep a random subset of n candidates.  No seed is passed to sample(), so a
# different subset is drawn on every run.
n = 50
candidate_data = candidate_data.sample(n=n)

## Tokenization and cleaning

In [None]:
# inspect the ASCII punctuation characters provided by the standard library
string.punctuation

In [None]:
# inspect NLTK's German stop-word list
stopwords.words('german')

In [None]:
# define a function for cleaning and tokenization
def nlp_clean(messages):
    """Lower-case, tokenize and filter an iterable of message strings.

    Every message is tokenized with NLTK's ``TweetTokenizer``.  A token is
    dropped when it is a German stop word, is found in the punctuation
    string, starts with ``http`` or consists of digits only.

    Args:
        messages: Iterable of strings, one per document.

    Returns:
        A list holding one list of remaining tokens per message, in input
        order.
    """
    # Build the invariants once: fetching the stop-word list for every single
    # token and scanning it linearly dominated the runtime.
    stop_words = set(stopwords.words('german'))
    punctuation = string.punctuation + '„“‘´'
    tokenizer = TweetTokenizer()

    cleaned = []
    for message in messages:
        tokens = tokenizer.tokenize(message.lower())
        # `in punctuation` is a substring test: besides single punctuation
        # characters it also drops tokens that are a contiguous run of that
        # string (e.g. `()`), while a token such as `...` is kept.
        cleaned.append([token for token in tokens
                        if token not in stop_words
                        and token not in punctuation
                        and not token.startswith('http')
                        and not token.isdigit()])
    return cleaned

In [None]:
# one list of cleaned tokens per sampled candidate, in the row order of
# candidate_data
documents = nlp_clean(candidate_data['messages'])

In [None]:
# Choose a random document/candidate
# (randint includes both end points, hence the upper bound n-1)
i = randint(0, n-1)
i

In [None]:
# show the document
# `i` is a row position while the index holds candidate names, so the row
# has to be selected positionally with .iloc.
candidate_data['messages'].iloc[i]

In [None]:
# show the tokenized and cleaned document
documents[i]

# Build the vocabulary

In [None]:
# flatten the per-document token lists into one long list that holds every
# word occurrence of the corpus (duplicates included)
vocab = [
    token
    for doc_tokens in documents
    for token in doc_tokens
]

In [None]:
# keep only the unique words; sorting gives every word a position that is
# stable across runs (plain set iteration order is not)
vocab = sorted(set(vocab))

In [None]:
# m = size of the vocabulary
m = len(vocab)
m

# Construct term vectors

In [None]:
# position of the example word in the vocabulary
# (list.index raises ValueError if the word does not occur in vocab)
target_word = 'steuern'
idx = vocab.index(target_word)

In [None]:
# one-hot column vector (m x 1) with a single 1 at the target word's position
t = np.zeros((m, 1))
t[vocab.index(target_word), 0] = 1
t  # .shape

In [None]:
# show the entries around the 1; the start is clamped at 0 because a negative
# start would be counted from the end of the vector
t[max(idx - 5, 0):idx + 5]

In [None]:
window_size = 8
doc_words = documents[i]
# Pick a random centre position that leaves room for the window on both
# sides (randint raises ValueError when the document is too short for that).
middle = randint(window_size, len(doc_words) - window_size - 1)

# Collect the words of the window and one one-hot column vector (m x 1) per
# word; afterwards `t` is a list of these vectors.
window_words = []
t = []
for c in range(middle - window_size, middle + window_size):
    window_words.append(doc_words[c])
    word_idx = vocab.index(doc_words[c])
    tt = np.zeros((m, 1))
    tt[word_idx] = 1
    t.append(tt)
# Show the entries around the 1 in the last vector; the start is clamped at 0
# because a negative start would be counted from the end of the vector.
t[-1][max(word_idx - 5, 0):word_idx + 5]

# Feed-forward

In [None]:
# one-hot column vector (n x 1) that selects document i
d = np.zeros((n, 1))
d[i, 0] = 1
d.shape

In [None]:
# p = dimensions of document vectors (no. of features)
p = 100
# document embeddings, one column per document, initialised uniformly in [0, 1)
D = np.random.rand(p, n)
D.shape

In [None]:
# softmax weights, one row per vocabulary word, drawn from a standard normal
# distribution truncated to [-2, 2]
U = scipy.stats.truncnorm.rvs(-2, 2, loc=0, scale=1, size=(m, p))
U.shape

In [None]:
# embedding of the selected document (p x 1): multiplying by the one-hot
# vector d picks the matching column of D
e = np.array(D.dot(d), ndmin=2)
e.shape

In [None]:
# unnormalised scores (m x 1): one score per vocabulary word
k = np.array(U.dot(e), ndmin=2)
k  # .shape

In [None]:
def softmax(k):
    """Return the softmax of the scores in ``k``, normalised along axis 0.

    Args:
        k: Array-like of scores.  For a column vector (or a matrix whose
            columns are independent score sets) every column is normalised
            separately.

    Returns:
        A float array of the same shape as ``k`` whose entries along axis 0
        are positive and sum to 1.
    """
    k = np.asarray(k, dtype=float)
    if k.size == 0:
        # nothing to normalise; also avoids taking the max of an empty axis
        return np.exp(k)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing to inf (and the quotient from becoming NaN).
    shifted = k - np.max(k, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0)

In [None]:
# predicted probability of every vocabulary word for the selected document
t_hat = softmax(k)
t_hat.shape

In [None]:
# sanity check: the probabilities should add up to 1
np.sum(t_hat)

In [None]:
def cross_entropy_loss(t, t_hat):
    """Return the cross-entropy between target ``t`` and prediction ``t_hat``.

    Args:
        t: Target distribution as a column vector (one-hot in this notebook).
        t_hat: Predicted probabilities, same shape as ``t``.

    Returns:
        ``-t.T @ log(t_hat)``; a 1 x 1 array for column-vector inputs.
    """
    # Probabilities that underflowed to exactly 0 would give log(0) = -inf and
    # 0 * -inf = NaN for the non-target words, so clip them to the smallest
    # positive normal float before taking the log.
    safe_t_hat = np.clip(np.asarray(t_hat, dtype=float), np.finfo(float).tiny, None)
    return - np.dot(np.asarray(t).T, np.log(safe_t_hat))

In [None]:
# `t` is the list of one-hot vectors built for the context window, so the
# loss is evaluated for its first word (the same target the backpropagation
# cells use)
cross_entropy_loss(t[0], t_hat)  # [0][0]

# Backpropagation

In [None]:
# output-layer error for the first target vector in `t`:
# prediction minus target
errors_out = t_hat - t[0]
errors_out#.shape

In [None]:
# propagate the output error back through the softmax weights to the
# document-embedding layer
errors_middle = np.dot(U.T, errors_out)
errors_middle.shape

In [None]:
# define a learning rate (step size of the updates to U and D below)
alpha = 0.025

In [None]:
# update step for the softmax weights: subtract alpha times the outer
# product of the output error and the document embedding
U += - alpha * np.dot(errors_out, e.T)

In [None]:
# update step for the document embeddings; because d is one-hot, only the
# column of the selected document changes
D += - alpha * np.dot(errors_middle, d.T)

# Putting it together

In [None]:
# Hyper-parameters and freshly initialised weights for the training loop.
window_size = 8
alpha = 0.025  # learning rate
p = 100  # p = dimensions of document vectors (no. of features)
m = len(vocab)  # number of entries in the vocabulary

D = np.random.rand(p, n)  # matrix of document embeddings
U = scipy.stats.truncnorm.rvs(-2, 2, loc=0, scale=1, size=(m, p))  # matrix of softmax weights

In [None]:
# Train the document embeddings D and the softmax weights U: per epoch, every
# document contributes one update computed from one randomly placed window.

# Map every word to its first position in vocab (the position list.index
# would return) so that lookups inside the loop are O(1).
word_to_idx = {}
for j, word in enumerate(vocab):
    word_to_idx.setdefault(word, j)

epochs = 1
for epoch in range(epochs):
    for i in range(n):
        doc_words = documents[i]
        # randint() below needs a non-empty range, i.e. room for the window
        # on both sides of the centre word; skip documents that are too short.
        if len(doc_words) < 2 * window_size + 1:
            continue

        # Feed-forward
        d = np.zeros((n, 1))  # one-hot vector selecting document i
        d[i] = 1

        e = np.dot(D, d)      # embedding of document i (p x 1)
        k = np.dot(U, e)      # one score per vocabulary word (m x 1)
        t_hat = softmax(k)

        middle = randint(window_size, len(doc_words) - window_size - 1)

        # Backpropagation
        # Sum the output errors over all words of the window.
        errors_out = np.zeros((m, 1))
        for c in range(middle - window_size, middle + window_size):
            t = np.zeros((m, 1))
            t[word_to_idx[doc_words[c]]] = 1
            errors_out += t_hat - t

            if c == middle:
                print(cross_entropy_loss(t, t_hat))

        # Back-propagate the summed error once.  Doing this inside the loop
        # would add up the running partial sums and over-count early words.
        errors_middle = np.dot(U.T, errors_out)

        U += - alpha * np.dot(errors_out, e.T)
        D += - alpha * np.dot(errors_middle, d.T)
        
    

# Visualize the document/candidate vectors

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# t-SNE model that projects the document vectors onto 2 dimensions for plotting
tsne = TSNE(n_components=2)

In [None]:
# plot colour per party abbreviation (matplotlib xkcd colour names)
party_colors = {'AfD': 'xkcd:blue',
                'DIE LINKE': 'xkcd:magenta',
                'GRÜNE': 'xkcd:grass green',
                'CSU': 'xkcd:sky blue',
                'CDU': 'xkcd:black',
                'FDP': 'xkcd:goldenrod',
                'SPD': 'xkcd:red'}
# NOTE(review): map() yields NaN for any party missing from the dict, which
# matplotlib cannot use as a colour - confirm every party is covered.
candidate_data['color'] = candidate_data['Partei_ABK'].map(party_colors)

In [None]:
# D stores one document per column, while fit_transform expects one sample
# per row, so embed the transpose: this yields one 2-D point per candidate,
# matching the length of the colour column.
D_tsne = tsne.fit_transform(D.T)
plt.figure(num=None, figsize=(10, 8))  # set the figure size
plt.scatter(D_tsne[:, 0], D_tsne[:, 1], c=candidate_data['color'])
plt.show()

In [None]:
# Fit and plot the projection again.  D stores one document per column,
# while fit_transform expects one sample per row, so embed the transpose to
# get one 2-D point per candidate, matching the length of the colour column.
D_tsne = tsne.fit_transform(D.T)
plt.figure(num=None, figsize=(10, 8))  # set the figure size
plt.scatter(D_tsne[:, 0], D_tsne[:, 1], c=candidate_data['color'])
plt.show()