In [1]:
#%matplotlib inline
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder

import theano
import theano.tensor as T

### Reading inputs

In [63]:
answer_cols = ["Ans_"+str(i+1) for i in range(30)]
N = len(answer_cols)   # Number of questions
L = 6                # Number of possible answers

## Reading user info
U = pd.read_csv('input_to_algorithm.csv', index_col=0, low_memory=False)
M = len(U) # Number of users
assert(U.shape[1],N)
print "Users input size (M x N): ", U.shape

ans_enc = OneHotEncoder(sparse=False)
U = ans_enc.fit_transform(U)
U = U.reshape(M, N, L)
print "One-hot (M x N x L):", U.shape, "\n"

## Reading voting intention
V = pd.read_csv('voting_intention.csv', header=None,index_col=0)
assert(len(V), M)
print "Voting intention input size (M):", len(V)

party_enc = OneHotEncoder(sparse=False)
V = party_enc.fit_transform(V)
K = V.shape[1] # Number of parties
print "One-hot (M x K)", V.shape, "\n"

## Reading party info
P = pd.read_csv('es_party_XYZ.csv', delim_whitespace=True)
P = P[answer_cols]
P[P==99] = 6
assert(K, len(P))
print "Parties input size (K x N): ", P.shape

P = ans_enc.transform(P)
P = P.reshape(len(P), len(answer_cols), L)
print "One-hot (K x N x L):", P.shape

Users input size (M x N):  (99821, 30)
One-hot (M x N x 6): (99821, 30, 6) 

Voting intention input size (M): 99821
One-hot (M x K) (99821, 10) 

Parties input size (K x N):  (10, 30)
One-hot (K x N x 6): (10, 30, 6)


### Building model

In [30]:
# Build the symbolic user-vs-party score
#   s[m, k] = sum_j w[j] * (u[m, j, :] . D[:, j, :] . p[k, j, :])
# N and L are repeated from the input cell so this cell can run on its own;
# keep them in sync with the values used when encoding the data.
N = 30   # Number of questions
L = 6    # Number of possible answers

# Symbolic variables
u = T.dtensor3("u")   # intended for the users' one-hot answers, (M x N x L)
p = T.dtensor3("p")   # intended for the parties' one-hot answers, (K x N x L)
# NOTE(review): the encoded voting intention is a (M x K) matrix, so a dmatrix
# may be what is wanted here; left as declared -- TODO confirm.
v = T.dtensor3("v")

rng = np.random

# Define learnable parameters (with random initialization)
N_TIED = 13  # 13 independent weights in the bi-symmetrical distance matrix
D_flat = rng.randn(N, N_TIED)

# TIE_INDEX[a, b] is the position, within a question's 13-vector, of the weight
# that fills cell (a, b) of that question's L x L distance matrix.
TIE_INDEX = np.array([[0,  1,  3,  6,  8,  9],
                      [1,  2,  4,  7,  6, 10],
                      [3,  4,  5,  4,  3, 11],
                      [6,  7,  4,  2,  1, 10],
                      [8,  6,  3,  1,  0,  9],
                      [9, 10, 11, 10,  9, 12]])

D_init = np.empty([L, N, L])
for j in range(N):
    D_init[:, j, :] = D_flat[j][TIE_INDEX]

# NOTE(review): the weight tying is only applied to the initial value; D is a
# free (L x N x L) shared tensor, so element-wise updates would not preserve it.
D = theano.shared(D_init, name="D")
w = theano.shared(rng.randn(N), name="w")

# Compute distance scores (batched over the question axis)
u_by_question = u.dimshuffle((1,0,2))             # (N, M, L)
D_by_question = D.dimshuffle((1,0,2))             # (N, L, L)
uD = T.batched_dot(u_by_question, D_by_question)  # (N, M, L)
# p has to be laid out as (N, L, K) so each per-question product is
# (M x L) . (L x K); a (N, K, L) layout only multiplies when K == L.
s = T.batched_dot(uD, p.dimshuffle((1,2,0)))      # (N, M, K)

# Aggregate issues with weights
s = T.tensordot(s, w, axes=[[0],[0]])             # (M, K)

In [None]:
# Scratch cell: logistic-regression example in the style of the Theano
# tutorial, trained on random data. It is independent of the model above, so
# its dataset and weights use their own names and leave the model's `D`, `w`
# and `N` untouched.
rng = np.random

n_samples = 400   # training sample size (value used in the Theano tutorial)
feats = 784       # number of input variables (value used in the Theano tutorial)

# generate a dataset: toy_data = (input_values, target_class)
toy_data = (rng.randn(n_samples, feats),
            rng.randint(size=n_samples, low=0, high=2))
training_steps = 10000

# Declare Theano symbolic variables
x = T.dmatrix("x")
y = T.dvector("y")

# initialize the weight vector randomly
#
# this and the following bias variable b
# are shared so they keep their values
# between training iterations (updates)
w_lr = theano.shared(rng.randn(feats), name="w")

# initialize the bias term
b = theano.shared(0., name="b")

print("Initial model:")
print(w_lr.get_value())
print(b.get_value())

# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w_lr) - b))     # Probability that target = 1
prediction = p_1 > 0.5                         # The prediction thresholded
xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1)  # Cross-entropy loss function
cost = xent.mean() + 0.01 * (w_lr ** 2).sum()  # The cost to minimize (L2-regularized)
gw, gb = T.grad(cost, [w_lr, b])               # Gradient of the cost w.r.t. the
                                               # weight vector and the bias term

# Compile
# Each call to `train` performs one gradient-descent step (learning rate 0.1)
# through the `updates` on the shared variables.
train = theano.function(
          inputs=[x,y],
          outputs=[prediction, xent],
          updates=((w_lr, w_lr - 0.1 * gw), (b, b - 0.1 * gb)))
predict = theano.function(inputs=[x], outputs=prediction)

# Train
for i in range(training_steps):
    pred, err = train(toy_data[0], toy_data[1])

print("Final model:")
print(w_lr.get_value())
print(b.get_value())
print("target values for D:")
print(toy_data[1])
print("prediction on D:")
print(predict(toy_data[0]))