In [26]:
%load_ext autoreload
%autoreload 2

from raxutil.ml.dataHandler import *
from raxutil.ml.modelBuilder import *
from raxutil.ml.templateAnalysis import *

import cssutils
from functools import reduce
import os
from os.path import basename
import csv, ast
from glob import glob
from collections import Counter, defaultdict
from nltk.classify import MaxentClassifier 
import pandas as pd
import pymongo
from pymongo import MongoClient
from pymongo import TEXT

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# Connect to the local MongoDB instance and select the `raxdb` database.
client = MongoClient("localhost:27017")
db = client.raxdb
# Collection name kept in a single variable so the handle below and the
# DataTransfer collector (which is given `faresheet`) cannot drift apart.
faresheet = 'CXfaresheets_new'
fs = db[faresheet]

### 1. Collect training data from all markets

In [28]:
# Collect the documents to train on from the fare-sheet collection.
# NOTE(review): DataTransfer comes from the raxutil star imports; presumably
# country='ALL' pulls every market (per the section heading) — confirm in raxutil.
collector = DataTransfer(db, faresheet)
docs = collector.collect_for_train(classification="Commission", country='ALL')

Collect 8224 documents


In [29]:
# Absolute import: a notebook runs as the top-level `__main__` module with no parent
# package, so a relative import (`from .ml.dataHandler ...`) raises ImportError on a
# fresh kernel. This is the same module the first cell star-imports from.
from raxutil.ml.dataHandler import DataTransformer
transformer = DataTransformer()
# Build the features (X_dat), labels (Y_dat) and a files index from the collected docs.
X_dat, Y_dat, files_index = transformer.data_construct(target_docs=docs)

In [30]:
# check imbalance: number of documents per label
# (Counter is already imported in the first cell, so it is not re-imported here)
Counter(Y_dat)

Counter({'no': 7405, 'yes': 752})

In [31]:
# Transform the constructed data for training; Y_map turns the string labels into
# integers ('yes' -> 1, 'no' -> 0).
# NOTE(review): this rebinds X_dat / Y_dat to the transformed values, so the cell is
# not idempotent — re-run the data_construct cell before running it a second time.
X_dat, Y_dat, feature_names, feature_index = transformer.train_data_transform(X_dat, Y_dat, Y_map={'yes': 1, 'no': 0})

### The dataset is very imbalanced, so over-sample the positive class

In [32]:
from imblearn.over_sampling import RandomOverSampler
# Randomly repeat minority-class rows until both classes have the same count.
# NOTE(review): over-sampling is done before the train/validation split, so copies of
# the same positive row can end up in both sets and inflate the validation accuracy.
ros = RandomOverSampler(random_state=24)
# `fit_sample` was deprecated in imbalanced-learn 0.4 and later removed in favour of
# `fit_resample`; fall back to the old name only on releases that predate it.
resample = ros.fit_resample if hasattr(ros, "fit_resample") else ros.fit_sample
X_resampled, y_resampled = resample(X_dat, Y_dat)

In [33]:
# Class counts after over-sampling; both classes should now be the same size.
# (Counter is already imported in the first cell, so it is not re-imported here)
Counter(y_resampled)

Counter({0: 7405, 1: 7405})

In [34]:
# (rows, feature columns) of the over-sampled feature matrix
X_resampled.shape

(14810, 1138)

### Split the data into training and validation sets

In [327]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the over-sampled rows for validation; the fixed random_state keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.20,
    random_state=42,
)

In [328]:
# Turn the training labels into a two-column numpy array: each label y becomes the
# row [y, 1 - y]. Column 0 holds the original label, so a positive (1) example is
# [1, 0] and argmax over its row is 0.
# NOTE(review): rebinds y_train, so this cell must run exactly once after the split.
y_train = np.array([[label, 1 - label] for label in y_train])

In [329]:
# Make sure the training features are a numpy array (make_batch indexes rows by range).
X_train = np.array(X_train)

In [330]:
# Same conversion for the validation features, which are fed to the graph in one go.
X_test = np.array(X_test)

In [331]:
# Encode the validation labels the same way as the training labels: y -> [y, 1 - y],
# i.e. column 0 carries the original label.
# NOTE(review): rebinds y_test, so this cell must run exactly once after the split.
y_test = np.array([[label, 1 - label] for label in y_test])

In [332]:
# sanity check: one two-column row per training example
y_train.shape

(11848, 2)

In [333]:
# sanity check: the row count must match y_train; the column count is the network input width
X_train.shape

(11848, 1138)

In [305]:
# function to make mini batches for training
import math
def make_batch(X_train, y_train, batch_size):
    """Yield consecutive ``(X, y)`` mini-batches that together cover every row.

    The data is cut, in its original order (no shuffling), into
    ``len(X_train) // batch_size`` batches. Rows left over after the last full
    batch are appended to the final batch, so that batch can hold up to
    ``2 * batch_size - 1`` rows. If there are fewer rows than ``batch_size`` the
    whole data set is yielded as one batch rather than being silently dropped.
    Empty input yields nothing.

    Args:
        X_train: sliceable sequence of feature rows (e.g. a numpy array).
        y_train: sliceable sequence of targets aligned row-for-row with ``X_train``.
        batch_size: positive integer number of rows per batch.

    Yields:
        ``(X_batch, y_batch)`` slices of the inputs (views, for numpy arrays).

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    n_rows = len(X_train)
    if n_rows == 0:
        return
    # At least one batch, so a data set smaller than batch_size is still trained on.
    n_batches = max(1, n_rows // batch_size)
    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        # The final batch runs to the end of the data to absorb the remainder.
        stop = n_rows if batch_idx == n_batches - 1 else start + batch_size
        yield X_train[start:stop], y_train[start:stop]

In [306]:
# test code: 7 rows with batch_size=2 should come back as batches of 2, 2 and 3 rows
# (the leftover row joins the final batch).
test_x = np.array([[1,2], [3,4], [5,6], [7,6], [8,6], [9,6], [10,6]])
test_y = np.array([[1], [2], [3], [1], [2], [3], [3]])
# The loop variables are deliberately not called `x` / `y`: those names hold the
# TensorFlow placeholders, and rebinding them here would break the model cells if
# this cell were re-run after the graph has been built.
for demo_x, demo_y in make_batch(test_x, test_y, 2):
    print(demo_x)
    print(demo_y)

[[1 2]
 [3 4]]
[[1]
 [2]]
[[5 6]
 [7 6]]
[[3]
 [1]]
[[ 8  6]
 [ 9  6]
 [10  6]]
[[2]
 [3]
 [3]]


### 2. Modelling

### A three-layer neural network

In [411]:
# Python optimisation variables
learning_rate = 0.01
epochs = 600
batch_size = 100
# Input width is read from the training matrix instead of being hard-coded (it was
# 1138), so the network keeps matching the data if the feature set changes.
d = X_train.shape[1]
# Number of output classes (the two columns of the encoded labels).
c = 2

In [412]:
# declare the training data placeholders: any number of rows, d features each
x = tf.placeholder(tf.float32, [None, d])
# now declare the output data placeholder: one two-column label row ([y, 1 - y]) per
# example, i.e. c columns (not "10 digits" — that was a leftover from a tutorial)
y = tf.placeholder(tf.float32, [None, c])

In [413]:
# Width of the single hidden layer.
hidden_units = 200
# Parameters connecting the input to the hidden layer: small random weights and a
# random bias per hidden unit.
W1 = tf.Variable(tf.random_normal([d, hidden_units], stddev=0.03), name='W1')
b1 = tf.Variable(tf.random_normal([hidden_units]), name='b1')
# Parameters connecting the hidden layer to the c-way output layer.
W2 = tf.Variable(tf.random_normal([hidden_units, c], stddev=0.03), name='W2')
b2 = tf.Variable(tf.random_normal([c]), name='b2')

In [414]:
# Hidden layer: affine transform of the input, then a ReLU non-linearity.
hidden_pre_activation = tf.add(tf.matmul(x, W1), b1)
hidden_out = tf.nn.relu(hidden_pre_activation)

In [415]:
# calculate the output layer - an affine transform of the hidden activations followed
# by a softmax, so each row of y_ is a probability distribution over the c classes
y_ = tf.nn.softmax(tf.add(tf.matmul(hidden_out, W2), b2))

In [416]:
# cost or loss function for the optimisation/backpropagation
# Clip the softmax outputs away from 0 and 1 so neither log() below is taken of 0.
y_clipped = tf.clip_by_value(y_, 1e-10, 0.9999999)
# Element-wise binary cross-entropy, summed over the class columns and averaged over
# the batch.
# NOTE(review): with two complementary softmax columns and [y, 1 - y] labels, both
# columns contribute the same term, so each example's loss is counted twice compared
# with the usual categorical cross-entropy — confirm this scaling is intended.
cross_entropy = -tf.reduce_mean(tf.reduce_sum(y * tf.log(y_clipped) + (1 - y) * tf.log(1 - y_clipped), axis=1))

In [417]:
# add an optimiser: plain gradient descent, one step per run of `optimiser`
sgd = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
optimiser = sgd.minimize(cross_entropy)

In [418]:
# Initialisation op; created after every tf.Variable above so it covers all of them.
init_op = tf.global_variables_initializer()
# Evaluation ops.
# pred_prob: the softmax output itself; pred: the column index of the largest value.
# Because labels are encoded as [y, 1 - y], index 0 corresponds to original label 1.
pred_prob = y_
pred = tf.argmax(y_, 1)
# A row is correct when the predicted index matches the index of the 1 in the label.
gold_index = tf.argmax(y, 1)
correct_prediction = tf.equal(gold_index, pred)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [419]:
# start training
# Batches per epoch is loop-invariant, so it is computed once up front; max(1, ...)
# keeps the running average below from dividing by zero on a very small data set.
batch_num = max(1, math.floor(len(X_train) / batch_size))
with tf.Session() as sess:
    # initialise the variables
    sess.run(init_op)
    for epoch in range(epochs):
        avg_cost = 0
        for batch_x, batch_y in make_batch(X_train, y_train, batch_size):
            # The loss is bound to `batch_cost`, not `c`: `c` is the class count used
            # to build the placeholders and weights, and overwriting it with a float
            # would break those cells the next time they are run.
            _, batch_cost = sess.run([optimiser, cross_entropy], feed_dict={x: batch_x, y: batch_y})
            avg_cost += batch_cost / batch_num
        print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(avg_cost))
    # Evaluate on the held-out split while the session (and its trained weights) is open.
    print(sess.run(accuracy, feed_dict={x: X_test, y: y_test}))
    print(sess.run(correct_prediction, feed_dict={x: X_test, y: y_test}))
    # Keep the predicted class indices and class probabilities for later inspection.
    preds = sess.run(pred, feed_dict={x: X_test, y: y_test})
    preds_probs = sess.run(pred_prob, feed_dict={x: X_test, y: y_test})

    #golds = sess.run(gold, feed_dict={x: X_test, y: y_test})

Epoch: 1 cost = 0.644
Epoch: 2 cost = 0.304
Epoch: 3 cost = 0.211
Epoch: 4 cost = 0.166
Epoch: 5 cost = 0.139
Epoch: 6 cost = 0.120
Epoch: 7 cost = 0.105
Epoch: 8 cost = 0.094
Epoch: 9 cost = 0.084
Epoch: 10 cost = 0.076
Epoch: 11 cost = 0.070
Epoch: 12 cost = 0.065
Epoch: 13 cost = 0.060
Epoch: 14 cost = 0.056
Epoch: 15 cost = 0.053
Epoch: 16 cost = 0.050
Epoch: 17 cost = 0.047
Epoch: 18 cost = 0.045
Epoch: 19 cost = 0.043
Epoch: 20 cost = 0.041
Epoch: 21 cost = 0.039
Epoch: 22 cost = 0.038
Epoch: 23 cost = 0.036
Epoch: 24 cost = 0.035
Epoch: 25 cost = 0.034
Epoch: 26 cost = 0.032
Epoch: 27 cost = 0.031
Epoch: 28 cost = 0.030
Epoch: 29 cost = 0.029
Epoch: 30 cost = 0.029
Epoch: 31 cost = 0.028
Epoch: 32 cost = 0.027
Epoch: 33 cost = 0.026
Epoch: 34 cost = 0.026
Epoch: 35 cost = 0.025
Epoch: 36 cost = 0.024
Epoch: 37 cost = 0.024
Epoch: 38 cost = 0.023
Epoch: 39 cost = 0.023
Epoch: 40 cost = 0.022
Epoch: 41 cost = 0.022
Epoch: 42 cost = 0.022
Epoch: 43 cost = 0.021
Epoch: 44 cost = 0.0

Epoch: 347 cost = 0.003
Epoch: 348 cost = 0.003
Epoch: 349 cost = 0.003
Epoch: 350 cost = 0.003
Epoch: 351 cost = 0.003
Epoch: 352 cost = 0.003
Epoch: 353 cost = 0.003
Epoch: 354 cost = 0.003
Epoch: 355 cost = 0.003
Epoch: 356 cost = 0.003
Epoch: 357 cost = 0.003
Epoch: 358 cost = 0.003
Epoch: 359 cost = 0.003
Epoch: 360 cost = 0.003
Epoch: 361 cost = 0.003
Epoch: 362 cost = 0.003
Epoch: 363 cost = 0.003
Epoch: 364 cost = 0.003
Epoch: 365 cost = 0.003
Epoch: 366 cost = 0.003
Epoch: 367 cost = 0.003
Epoch: 368 cost = 0.003
Epoch: 369 cost = 0.003
Epoch: 370 cost = 0.003
Epoch: 371 cost = 0.003
Epoch: 372 cost = 0.003
Epoch: 373 cost = 0.003
Epoch: 374 cost = 0.003
Epoch: 375 cost = 0.003
Epoch: 376 cost = 0.003
Epoch: 377 cost = 0.003
Epoch: 378 cost = 0.003
Epoch: 379 cost = 0.003
Epoch: 380 cost = 0.003
Epoch: 381 cost = 0.003
Epoch: 382 cost = 0.003
Epoch: 383 cost = 0.003
Epoch: 384 cost = 0.003
Epoch: 385 cost = 0.003
Epoch: 386 cost = 0.003
Epoch: 387 cost = 0.003
Epoch: 388 cost 

In [385]:
# Predicted column index per validation row; with labels encoded as [y, 1 - y],
# index 0 corresponds to original label 1 and index 1 to original label 0.
preds

array([0, 0, 1, ..., 0, 1, 0])

In [386]:
# Gold (true) labels for the validation rows, in the same [y, 1 - y] encoding that is
# fed to the network. Defined here explicitly: no other cell assigns `golds` (the only
# assignment is commented out), so the bare name fails on a fresh kernel.
golds = np.asarray(y_test, dtype=np.float32)
golds

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)