In [2]:
# Imports
import numpy as np
import pandas as pd
import re
import csv
import random
import math
import sys

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Bidirectional
from keras.optimizers import SGD, RMSprop

In [3]:
#turns a sequence into a question format
def toQuestion(seq,l,c,p):
    """Turn a numeric sequence into a multiple-choice "missing element" question.

    A random window of `l` consecutive elements is cut from `seq`, one element
    of the window is replaced by "?" and answer options (the removed value
    plus random distractors) are generated.

    Args:
        seq: whole dataset number sequence (it is not modified).
        l:   length of the question sequence; clipped to len(seq).
        c:   number of answer choices available.
        p:   probability of the missing element being last in the sequence.

    Returns:
        dict with the keys
          "sequence" - list, the window with the missing element set to "?",
          "options"  - shuffled list of distinct answer choices,
          "answer"   - index of the right answer inside "options".

    Raises:
        ValueError: if `seq` is empty or `l` is smaller than 1.
    """
    l = min(len(seq), l)
    if l < 1:
        raise ValueError("toQuestion needs a non-empty sequence and l >= 1")

    #question: exactly l consecutive elements (the slice end is exclusive)
    a = random.randint(0,len(seq)-l)
    s = list(seq[a:a+l])      #copy, so the caller's sequence is never touched
    #spread of the fake answers; at least 1, otherwise a constant window
    #(std == 0) could only ever regenerate the right answer
    sd = max(1, math.ceil(np.std(s)))
    if random.random() < p:
        m = len(s)-1              #use last element as missing
    else:
        m = random.randint(0,len(s)-1)   #random missing element

    #add sequence
    ans = s[m]
    s[m] = "?"    #set as missing
    i = {"sequence": s}

    #make randomly generated choices, starting with the right answer
    cs = [ans]
    misses = 0
    while len(cs) < c:       #add fake answers
        e = math.floor(ans+sd*random.uniform(-2,3))    #vary the answer choices using std and uniform randomness
        if e not in cs:
            cs.append(e)
            misses = 0
        else:
            misses += 1
            if misses >= 25:     #candidate range exhausted: widen it so the loop always ends
                sd *= 2
                misses = 0

    random.shuffle(cs)       #shuffle order
    i["options"] = cs

    #set answer to the position of the missing element among the options
    i["answer"] = cs.index(ans)

    return i



In [4]:
# Read every row of the synthetic-sequence CSV. The context manager closes the
# file handle; newline='' is the mode the csv module documents for its inputs.
with open('../../MAIQ/data/synthetic sequences2.csv', newline='') as csv_file:
    a = list(csv.reader(csv_file))

In [5]:
# Drop the empty-string fields of every CSV row, editing each row list in place.
for row in a:
    row[:] = list(filter(lambda field: field != '', row))

In [6]:
# Convert every remaining field to float, editing each row list in place.
for row in a:
    row[:] = [float(field) for field in row]

In [7]:
# Build one synthetic question per CSV row and keep only its "sequence" part.
# Requested window length: random between 5 (or the row length, for rows
# shorter than 5) and the full row; 4 answer options; 0.75 probability that the
# last element is the hidden one.
seq_class1 = []
for item in a:
    if len(item) < 2:
        continue   #a question needs at least one known value besides the hidden one
    seq_class1.append(
        toQuestion(item, random.randint(min(5, len(item)), len(item)), 4, 0.75)['sequence']
    )

In [8]:
# One label "1" per synthetic sequence.
class1 = [1 for _ in seq_class1]

In [9]:
#load and clean competition dataset
#questions: JSON list of records
seqdataIn = pd.read_json(
    '../../MAIQ/data/seq-public.json',
    orient='records',
)
#answers/hints: JSON object whose keys become the row index
seqDataAns = pd.read_json(
    '../../MAIQ/data/seq-public.answer.json',
    orient='index',
)

#clean hints
def hint2txt(h):
    """Convert the HTML-like markup of a hint string into plain text.

    <sub>..</sub> becomes _{..}, <sup>..</sup> becomes ^{..}, a newline becomes
    ' | ' and an escaped asterisk (backslash followed by '*') becomes '*'.

    Args:
        h: hint text (str).
    Returns:
        the converted string.
    """
    replacements = (
        ('<sub>', '_{'),
        ('</sub>', '}'),
        ('<sup>', '^{'),
        ('</sup>', '}'),
        ('\n', ' | '),
        ('\\*', '*'),   #backslash written explicitly: a bare backslash-star is an invalid escape sequence
    )
    for old, new in replacements:
        h = h.replace(old, new)
    return h
#overwrite the hint column with its plain-text version
seqDataAns['hint'] = seqDataAns['hint'].map(hint2txt)

#split the hint by '=', to search item in second term (after '=')
seqDataAns['hint1'] = seqDataAns['hint'].map(lambda hint: hint.split('='))

In [10]:
#instantiate the classes variable
seqDataAns['classes'] = 0

#sequences with error: skipped below, so they remain as class 0
rows_with_errors = {82, 257, 298, 306, 451, 452, 463, 464, 468, 476, 495, 498, 499,
                    500, 504, 505, 623, 624, 626, 627, 628}

#assign the class for each sequence from the text after the first '=' of its hint:
#  0 - none of the patterns (or no '=' in the hint)
#  1 - only 'A_{' found   (recursive A_{n})
#  2 - only the n pattern (just the position n)
#  3 - both
#NOTE(review): '(?<!_).*?n' appears to match whenever the text contains an 'n'
#(the lazy '.*?' may start at any earlier position), including the n inside
#'A_{n...}' - confirm this is the intended test for "position n"
for i in range(len(seqDataAns)):
    if i in rows_with_errors or seqDataAns['classes'][i] != 0:
        continue

    hint_parts = seqDataAns['hint1'][i]
    has_recursion = False
    has_position = False
    if len(hint_parts) > 1:
        has_recursion = re.search('A_{', hint_parts[1])
        has_position = re.search('(?<!_).*?n', hint_parts[1])

    #recursion contributes 1, position contributes 2 -> 0, 1, 2 or 3
    seqDataAns.loc[i, 'classes'] = (1 if has_recursion else 0) + (2 if has_position else 0)
            
        

In [11]:
#dataset distribution: number of sequences per assigned class
seqDataAns['classes'].value_counts()

3    511
2    380
0    184
1      1
Name: classes, dtype: int64

In [12]:
#use the sequences/classes to train the classifier
#the raw question text is taken from the 'stem' column
sequences = seqdataIn.loc[:, 'stem']


In [13]:
#cleans up sequences
def txt2seq(seq):
    """Split raw question strings into lists of tokens.

    An empty pair of parentheses, "()" or "( )", is replaced by "?"; the text
    is then split on commas, whitespace and '-' and empty tokens are dropped.

    Args:
        seq: iterable of strings.
    Returns:
        list with one token list per input string.
    """
    #raw strings: '\(' and '\s' are invalid escape sequences in normal literals
    blank = re.compile(r"\(\s?\)")
    #NOTE(review): '-' is treated as a delimiter, so the sign of a negative
    #number is dropped - confirm that this is intended
    delimiter = re.compile(r",|\s|-")
    s2 = []
    for s in seq:
        si = blank.sub("?", s)       #replace parenthesis with question mark
        s2.append([e for e in delimiter.split(si) if e != ""])   #remove delimiters and empty strings
    return s2
#token lists of the competition questions
split_seq = txt2seq(seq=sequences)

In [14]:
#turn every entry of the synthetic sequences into a string (in place), the
#same token type txt2seq produces for the competition sequences
for synthetic in seq_class1:
    synthetic[:] = list(map(str, synthetic))

In [15]:
#append the synthetic sequences after the competition ones (in place)
split_seq += seq_class1

In [16]:
#labels of the competition sequences
classes = seqDataAns['classes']

In [17]:
#competition labels followed by the synthetic labels, renumbered 0..N-1 (the
#same order in which split_seq was assembled).
#Built from seqDataAns['classes'] rather than from `classes` itself, so that
#re-running the cell cannot append the synthetic labels a second time.
#pd.concat replaces Series.append, which was removed in pandas 2.0.
classes = pd.concat(
    [seqDataAns['classes'], pd.Series(class1, dtype=seqDataAns['classes'].dtype)],
    ignore_index=True,
)

In [18]:
#label distribution after the synthetic labels were added
classes.value_counts()

3    511
2    380
1    185
0    184
dtype: int64

In [19]:
#checks for valid sequences (all numbers or fractions excluding ?)
def findBadSeq(seq):
    """Return {index: sequence} for the sequences that contain a flagged token.

    A token is flagged when its FIRST character is not a digit, '/', '?' or
    '-'. `seq` must be indexable by 0..len(seq)-1, every entry being an
    already split list of string tokens.
    """
    #NOTE(review): re.match only tests the start of the token, so a token such
    #as '12abc' is not flagged - confirm whether the whole token was meant
    flagged = {}
    for position in range(len(seq)):
        tokens = seq[position]
        if any(re.match(r'[^0-9\/\?\-]', token) for token in tokens):
            flagged[position] = tokens
    return flagged

#remove bad sequences from good sequences
def remBadSeq(seq):
    """Return {index: sequence} for every sequence findBadSeq does not flag.

    The original positions are kept as keys, so the answer/label that belongs
    to a sequence can still be looked up.
    """
    badSeqInd = findBadSeq(seq)     #dict keyed by index: constant-time membership test
    return {i: seq[i] for i in range(len(seq)) if i not in badSeqInd}

In [20]:
# show bad sequences removed, one "index:tokens" line each, then the total
bs = findBadSeq(split_seq)
for idx in bs:
    print(idx, bs[idx], sep=":")
print("# Bad sequences = ", len(bs), "/", len(split_seq), sep="")

24:['√2', '3', '√28', '√65', '?']
85:['A', 'F', 'H', 'K', 'N', '?', '?']
86:['A', 'D', 'I', 'P', 'Y', 'CF', 'DI', 'FD', '?']
118:['+1', '1', '1', '1', '?']
161:['19/13', '1', '19/13', '10/22', '?', 'a.7/24;b.7/25;c.5/26;d.7/26;']
297:['√5', '√55', '11√5', '11√55', '?']
367:['0', '3', '1', '6', '√2', '12', '?', '?', '2', '48']
389:['1/2', '√3/2', '√3/3', '?']
411:['(√5', '1)/4', '1/3', '(√3', '1)/2', '√2', '1', '?']
470:['A', 'C', 'F', 'J', 'O', '?', 'What', 'letter', 'comes', 'next?']
487:['72', '(68)', '41', '28', '(98)', '16', '34', '?', '56']
489:['4342', '(3176)', '1726', '7995', '(7516)', '2162', '8418', '?', '1725']
491:['3', '+6', '+2', '3', '+7', '12', '?']
608:['961', '(25)', '432', '932', '?', '731']
609:['16', '(96)', '12', '10', '?', '7.5']
610:['The', 'following', 'series', 'of', 'numbers', 'contains', 'one', 'number', 'that', 'does', 'not', 'fit', 'the', 'pattern', 'set', 'by', 'the', 'others.', 'What', 'number', 'does', 'not', 'fit?', '3', '5', '7', '11', '14', '17']
615

In [21]:
#get only good sequences and list them, one "index:tokens" line each, then the total
gs = remBadSeq(split_seq)
for idx in gs:
    print(idx, gs[idx], sep=":")
print("# Good sequences = ", len(gs), "/", len(split_seq), sep="")

0:['7', '9', '1', '5', '?']
1:['3', '2', '5/3', '3/2', '?']
2:['1', '2', '5', '26', '?']
3:['2', '12', '30', '?']
4:['2', '1', '2/3', '1/2', '?']
5:['4', '2', '2', '3', '6', '?']
6:['1', '7', '8', '57', '?']
7:['4', '12', '8', '10', '?']
8:['1/2', '1', '1', '?', '9/11', '11/13']
9:['95', '88', '71', '61', '50', '?']
10:['2', '6', '13', '39', '15', '45', '23', '?']
11:['1', '3', '3', '5', '7', '9', '13', '15', '?', '?']
12:['1', '2', '8', '28', '?']
13:['0', '4', '18', '?', '100']
14:['1', '1', '2', '2', '3', '4', '3', '5', '?']
15:['1', '52', '313', '174', '?']
16:['5', '15', '10', '215', '?']
17:['7', '0', '1', '2', '9', '?']
18:['0', '1', '3', '10', '?']
19:['9/2', '14', '65/2', '?', '217/2']
20:['1', '1', '2', '6', '24', '?']
21:['3', '4', '8', '24', '88', '?']
22:['20', '22', '25', '30', '37', '?']
23:['1/9', '2/27', '1/27']
25:['1', '2', '4', '8', '16', '?']
26:['2', '1', '2/3', '1/2', '?']
27:['1', '1', '3', '7', '17', '41', '?']
28:['5/2', '5', '25/2', '75/2', '?']
29:['6', '15'

In [22]:
#converts all sequence items to float values
def floatConv(s):
    """Convert one token to float; a token 'a/b' is evaluated as a fraction.

    Returns:
        the float value, or the marker string 'X' when the token is neither a
        number nor a valid fraction (a zero denominator included).
    """
    try:
        return float(s)
    except ValueError:
        pass
    try:
        num, denom = s.split('/')   #anything but exactly two parts raises ValueError
        return float(num) / float(denom)
    except (ValueError, ZeroDivisionError):
        return 'X'   #bad

#converts all the sequences to floats
def seq2Float(seq):
    """Convert every token of every sequence with floatConv, keeping "?" as is.

    Args:
        seq: dict {index: list of string tokens}.
    Returns:
        dict with the same keys; a sequence is left out when floatConv
        returned the 'X' marker for any of its tokens.
    """
    converted = {}
    for key, tokens in seq.items():
        values = [token if token == "?" else floatConv(token) for token in tokens]
        if "X" not in values:   #skip sequences with a bad token
            converted[key] = values
    return converted
print(seq2Float(gs))

{0: [7.0, 9.0, 1.0, 5.0, '?'], 1: [3.0, 2.0, 1.6666666666666667, 1.5, '?'], 2: [1.0, 2.0, 5.0, 26.0, '?'], 3: [2.0, 12.0, 30.0, '?'], 4: [2.0, 1.0, 0.6666666666666666, 0.5, '?'], 5: [4.0, 2.0, 2.0, 3.0, 6.0, '?'], 6: [1.0, 7.0, 8.0, 57.0, '?'], 7: [4.0, 12.0, 8.0, 10.0, '?'], 8: [0.5, 1.0, 1.0, '?', 0.8181818181818182, 0.8461538461538461], 9: [95.0, 88.0, 71.0, 61.0, 50.0, '?'], 10: [2.0, 6.0, 13.0, 39.0, 15.0, 45.0, 23.0, '?'], 11: [1.0, 3.0, 3.0, 5.0, 7.0, 9.0, 13.0, 15.0, '?', '?'], 12: [1.0, 2.0, 8.0, 28.0, '?'], 13: [0.0, 4.0, 18.0, '?', 100.0], 14: [1.0, 1.0, 2.0, 2.0, 3.0, 4.0, 3.0, 5.0, '?'], 15: [1.0, 52.0, 313.0, 174.0, '?'], 16: [5.0, 15.0, 10.0, 215.0, '?'], 17: [7.0, 0.0, 1.0, 2.0, 9.0, '?'], 18: [0.0, 1.0, 3.0, 10.0, '?'], 19: [4.5, 14.0, 32.5, '?', 108.5], 20: [1.0, 1.0, 2.0, 6.0, 24.0, '?'], 21: [3.0, 4.0, 8.0, 24.0, 88.0, '?'], 22: [20.0, 22.0, 25.0, 30.0, 37.0, '?'], 23: [0.1111111111111111, 0.07407407407407407, 0.037037037037037035], 25: [1.0, 2.0, 4.0, 8.0, 16.0, '?

In [23]:
#prepare sequences for classifier
#`valid` maps the original sequence index to its float-converted tokens ('?' is
#kept as is); seq2Float leaves out sequences containing a token it cannot convert
valid = seq2Float(gs)  #good sequences dictionary
len(valid)

1217

In [24]:
#length of the longest valid sequence (0 when there is none)
maxx = max(map(len, valid.values()), default=0)
maxx

19

In [25]:
#keys left out of the training data
#first two rows: the same indexes the class-assignment loop skips as errors
#NOTE(review): no reason is recorded for the remaining keys; in the printed
#good sequences key 23 has no '?' and key 11 has two - presumably the others
#are similar, confirm
skip_keys = {
    82, 257, 298, 306, 451, 452, 463, 464, 468, 476, 495, 498, 499,
    500, 504, 505, 623, 624, 626, 627, 628,
    23, 501, 639, 697, 1056, 1073,
    11, 134, 158, 253, 467, 470, 475, 636, 769, 793, 816, 818, 833, 856, 862, 945,
}
kept_keys = [key for key in valid.keys() if key not in skip_keys]

#parallel lists: float sequence and class label of every kept key
secuencias = [valid.get(key) for key in kept_keys]
clases = [classes[key] for key in kept_keys]

In [26]:
#position of the question mark and length of every kept sequence
question = [s.index('?') for s in secuencias]
seq_len = [len(s) for s in secuencias]
    




In [27]:
#replace '?' by zero
#every inner list is copied: a plain secuencias[:] shares the rows, so the
#assignment below would also overwrite the '?' inside `secuencias`
secuencias2 = [list(s) for s in secuencias]
for i in range(len(secuencias2)):
    secuencias2[i][question[i]] = 0

In [28]:
#make all sequences the same length by appending zeros
#a float dtype is required: pad_sequences defaults to dtype='int32', which
#would truncate the fractional values (e.g. 1.5 -> 1)
secuencias3 = pad_sequences(secuencias2, padding='post', dtype='float64')

In [29]:
#back to plain nested lists; np.asarray keeps the cell re-runnable when
#secuencias3 has already been rebound to a list by an earlier execution
secuencias3 = np.asarray(secuencias3).tolist()

In [30]:
#one all-zero mask row per padded sequence, as wide as the padded sequences
mask = [[0] * len(secuencias3[0]) for _ in range(len(secuencias3))]

In [31]:
#assign 1 in the mask sequence for actual values in sequence (the first seq_len[i] positions)
for row_idx, mask_row in enumerate(mask):
    for col in range(seq_len[row_idx]):
        mask_row[col] = 1

In [32]:
#assign 2 in the mask sequence for question mark position
for row_idx, mask_row in enumerate(mask):
    mask_row[question[row_idx]] = 2

In [33]:
#concatenate the sequence and the mask: each row is the padded values followed by its mask
secuencias4 = [secuencias3[row] + mask[row] for row in range(len(secuencias3))]

In [34]:
#Classifier
#change question mark with '-1' to use the single series (no mask)
#every row is copied and the write goes to secuencias5, so secuencias3 is
#left untouched (the slice copy shared its rows and the loop wrote to secuencias3)
secuencias5 = [list(row) for row in secuencias3]
for i in range(len(secuencias5)):
    secuencias5[i][question[i]] = -1





In [35]:
#stratified 85/15 split, training on sequence + mask
#no random_state is passed, so the split changes from run to run
X_train, X_test, y_train, y_test = train_test_split(
    secuencias4,
    clases,
    test_size=0.15,
    stratify=clases,
    #random_state=42,
)
#X_train, X_test, y_train, y_test = train_test_split(secuencias5, clases, test_size=0.2, random_state=42) #train with just the serie




In [36]:
#turn the four splits into numpy arrays
X_train, X_test, y_train, y_test = map(np.array, (X_train, X_test, y_train, y_test))

In [37]:
#the LSTM expects (samples, timesteps, 1); each split uses its own width
X_train = X_train.reshape((len(X_train), X_train.shape[1], 1))
X_test = X_test.reshape((len(X_test), X_test.shape[1], 1))

#one-hot encode both splits against the same, complete label set, so the
#columns agree even if a class were missing from one split. Labels stay 2-D
#(samples, classes), the same rank as the softmax output of the classifier.
class_labels = np.unique(clases)
if y_train.ndim == 1:      #guard: do not encode again when the cell is re-run
    y_train = (y_train[:, None] == class_labels).astype('float32')
if y_test.ndim == 1:
    y_test = (y_test[:, None] == class_labels).astype('float32')

In [38]:
#shape of the encoded test labels
np.shape(y_test)

(177, 4)

In [39]:
#LSTM classifier: one value per timestep in, one softmax probability per class out
model = Sequential()
#model.add(LSTM(100, input_shape= (19, 1)))
model.add(LSTM(100, input_shape=X_train.shape[1:]))   #taken from the data instead of a hard-coded (38, 1)
model.add(Dense(32, activation = 'relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))
#categorical_crossentropy is the loss for one-hot targets with a softmax
#output; binary_crossentropy would score the outputs as independent yes/no
#problems
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()   #prints the table itself; print(model.summary()) adds a stray "None"
#NOTE(review): the test split is also used as validation data here
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 100, batch_size = 8)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               40800     
_________________________________________________________________
dense (Dense)                (None, 32)                3232      
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 132       
Total params: 44,164
Trainable params: 44,164
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/

Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x14a0043d0>

In [40]:
#persist the trained classifier; the path is relative to the working directory
model.save("../classifier/classifier.h5")
#to reload later - NOTE(review): the path below lacks the ".h5" suffix used
#above, and `keras` itself is not imported as a module in the import cell
#reconstructed_model = keras.models.load_model("../classifier/classifier")