In [1]:
%load_ext Cython

In [2]:
%%cython
def f(n):
    a = 0
    for i in range(n):
        a += i
    return a

cpdef g(int n):
    cdef int a = 0, i
    for i in range(n):
        a += i
    return a 

cpdef h(int n):
    cdef int i, a = 0
    for i in range(n):
        a += i
    return a 

In [247]:
%timeit f(10000000)
%timeit g(10000000)
%timeit h(10000000)

394 ms ± 12.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.49 ms ± 15.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.49 ms ± 7.67 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
import fetch_data as fd
import time

In [None]:
import numpy as np
import itertools as it
from tqdm import tqdm
from math import sqrt
import time

##### NAIVE KERNEL #####

def naive_kernel(data,k,l):
    
    #create all 'good' permutations, that is those which are possible
    perms=[];
    for i in data:
        tmpPerms = list(it.combinations(i,k));
        for j in tmpPerms:
            if(j not in perms):
                perms.append(j);
    
    #init the result matrix
    fi = np.zeros([len(data),len(perms)]);
    
    #calculate each fi_u for each string
    for row in range(len(data)):
        for c, prm in enumerate(perms):
            tmpSum = [];
            indices = [];
            
            #create a vector containing the indexes of the current permutation letters
            for i in range(k):
                indices.append(list(_find(data[row],prm[i])))
            
            #calculate all the values, recursively because i couldn't figure out
            #how to generalize otherwise.
            for i in indices[0]:
                tmpSum = _recursiveShit(indices[1:],i,i,k-2,tmpSum);
            
            #calculate the current fi_u value
            for i in tmpSum:
                fi[row,c] += np.sum(l**(np.array([i])));
                
    return fi;

                

def _recursiveShit(currInd,topVal,prevVal,k,tmpSum):
    #bad recursion, but it works
    
    #base case, check if it's an approved permutation,
    #if yes -> return the sum
    if(k==0):
        for i in currInd[0]:
            if(i>prevVal):
                tmpSum.append(i-topVal+1);
        return tmpSum;
    
    #if not yet at the 'bottom' of the recursion, continue
    else:
        for i in currInd[0]:
            if(i>prevVal):
                tmpSum = _recursiveShit(currInd[1:],topVal,i,k-1,tmpSum);
        return tmpSum
            
            
    

def _find(str, ch):
    #_find indexes of characters in string
    
    for i, ltr in enumerate(str):
        if ltr == ch:
            yield i


#### DYNAMIC PROGRAMMING KERNEL #####

def _k_prime(s,t,n,l):

    #-------------------------------------------------------------------------------------#
    # Basically what's happening here is that we are succesively looping through both of  #
    # the strings and updating the kernel matrix accordingly while refering to previously #
    # computed values. This will give us time complexity O(n|s||t|) in the end.           #
    #-------------------------------------------------------------------------------------#
    
    
    #Variables:
    #
    #s is a string
    #t is a string
    #n is the length of the substring
    #l is the lambda value
    #kp is refering to k'
    #kpp is refering to k'' 
    
    #start by creating the empty matrices.
    kp = np.zeros([n,len(s)+1,len(t)+1]);
    kpp = np.zeros([n,len(s)+1,len(t)+1]);
    
    #initialize
    kp[0][:][:] = 1;
#     print(str(n) + ' ' + str(len(s))+ ' ' +str(len(t)) + '\n')
    for i in range(1,n):
        for j in range(i,len(s)):
            for k in range(i,len(t)+1):

                #check whether 'x occurs in u' as described in the paper
                if(s[j-1]!=t[k-1]):
                    kpp[i][j][k]=l*kpp[i][j][k-1];
                #if not, do the other calcs.
                else:
                    kpp[i][j][k]=l*(kpp[i][j][k-1]+l*kp[i-1][j-1][k-1]);
                
                #finally calculate kp
                kp[i][j][k-1]=l*kp[i][j-1][k-1]+kpp[i][j][k-1];
                
    return kp;


def _k(s,t,n,l,kp):
    
    #--------------------------------------------------#
    # This takes in an already computed k_prime kernel #
    # and calculates the overall kernel as per the     #
    # paper. The last part of Def. 2                   #
    #--------------------------------------------------#

    #Variables:
    #
    #s is a string
    #t is a string
    #n is the length of the substring
    #l is the lambda value
    #kp is refering to k'
    #ksum is refering to the kernel value.
    
    ksum = 0;
    
    #Loop over all values in the computed k_prime matrix and 
    #pick out the values where x = j, as mentioned in the paper.
    #
    #There is no recursion necessary here since we already did it
    #when computing k_prime, the last 'layer' of k_prime
    #contains all the necessary values.

    for i in range(kp.shape[1]-1):
        for j in range(kp.shape[2]-1):
            if s[i] == t[j] : 
                ksum += kp[n-1][i][j];
                
    return l**2*ksum;

In [392]:
for i in range(5-1): print(i)

0
1
2
3


In [None]:
def _get_normed_kernel_values(s,t,n,l):
    
    #------------------------------------------------------#
    # This returns the normalized values for the kernel    # 
    # using the normalization mentioned in the paper.      #
    # s is a string.                                       #
    # t is a string.                                       #
    # n is the substring length                            #           
    # l is the lambda value, the 'weight'                  #
    #------------------------------------------------------#
    
    kstP = _k_prime(s,t,n,l);
    kssP = _k_prime(s,s,n,l);
    kttP = _k_prime(t,t,n,l);

    kst = _k(s,t,n,l,kstP)
    kss = _k(s,s,n,l,kssP);
    ktt = _k(t,t,n,l,kttP);

    return kst/sqrt(kss*ktt)

def recursive_kernel(s,t,n,l):
    if len(s) != len(t):
        print('Number of strings are not equal, reverting to slower, non-square, computation of K')
        K = np.zeros([len(s),len(t)])
        kss = [ _k(i,i,n,l,_k_prime(i,i,n,l)) for i in s]
        ktt = [ _k(i,i,n,l,_k_prime(i,i,n,l)) for i in t]
        for i,ss in enumerate(tqdm(s)):
            for j,tt in enumerate(tqdm(t)):
                kst = _k(ss,tt,n,l,_k_prime(ss,tt,n,l))
                #Compute Kernel matrix K, we need to precompute it for
                K[i,j] = kst/sqrt(kss[i]*ktt[j])
        return K

    N = len(s)
    K = np.identity(N)
    
    kss = [ _k(i,i,n,l,_k_prime(i,i,n,l)) for i in s]

    for i in tqdm(range(N)):
#         for j in tqdm(range(i+1,N)):
        for j in range(i+1,N):
            le_time = time.time()
            kstP = _k_prime(s[i],t[j],n,l);
#             print(time.time()-le_time)
            le_time = time.time()
            kst = _k(s[i],t[j],n,l,kstP)
#             print(time.time()-le_time)
            le_time = time.time()
            #Compute Kernel matrix K, we need to precompute it for
            k = kst/sqrt(kss[i]*kss[j])
#             print(time.time()-le_time)
            le_time = time.time()
            K[i,j] = k
            K[j,i] = k # Using this method to compute half of K and using that the matrix is semi definite

    return K

0.5

In [421]:
recursive_kernel(s,t,n,l)


  0%|          | 0/1769 [00:00<?, ?it/s]
  0%|          | 0/854 [00:00<?, ?it/s][A
[A

Number of strings are not equal, reverting to slower, non-square, computation of K





ZeroDivisionError: float division by zero

In [26]:
#### APPROXIMATIVE KERNEL IMPLEMENTATION #####

def approximative_kernel(x,z,s,n,l):
    N = len(x)
    kss = [ _k(i,i,n,l,_k_prime(i,i,n,l)) for i in s]
    kxx = [ _k(i,i,n,l,_k_prime(i,i,n,l)) for i in x]               
    if N == len(z)+1:
        K = np.identity(N)
        print('Square kernel matrix generated')
        for i,xx in enumerate(x):
            for j in range(i+1,N):
                for k,ss in enumerate(s):
                    kxs = _k(xx,ss,n,l,_k_prime(xx,ss,n,l))
                    kzs = _k(z[j],ss,n,l,_k_prime(z[j],ss,n,l)) 
                    k = (kzs*kxs)/(kss[k]*sqrt(kxx[j]*kxx[i]))
                    K[i,j] += k
                    K[j,i] += k
        return K   

    K = np.zeros([N,len(z)])
    kzz = [ _k(i,i,n,l,_k_prime(i,i,n,l)) for i in z]
    for i,xx in enumerate(tqdm(x)):
        for j,zz in enumerate(tqdm(z)):
            for k,ss in enumerate(tqdm(s)):
                kxs = _k(xx,ss,n,l,_k_prime(xx,ss,n,l))
                kzs = _k(zz,ss,n,l,_k_prime(zz,ss,n,l)) 
                K[i,j] += (kzs*kxs)/(kss[k]*sqrt(kzz[j]*kxx[i]))
    return K

In [422]:
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_fscore_support

# Takes set of all possible labels, and creates a sparse(?) matrix of 1/0 representations of class	
def _labelMaker(labels,categories):
    return MultiLabelBinarizer(classes=categories).fit_transform(labels)

def labelMaker(labels,categories):
    return LabelEncoder().fit_transform(labels)

    '''
        This uses sklearn SVM kit using a one vs rest approach.
        One-vs-one is more computationally intensive than One-vs-all (n(n-1)/2 vs n)
        but less sensitive to imbalanced data (which we do have, big time). This might have to bee looked at
    '''


def generateClassifier(features, labels, n, l, cat):

    ### Generate label representation from ex: 'corn' and 'earn'
    Y = _labelMaker(labels,cat)
#     print(Y)
    ## Generate Kernel matrix module
    print(type(features))
    print(type(n))
    print(type(l))
    print(features)
#     input()
    K = recursive_kernel(features,features,n,l)
#     print('finished recursive kernel')
    clf = OneVsRestClassifier(SVC(kernel='precomputed',probability = True))

    # Return the classifier, god I love how easy this is in python
    return clf.fit(K,Y)

def predict(features,control, classifier, n,l):
    K = recursive_kernel(features,control,n,l)
    return classifier.predict(K)


def score(prediction,labels,categories):
    print('Scores: ' + str(precision_recall_fscore_support(_labelMaker(labels,categories),prediction)))

In [244]:
%%cython
import cython
cimport cython

import numpy as np
cimport numpy as np

DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef _process(np.ndarray[DTYPE_t, ndim=2] array):

    cdef unsigned int rows = array.shape[0]
    cdef unsigned int cols = array.shape[1]
    cdef unsigned int row
    cdef np.ndarray[DTYPE_t, ndim=2] out = np.zeros((rows, cols))

    for row in range(0, rows):
        for col in range(0, cols):
            for row2 in range(0, rows):
                out[row, col] += array[row2, col] - array[row, col]

    return out


def process2(np.ndarray[DTYPE_t, ndim=2] array):

    cdef unsigned int rows = array.shape[0]
    cdef unsigned int cols = array.shape[1]
    cdef unsigned int row, col, row2
    cdef np.ndarray[DTYPE_t, ndim=2] out = np.empty((rows, cols))

    for row in range(rows):
        for row2 in range(rows):
            for col in range(cols):
                out[row, col] += array[row2, col] - array[row, col]

    return out

def main():
    cdef np.ndarray[DTYPE_t, ndim=2] data
    cdef np.ndarray[DTYPE_t, ndim=2] out
    data = np.load('data.npy')
    out = _process(data)
    np.save('vialoop.npy', out)

In [349]:
%%cython
import cython
cimport cython

import numpy as np
cimport numpy as np

from libc.stdio cimport printf

DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)



cdef le_prime(char* s, char* t, int n, float l, int s_len, int t_len):
#     cdef np.ndarray kp = np.zeros([n,len(s)+1,len(t)+1]);
#     cdef unsigned int rows = array.shape[0]
#     cdef unsigned int cols = array.shape[1]
    cdef unsigned int i, j, k
    
    cdef np.ndarray[DTYPE_t, ndim=3] kp = np.zeros([n,s_len+1,t_len+1])
    cdef np.ndarray[DTYPE_t, ndim=3] kpp = np.zeros([n,s_len+1,t_len+1])
#     cdef int a = 0;
    
    kp[0][:][:] = 1

                
    for i in range(1,n):
        for j in range(i,s_len):
            for k in range(i,t_len+1):
#                 a = a+1
                #check whether 'x occurs in u' as described in the paper
                if(s[j-1]!=t[k-1]):
                    kpp[i][j][k]=l*kpp[i][j][k-1];
                #if not, do the other calcs.
                else:
                    kpp[i][j][k]=l*(kpp[i][j][k-1]+l*kp[i-1][j-1][k-1]);
                
                #finally calculate kp
                kp[i][j][k-1]=l*kp[i][j-1][k-1]+kpp[i][j][k-1];
    
#     print(s)
    return kp
def le_prime_helper(s, t, n, l):
    return le_prime(bytes(s,  'ascii'), bytes(t,  'ascii'), n, l, len(s), len(t))

In [406]:
s = 'det var en gang en apa och ha'
t = 'jag var en apa'
s = s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s+s
t = t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t+t
# s = s[0:200]
# t = t[0:200]

sb = bytes(s,  'ascii')
tb = bytes(t,  'ascii')
s_len = len(s)
t_len = len(t)
n = 10 # Length of subsequence
l = 0.5 # Lambda value

# le_prime(bytearray(s), t, n, l, s_len, t_len)
out = le_prime_helper(s,t,n,l)
s_len

1769

In [366]:
out.shape

(4, 1770, 855)

In [397]:
sum(sum(sum(out)))

1734160.6220209138

In [407]:
_k(s,t,n,l,out)

0.0049909600548296053

In [405]:
l

0.5

In [364]:
out[3][10][:]


array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.00097656,  0.00341797,  0.00854492,  0.01831055,
        0.03900146,  0.02035522,  0.01031494,  0.00515747,  0.00261307,
        0.00130653,  0.00066185,  0.00033092,  0.00028634,  0.00014508,
        0.00105923,  0.00348324,  0.00866874,  0.01849941,  0.03918937,
        0.02046059,  0.0103678 ,  0.0051839 ,  0.00262633,  0.00131316,
        0.00066518,  0.00033259,  0.00028727,  0.00014554,  0.00105947,
        0.00348336,  0.00866881,  0.01849946,  0.03918941,  0.02046061,
        0.01036781,  0.0051839 ,  0.00262633,  0.00131316,  0.00066518,
        0.00033259,  0.00028727,  0.00014554,  0.00105947,  0.00348336,
        0.00866881,  0.01849946,  0.03918941,  0.02046061,  0.01036781,
        0.0051839 ,  0.00262633,  0.00131316,  0.00066518,  0.00033259,
        0.00028727,  0.00014554,  0.00105947,  0.00348336,  0.00866881,
        0.01849946,  0.03918941,  0.02046061,  0.01036781,  0.00

In [380]:
%timeit le_prime_helper(s,t,n,l)
# %timeit le_prime(sb,tb,n,l, s_len, t_len)
# %timeit _k_prime(s,t,n,l)

142 ms ± 2.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [303]:
# %%cython
# cpdef k_prime(char[] s,float t,n,l):
def _k_prime(s,t,n,l):

    #-------------------------------------------------------------------------------------#
    # Basically what's happening here is that we are succesively looping through both of  #
    # the strings and updating the kernel matrix accordingly while refering to previously #
    # computed values. This will give us time complexity O(n|s||t|) in the end.           #
    #-------------------------------------------------------------------------------------#
    
    
    #Variables:
    #
    #s is a string
    #t is a string
    #n is the length of the substring
    #l is the lambda value
    #kp is refering to k'
    #kpp is refering to k'' 
    
    #start by creating the empty matrices.
    kp = np.zeros([n,len(s)+1,len(t)+1]);
    kpp = np.zeros([n,len(s)+1,len(t)+1]);
    
    #initialize
    kp[0][:][:] = 1;
#     print(str(n) + ' ' + str(len(s))+ ' ' +str(len(t)) + '\n')
    for i in range(1,n):
        for j in range(i,len(s)):
            for k in range(i,len(t)+1):

                #check whether 'x occurs in u' as described in the paper
                if(s[j-1]!=t[k-1]):
                    kpp[i][j][k]=l*kpp[i][j][k-1];
                #if not, do the other calcs.
                else:
                    kpp[i][j][k]=l*(kpp[i][j][k-1]+l*kp[i-1][j-1][k-1]);
                
                #finally calculate kp
                kp[i][j][k-1]=l*kp[i][j-1][k-1]+kpp[i][j][k-1];
#     print(type(l))
#     print(len(s))
#     print()
#     print()
    return kp;

In [434]:
print(len(trainLabels))

20


In [423]:
categories = ['earn','crude']
numberOfTraining = [10,10]
numberOfTesting = [7,2]


trainData,trainLabels, testData,testLabel = fd.loadData(categories,numberOfTraining,numberOfTesting)

n = 2 # Length of subsequence
l = 0.5 # Lambda value

start_generate = time.time()
 # Generate a classifier that we can use for prediction.
clf = generateClassifier( trainData, trainLabels, n, l,categories)
print('\n Classifier built')

start_pred = time.time()
res = predict(testData,trainData,clf,n,l)
end_pred = time.time()

print('\n Prediction done')
print(res)

score(res,testLabel,categories)

print('\nGenerate Total Time:'+str(start_pred-start_generate))
print('Predict Total Time: '+str(end_pred-start_pred))


<class 'list'>
<class 'int'>
<class 'float'>
['southern california edison co lt sce qtly div qtly div cts vs cts pay april record april ', 'auditors lift qualification brunswick lt bc auditors brunswick corp lifted four year qualification company financial statements vice president finance frederick florjancic told securities analysts financial results diversified leisure defense aerospace company qualifed arthur andersen co since qualification related certain tax liabilities amounting mln dlrs associated medical division sold brunswick said ', 'institute clinical pharm plc lt icpyy year shr ct vs cts net vs revs vs note dollar amounts converted irish pounds noon buying rate federal reserve bank new york dec dlr per one irish pound equivalent rate dec dlr equals one irsh pound full name company institute clinical pharmacology plc based dulbin ireland ', 'ionics inc lt ion th qtr net shr cts vs cts net vs revs mln vs mln year shr cts vs cts net vs revs mln vs mln backlog mln vs mln note

 10%|█         | 2/20 [00:17<02:38,  8.80s/it]

KeyboardInterrupt: 