In [182]:
import numpy as np
import pandas as pd
import regex as re

from itertools import product
from functools import reduce

from glob import glob
#from nltk import FreqDist, ConditionalProbDist

from scipy.io import wavfile
from sklearn.preprocessing import normalize

In [124]:
def transition_matrix(data):
    """Estimate a row-normalised symbol-to-symbol transition matrix.

    Every pair of adjacent characters in every string of ``data`` is
    counted (pairs overlap, so ``'ppp'`` contributes two ``p -> p``
    transitions). Pairs in which either character is not one of the seven
    known symbols (for example a trailing newline) are ignored.

    Parameters
    ----------
    data : iterable of str
        Training strings for one language.

    Returns
    -------
    numpy.ndarray
        A ``(7, 7)`` float array. Entry ``[i, j]`` is the count of
        ``symbols[i]`` followed by ``symbols[j]`` divided by the total
        count in row ``i``. Rows with no observations stay all-zero.
        An empty ``data`` yields an all-zero matrix.
    """
    symbols = ['A','o','e','t','p','g','k']
    index_of = {symbol: position for position, symbol in enumerate(symbols)}

    counts = np.zeros((len(symbols), len(symbols)))

    for s in data:
        # One pass over adjacent pairs gives the same counts as searching
        # for each of the 49 two-symbol patterns with overlap enabled.
        for first, second in zip(s, s[1:]):
            row = index_of.get(first)
            col = index_of.get(second)
            if row is not None and col is not None:
                counts[row, col] += 1

    # L1 row normalisation; a zero row is divided by 1 so it stays zero
    # instead of turning into NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    return counts / row_totals

#cond_freq_table = np.vectorize(cond_freq_table)

In [152]:
def likelihood(data, tm_language, log=False):
    """Likelihood of a symbol string under a transition matrix.

    Each pair of adjacent characters in ``data`` is looked up in
    ``tm_language`` and the looked-up values are multiplied together.
    Pairs containing a newline are skipped.

    Parameters
    ----------
    data : str
        The symbol string to score.
    tm_language : numpy.ndarray
        Matrix indexed as ``tm_language[i, j]`` where ``i`` and ``j`` are
        positions in the symbol list ``A, o, e, t, p, g, k``.
    log : bool, optional
        If true, return the sum of the natural logs of the transition
        values instead of their product. The raw product becomes extremely
        small for long strings and can underflow to ``0.0``; the log form
        does not. Defaults to ``False`` (plain product).

    Returns
    -------
    float
        The product (or log-sum) over all adjacent pairs. A string with no
        pairs gives ``1.0`` (or ``0.0`` when ``log`` is true).

    Raises
    ------
    ValueError
        If ``data`` contains a character other than a newline that is not
        one of the known symbols.
    """
    symbols = ['A','o','e','t','p','g','k']
    index_of = {symbol: position for position, symbol in enumerate(symbols)}

    duplet_cells = []
    for first, second in zip(data, data[1:]):
        if first == '\n' or second == '\n':
            continue
        for char in (first, second):
            if char not in index_of:
                raise ValueError('{!r} is not a known symbol'.format(char))
        duplet_cells.append((index_of[first], index_of[second]))

    values = [tm_language[cell] for cell in duplet_cells]

    if log:
        # log(0) is -inf, which is the correct score for an impossible
        # transition; silence the divide-by-zero warning it would emit.
        with np.errstate(divide='ignore'):
            return np.sum(np.log(np.asarray(values, dtype=float)))

    # np.prod replaces np.product, which was removed in NumPy 2.0.
    return np.prod(values)

In [168]:
def classify(data,probs=False):
    """Assign a symbol string to language 'A', 'B' or 'C'.

    The string is scored with ``likelihood`` against the module-level
    transition matrices ``tm_a``, ``tm_b`` and ``tm_c``.

    Parameters
    ----------
    data : str
        The symbol string to classify.
    probs : bool, optional
        If true, return the three likelihoods divided by the largest of
        them (so the best language scores 1.0) instead of a label.

    Returns
    -------
    str or numpy.ndarray
        ``'A'``, ``'B'`` or ``'C'`` for the language with the strictly
        largest likelihood, or the length-3 array of relative likelihoods
        when ``probs`` is true.

    Raises
    ------
    ValueError
        If no language has a strictly largest likelihood (a tie, which
        includes the case where every likelihood has underflowed to 0).
    """
    lkl_a = likelihood(data,tm_a)
    lkl_b = likelihood(data,tm_b)
    lkl_c = likelihood(data,tm_c)

    if probs:
        return np.array([lkl_a,lkl_b,lkl_c])/max(lkl_a,lkl_b,lkl_c)

    if lkl_a > lkl_b and lkl_a > lkl_c:
        return 'A'
    if lkl_b > lkl_c and lkl_b > lkl_a:
        return 'B'
    if lkl_c > lkl_a and lkl_c > lkl_b:
        return 'C'

    # The original `raise('ValueError')` raised a plain string, which Python
    # rejects with a TypeError; raise the intended exception instead.
    raise ValueError(
        'no unique most likely language: A={}, B={}, C={}'.format(lkl_a, lkl_b, lkl_c)
    )

# Apply classify element-wise so a list of strings yields an array of labels.
# NOTE(review): with probs=True each call returns a length-3 array, which the
# default np.vectorize output handling probably cannot pack for multi-element
# input - confirm before relying on probs=True for lists.
classify = np.vectorize(classify)

In [38]:
def _read_files(pattern):
    """Return the contents of every file matching ``pattern``.

    Paths are processed in sorted order so the result does not depend on
    the order in which the filesystem happens to list them, and each file
    is closed as soon as it has been read.
    """
    contents = []
    for path in sorted(glob(pattern)):
        with open(path, 'r') as handle:
            contents.append(handle.read())
    return contents


# data['training'][<language>] -> list of training strings for that language
# data['test']                 -> list of unlabelled test strings
data = {}

data['training'] = {}
for lang in ['A','B','C']:
    data['training'][lang] = _read_files('symbol/language-training-lang{}*'.format(lang))

data['test'] = _read_files('symbol/language-test*')

In [126]:
# One transition matrix per language, estimated from that language's training strings.
tm_a, tm_b, tm_c = (
    transition_matrix(data['training'][language])
    for language in ("A", "B", "C")
)

In [153]:
# Spot check: likelihood of the first test string under language A's transition matrix.
likelihood(data['test'][0],tm_a)

1.530622072665416e-104

In [181]:
# Table of every test string next to the language predicted for it.
pd.DataFrame({
    'string': data['test'],
    'language': classify(data['test']),
})


Unnamed: 0,string,language
0,AtAgegegegAgegetoAtetAogAooAoeAtegAgeotAoAgoet...,B
1,popoktgopogettpegAtkegotkogettpApepApegtgApeog...,A
2,okopottogttpetgopopotkepeppAgAgetpotpAtgookett...,A
3,ekogoAgkepokogoppAttpAttgeekApegepApotpAAtpetg...,A
4,tgAkoggAggAtpetkpAgegAgkepepookepepekogokogetk...,A
5,pppooootgAookggggtttopAtttkkkeeggeeeeAAAgtkoAk...,C
6,ApgotgAApogotgopAgegAtkAgketgAtkpAgottpopopAgp...,A
7,oeeoppppppoooAgggggoAAoookkppeoAApeoAkAAAAAAAp...,C
8,gooooAAAAAAAAAkkkkkkooooAAAeppppppgeeeeepAAppe...,C
9,kteoeoeoAoteoAgeoetoegAgeoekgeteoegetetAgeAoAt...,B


(44100,
 array([-10063, -12498, -14671, ...,  -3456,  -6123,  -7756], dtype=int16))