Experiment with dpseg segmentation.
Evaluation: 5 repeated random 50/50 train/test splits (not a strict 5-fold partition).

In [1]:
from gensim.models import Word2Vec
import annoy # fast approximate KNN
import numpy as np
from collections import Counter

In [2]:
# Peek at the first lines of the annotation table; its header names the
# columns gene, seq and annot (quoted, space-separated fields).
!head ./UP000002311.goByGeneC

gene seq annot
"P32317" "MIALKPNAVRTFRQVQHCSFRICRYQSTKSNKCLTPLQEYDRLVKLGKLRDDTYQRGIISSLGDLYDSLVKYVPPVVKTPNAVDQVGGWLNGLKSVFSRGKPKNIGAYVDVSKIGNSIPRGVYLYGDVGCGKTMLMDLFYTTIPNHLTKKRIHFHQFMQYVHKRSHEIVREQNLKELGDAKGKEIDTVPFLAAEIANNSHVLCFDEFQVTDVADAMILRRLMTALLSDDYGVVLFATSNRHPDELYINGVQRQSFIPCIELIKHRTKVIFLNSPTDYRKIPRPVSSVYYFPSDTSIKYASKECKTRRETHIKEWYNYFAQASHTDDSTDSHTVHKTFYDYPLTIWGREFKVPKCTPPRVAQFTFKQLCGEPLAAGDYLTLAKNFEAFIVTDIPYLSIYVRDEVRRFITFLDAVYDSGGKLATTGAADFSSLFVEPEQILNDFELRPTTKEPDSVDTGMVDEMVEKHGFSKEIAKKSQMFALDEERFAFARALSRLSQMSSTDWVTKPTY" "GO:0005575, GO:0005622, GO:0005623, GO:0005737, GO:0005739, GO:0043226, GO:0043227, GO:0043229, GO:0043231, GO:0044424, GO:0044444, GO:0044464"
"P87275" "MIEEKKELKKRRVLQMARFYGAAAFTLITMRLISRAIKVRKYVPSIFQQNYKLPPFSQRNEAMSALTYASAASIGTFSTLIFGFCWALDISTAREFVFKTREFMSLPQALETDTSMDEETSKLTKQLQDLLSSENNK" "GO:0005575, GO:0005622, GO:0005623, GO:0005737, GO:0005739, GO:0043226, GO:0043227, GO:0043229, GO:0043231, GO:0044424, GO:0044444, GO:0044464"
"P39010" "MVNELENVPRAS

In [3]:
# GO terms that are kept as classification targets; every other
# annotation found in the input table is ignored.
good_labels = {
    'GO:0005634',  # nucleus
    'GO:0005576',  # extracellular
    'GO:0005829',  # cytosol
    'GO:0005856',  # cytoskeleton
    'GO:0005739',  # mitochondrion
    'GO:0005886',  # cell membrane
    'GO:0005783',  # endoplasmic reticulum
    'GO:0005794',  # golgi apparatus
    'GO:0005773',  # vacuole
    'GO:0005777',  # peroxisome
}

In [4]:
X = []  # segmented sequences: one list of whitespace-separated tokens per kept row
Y = []  # for each kept sequence, its labels restricted to good_labels
all_y = set()  # every label that occurs at least once in Y

# The annotation table and the segmentation output are read in lockstep:
# after the table's header line, row k of each file is paired together.
# NOTE(review): this assumes myout.words has no header and lists the
# sequences in the same order as the table -- confirm.
with open('./UP000002311.goByGeneC') as inp, \
        open('../dpseg-1.2.1/myout.words') as inp2:
    inp.readline()  # header
    for line, line2 in zip(inp, inp2):
        # Fields are gene, seq, annot. The annot field contains spaces
        # itself, so split at most twice.
        tokens = [field.strip('"') for field in line.split(' ', 2)]
        if len(tokens) < 3:
            # Blank or malformed row: nothing to label, skip the pair.
            continue

        # Annotations are comma-separated; drop surrounding whitespace and
        # quotes, discard empty entries, keep only the labels of interest.
        object_labels = [t.strip().strip('"') for t in tokens[2].split(',') if t.strip()]
        object_labels = [y for y in object_labels if y in good_labels]
        if object_labels:
            X.append(line2.strip().split())
            Y.append(object_labels)
            all_y.update(object_labels)

In [5]:
# Quick sanity check of what was loaded: a few samples and the totals.
summary_rows = [
    ('example of X', X[:3]),
    ('example of Y', Y[:3]),
    ('example of set of labels', list(all_y)[:3]),
    ('total sequences', len(X)),
    ('total labels', len(all_y)),
]
for caption, value in summary_rows:
    # Single pre-formatted string: caption, one space, then str(value).
    print('%s %s' % (caption, value))

example of X [['M', 'IA', 'L', 'K', 'PN', 'A', 'V', 'RT', 'FR', 'Q', 'VQ', 'HC', 'S', 'FR', 'I', 'C', 'R', 'YQ', 'STK', 'SN', 'KC', 'LTP', 'LQE', 'Y', 'DRL', 'VK', 'L', 'G', 'KLR', 'DD', 'TY', 'Q', 'R', 'G', 'II', 'SS', 'LG', 'DL', 'YD', 'SL', 'V', 'KY', 'V', 'P', 'P', 'VV', 'K', 'TP', 'N', 'AV', 'D', 'Q', 'V', 'G', 'G', 'W', 'LN', 'G', 'L', 'K', 'SV', 'F', 'SR', 'G', 'K', 'PK', 'N', 'IG', 'A', 'Y', 'V', 'DV', 'SK', 'IG', 'N', 'SI', 'P', 'RG', 'VY', 'L', 'Y', 'GD', 'V', 'GCG', 'KT', 'M', 'L', 'MD', 'LF', 'YT', 'T', 'IP', 'N', 'HL', 'TK', 'KR', 'IH', 'F', 'H', 'QF', 'M', 'QY', 'VH', 'K', 'RS', 'H', 'EI', 'V', 'R', 'E', 'Q', 'N', 'LKEL', 'GD', 'A', 'K', 'G', 'K', 'EI', 'D', 'T', 'V', 'PFL', 'AAE', 'I', 'AN', 'NS', 'HV', 'L', 'C', 'F', 'DEF', 'Q', 'V', 'T', 'DV', 'AD', 'A', 'M', 'IL', 'RRL', 'M', 'T', 'A', 'L', 'LS', 'D', 'D', 'YG', 'VV', 'L', 'F', 'A', 'TS', 'N', 'R', 'HPD', 'E', 'LY', 'IN', 'G', 'V', 'QRQ', 'SF', 'IP', 'CI', 'ELI', 'K', 'HR', 'TKV', 'IF', 'LN', 'SPT', 'DY', 'RKI', 'PR',

In [6]:
def _centroid(model, words):
    """Mean embedding of the in-vocabulary `words`, or None if unavailable."""
    vectors = [model[w] for w in words if w in model]
    if not vectors:
        # No known word at all: there is nothing to average.
        return None
    center = np.mean(vectors, axis=0)
    if np.isnan(np.mean(center)):
        return None
    return center


def cv_iter(w2v_params, nn_num):
    """
    One iteration of cross-validation.

    The module-level X / Y are shuffled and split in half. Word2Vec is
    trained on the training half, every sequence is represented by the
    centroid of its word vectors, and each test sequence receives the most
    frequent label(s) among its `nn_num` nearest training sequences
    (all labels tied for the top count are kept).

    w2v_params -- dictionary with parameters of gensim.Word2Vec
    nn_num     -- number of nearest neighbours that vote for the labels

    Returns (precision, recall, F1), computed from true/false positive
    counts summed over all test sequences. A score whose denominator is
    zero is reported as 0.0.
    """
    indices = list(range(len(X)))
    np.random.shuffle(indices)
    half = len(indices) // 2
    train_ids, test_ids = indices[:half], indices[half:]

    # Copy each token list so that training never touches the global X.
    Xtrain = [list(X[i]) for i in train_ids]
    Ytrain = [Y[i] for i in train_ids]
    Xtest = [list(X[i]) for i in test_ids]
    Ytest = [Y[i] for i in test_ids]

    model = Word2Vec(Xtrain, **w2v_params)
    index = annoy.AnnoyIndex(w2v_params.get('size', 100))  # default value is 100 for gensim.Word2Vec
    for i, words in enumerate(Xtrain):
        center = _centroid(model, words)
        if center is None:
            # NOTE(review): skipped sequences leave gaps in the annoy item
            # ids -- confirm how annoy treats ids that were never added.
            continue
        index.add_item(i, center)
    index.build(100)  # 100 is the number of trees in the index; this value can be calibrated

    predicted = []
    for words in Xtest:
        center = _centroid(model, words)
        if center is None:
            neighbour_ids = []
        else:
            neighbour_ids = index.get_nns_by_vector(center, nn_num)  # nn_num nearest neighbours
        votes = Counter()
        for neighbour in neighbour_ids:
            votes.update(Ytrain[neighbour])
        if votes:
            top_count = max(votes.values())
            predicted.append([label for label, count in votes.items() if count == top_count])
        else:
            predicted.append([])

    tp = 0
    fp = 0
    true_label_num = 0
    for true_labels, predicted_labels in zip(Ytest, predicted):
        true = set(true_labels)
        pred = set(predicted_labels)
        hits = len(true & pred)
        true_label_num += len(true)
        tp += hits
        fp += len(pred) - hits

    # Guard every ratio: an empty prediction set or an empty test half
    # must not raise ZeroDivisionError.
    prec = tp * 1.0 / (tp + fp) if (tp + fp) else 0.0
    rec = tp * 1.0 / true_label_num if true_label_num else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
    return prec, rec, f1
    

In [7]:
def cv(w2v_params=None, cv_num=5, nn_num=1):
    """
    Cross-validation function.

    Runs cv_iter `cv_num` times, each with its own random split.

    w2v_params -- dictionary with parameters of gensim.Word2Vec
                  (None means "use the library defaults")
    cv_num     -- number of repetitions
    nn_num     -- number of nearest neighbours used for prediction

    Returns three lists (precisions, recalls, F1 values), one entry per
    repetition.
    """
    if w2v_params is None:
        # A fresh dict per call instead of a shared mutable default.
        w2v_params = {}
    precs, recs, fs = [], [], []
    for _ in range(cv_num):
        p, r, f = cv_iter(w2v_params, nn_num=nn_num)
        precs.append(p)
        recs.append(r)
        fs.append(f)
    return precs, recs, fs

In [8]:
# 1 nearest neighbour: mean precision, recall and F1 over the repetitions.
p, r, f = cv(nn_num=1)
tuple(np.mean(scores) for scores in (p, r, f))

(0.3380580084716379, 0.3446192771742729, 0.34126854663591494)

In [9]:
# 2 nearest neighbours: mean precision, recall and F1 over the repetitions.
p, r, f = cv(nn_num=2)
tuple(np.mean(scores) for scores in (p, r, f))

(0.32355996427380057, 0.4552451894426387, 0.3782447191409461)

In [10]:
# 5 nearest neighbours: mean precision, recall and F1 over the repetitions.
p, r, f = cv(nn_num=5)
tuple(np.mean(scores) for scores in (p, r, f))

(0.4184919935751255, 0.40946009454937415, 0.4139009442855791)

In [11]:
# 10 nearest neighbours: mean precision, recall and F1 over the repetitions.
p, r, f = cv(nn_num=10)
tuple(np.mean(scores) for scores in (p, r, f))

(0.466098212954524, 0.3996496482483261, 0.4303217021068054)

In [12]:
# 15 nearest neighbours: mean precision, recall and F1 over the repetitions.
p, r, f = cv(nn_num=15)
tuple(np.mean(scores) for scores in (p, r, f))

(0.48437204683992735, 0.39640498020503107, 0.4359767202504729)

In [13]:
# 20 nearest neighbours: mean precision, recall and F1 over the repetitions.
p, r, f = cv(nn_num=20)
tuple(np.mean(scores) for scores in (p, r, f))

(0.4924475567166704, 0.39267578556032345, 0.43692428123592053)