In [1]:
import pickle
import numpy as np
from collections import Counter

In [2]:
# load model
with open('best_results/best_nonrandom/trained_mdl.pickle', 'rb') as f:
    trained_model = pickle.load(f)

In [3]:
with open('best_results/best_nonrandom/possible_6mers.pickle', 'rb') as f:
    possible_6mers = pickle.load(f)

In [4]:
trained_model.coef_.shape

(15, 4096)

In [5]:
meaning = [
    (1, 2),
    (1, 3),
    (1, 4),
    (1, 5),
    (1, 6),
    (2, 3),
    (2, 4),
    (2, 5),
    (2, 6),
    (3, 4),
    (3, 5),
    (3, 6),
    (4, 5),
    (4, 6),
    (5, 6)
]

In [6]:
def most_predictive_kmers(class_num, coef, top_k=1):
    ret = []
    for meaning_tuple, tuple_coefs in zip(meaning, coef):
        if class_num not in meaning_tuple:
            continue
        if class_num == meaning_tuple[0]:
            best_index = (-tuple_coefs).argsort()[:top_k]
        else:
            best_index = (tuple_coefs).argsort()[:top_k]
        print("{} {}".format(
            meaning_tuple,
            [
                (bi, possible_6mers[bi], tuple_coefs[bi])
                for bi in best_index
            ]
        ))
        ret.append((meaning_tuple, [
                (bi, possible_6mers[bi], tuple_coefs[bi])
                for bi in best_index
            ]))
    return ret

In [7]:
ret_1 = most_predictive_kmers(1, trained_model.coef_, top_k=3)

(1, 2) [(1084, 'CAATTA', 0.01773984944202841), (3959, 'TTCTCT', 0.017719768815730508), (2007, 'CTTCCT', 0.017686474873381423)]
(1, 3) [(1503, 'CCTCTT', 0.02270160551700845), (2044, 'CTTTTA', 0.020997951593760625), (4064, 'TTTGAA', 0.019826044925772945)]
(1, 4) [(240, 'AATTAA', 0.017367417656043375), (3840, 'TTAAAA', 0.016935983182081643), (3132, 'TAATTA', 0.016529685242000363)]
(1, 5) [(4030, 'TTGTTG', 0.021667983254479637), (4062, 'TTTCTG', 0.020819795775675942), (1017, 'ATTTGC', 0.019409410453400465)]
(1, 6) [(240, 'AATTAA', 0.019384334099092043), (2044, 'CTTTTA', 0.01843625139698457), (3388, 'TCATTA', 0.017368551722240243)]


In [8]:
ret_2 = most_predictive_kmers(2, trained_model.coef_, top_k=3)

(1, 2) [(3279, 'TATATT', -0.024572626719548102), (4095, 'TTTTTT', -0.02069161747017481), (633, 'AGCTGC', -0.02005186620489171)]
(2, 3) [(2044, 'CTTTTA', 0.01593118697267573), (1023, 'ATTTTT', 0.015392210894668037), (2, 'AAAAAG', 0.015216483475749658)]
(2, 4) [(1938, 'CTGCAG', 0.018953740299680173), (831, 'ATATTT', 0.01868148693544301), (3, 'AAAAAT', 0.01840306593979321)]
(2, 5) [(633, 'AGCTGC', 0.02069472143870551), (3573, 'TCTTCC', 0.017621270065447343), (5, 'AAAACC', 0.016073442741811262)]
(2, 6) [(633, 'AGCTGC', 0.02293954892095259), (3111, 'TAAGCT', 0.019225171125558032), (3279, 'TATATT', 0.01830525524472336)]


In [9]:
ret_3 = most_predictive_kmers(3, trained_model.coef_, top_k=3)

(1, 3) [(1019, 'ATTTGT', -0.026056311960086613), (3279, 'TATATT', -0.02295469119605624), (783, 'ATAATT', -0.022446382752197416)]
(2, 3) [(963, 'ATTAAT', -0.022567449145081082), (2602, 'GGAGGG', -0.01955499411267846), (3711, 'TGCTTT', -0.018452391372267743)]
(3, 4) [(2045, 'CTTTTC', 0.018639079952260017), (2602, 'GGAGGG', 0.018455480026893012), (0, 'AAAAAA', 0.015617451284519815)]
(3, 5) [(2602, 'GGAGGG', 0.01979010961624103), (1264, 'CATTAA', 0.018392789537144502), (3410, 'TCCCAG', 0.01741456432654774)]
(3, 6) [(3711, 'TGCTTT', 0.024740015092917615), (2602, 'GGAGGG', 0.021785459910915184), (1264, 'CATTAA', 0.02051752511732844)]


In [10]:
ret_4 = most_predictive_kmers(4, trained_model.coef_, top_k=3)

(1, 4) [(4095, 'TTTTTT', -0.023522563280317266), (1160, 'CAGAGA', -0.01543167379604437), (3818, 'TGTGGG', -0.014838321730770765)]
(2, 4) [(1514, 'CCTGGG', -0.016412620308675477), (2178, 'GAGAAG', -0.015612254037728996), (4014, 'TTGGTG', -0.01430932254681098)]
(3, 4) [(3540, 'TCTCCA', -0.017853288931109587), (1314, 'CCAGAG', -0.014564549365601705), (1952, 'CTGGAA', -0.013467514295499423)]
(4, 5) [(4095, 'TTTTTT', 0.017992538418427406), (297, 'ACAGGC', 0.01719619334803904), (3933, 'TTCCTC', 0.016316037680987036)]
(4, 6) [(4095, 'TTTTTT', 0.021480272586380078), (1156, 'CAGACA', 0.01237797965718166), (1, 'AAAAAC', 0.012067607750539944)]


In [11]:
ret_5 = most_predictive_kmers(5, trained_model.coef_, top_k=3)

(1, 5) [(1246, 'CATCTG', -0.02061986417532504), (2062, 'GAAATG', -0.01818266254893662), (4046, 'TTTATG', -0.017831769808307613)]
(2, 5) [(4, 'AAAACA', -0.02008694775189509), (2062, 'GAAATG', -0.020045298355785224), (324, 'ACCACA', -0.018973582506861725)]
(3, 5) [(1952, 'CTGGAA', -0.020980321774676434), (1983, 'CTGTTT', -0.018066157338834404), (1885, 'CTCCTC', -0.01754250750449473)]
(4, 5) [(4046, 'TTTATG', -0.018657945080165238), (60, 'AAATTA', -0.018534988651231835), (1365, 'CCCCCC', -0.017861086233831716)]
(5, 6) [(2040, 'CTTTGA', 0.01813361839526416), (2032, 'CTTTAA', 0.016903150698555372), (4039, 'TTTACT', 0.016044470505381)]


In [12]:
ret_6 = most_predictive_kmers(6, trained_model.coef_, top_k = 3)

(1, 6) [(1011, 'ATTTAT', -0.018040526998526582), (2062, 'GAAATG', -0.01604349072267594), (3402, 'TCCAGG', -0.01415902274260089)]
(2, 6) [(4063, 'TTTCTT', -0.018874209916643056), (2007, 'CTTCCT', -0.018556702208293217), (160, 'AAGGAA', -0.01839560776063265)]
(3, 6) [(1495, 'CCTCCT', -0.016752769785578617), (3822, 'TGTGTG', -0.016645004053853163), (3957, 'TTCTCC', -0.016149549061544956)]
(4, 6) [(3, 'AAAAAT', -0.01402876987723059), (60, 'AAATTA', -0.013304624870057694), (0, 'AAAAAA', -0.01249074914689452)]
(5, 6) [(3839, 'TGTTTT', -0.01511840135056769), (4030, 'TTGTTG', -0.014277727038308277), (2344, 'GCAGGA', -0.013858394210009915)]


In [13]:
rets = ret_1 + ret_2 + ret_3 + ret_4 + ret_5 + ret_6

In [14]:
rets[0]

((1, 2),
 [(1084, 'CAATTA', 0.01773984944202841),
  (3959, 'TTCTCT', 0.017719768815730508),
  (2007, 'CTTCCT', 0.017686474873381423)])

In [15]:
len(rets)

30

In [16]:
predictive_kmers = [
    tup[1]
    for r in rets
    for tup in r[1]
]

In [17]:
Counter(predictive_kmers).most_common(10)

[('TTTTTT', 4),
 ('GGAGGG', 4),
 ('AGCTGC', 3),
 ('CTTTTA', 3),
 ('TATATT', 3),
 ('GAAATG', 3),
 ('AAAAAT', 2),
 ('CATTAA', 2),
 ('TGCTTT', 2),
 ('AAATTA', 2)]

In [18]:
def compute_gc_content(predictive_kmers):
    total_count = sum([
        len(kmer) for kmer in predictive_kmers
    ])
    
    gc_count = sum([
        sum([
            1 for base in kmer if base in ['G', 'C']
        ])
        for kmer in predictive_kmers
    ])
    
    return gc_count / total_count

In [19]:
compute_gc_content(predictive_kmers)

0.3277777777777778

In [20]:
def pretty_print(rets):
    twod_array = []
    for i, ret in enumerate(rets):
        if i % 5 == 0:
            twod_array.append([])
        twod_array[-1].append([
            kmer[1]
            for kmer in ret[1]
        ])
    print('            | forebrain | midbrain | hindbrain | heart  |  limb  | others |')
    print('|-----------|-----------|----------|-----------|--------|--------|--------|')
    for i, (row, name) in enumerate(zip(twod_array, [
        '| forebrain |', 
        '| midbrain  |', 
        '| hindbrain |',
        '|   heart   |',
        '|    limb   |',
        '|   others  |'])):
        row1_stuff = [row[0][0], row[1][0], row[2][0], row[3][0], row[4][0]]
        row2_stuff = [row[0][1], row[1][1], row[2][1], row[3][1], row[4][1]]
        row3_stuff = [row[0][2], row[1][0], row[2][2], row[3][2], row[4][2]]
        row1_stuff.insert(i, '      ')
        row2_stuff.insert(i, '      ')
        row3_stuff.insert(i, '      ')
        print('|           |   {}  |  {}  |  {}   | {} | {} | {} |'.format(
            row1_stuff[0], row1_stuff[1], row1_stuff[2], row1_stuff[3], row1_stuff[4], row1_stuff[5], 
        ))
        print('{}   {}  |  {}  |  {}   | {} | {} | {} |'.format(
            name, row2_stuff[0], row2_stuff[1], row2_stuff[2], row2_stuff[3], row2_stuff[4], row2_stuff[5], 
        ))
        print('|           |   {}  |  {}  |  {}   | {} | {} | {} |'.format(
            row3_stuff[0], row3_stuff[1], row3_stuff[2], row3_stuff[3], row3_stuff[4], row3_stuff[5], 
        ))
        print('|-----------|-----------|----------|-----------|--------|--------|--------|')
    return twod_array

In [22]:
pretty_print(rets)

            | forebrain | midbrain | hindbrain | heart  |  limb  | others |
|-----------|-----------|----------|-----------|--------|--------|--------|
|           |           |  CAATTA  |  CCTCTT   | AATTAA | TTGTTG | AATTAA |
| forebrain |           |  TTCTCT  |  CTTTTA   | TTAAAA | TTTCTG | CTTTTA |
|           |           |  CTTCCT  |  CCTCTT   | TAATTA | ATTTGC | TCATTA |
|-----------|-----------|----------|-----------|--------|--------|--------|
|           |   TATATT  |          |  CTTTTA   | CTGCAG | AGCTGC | AGCTGC |
| midbrain  |   TTTTTT  |          |  ATTTTT   | ATATTT | TCTTCC | TAAGCT |
|           |   AGCTGC  |          |  CTTTTA   | AAAAAT | AAAACC | TATATT |
|-----------|-----------|----------|-----------|--------|--------|--------|
|           |   ATTTGT  |  ATTAAT  |           | CTTTTC | GGAGGG | TGCTTT |
| hindbrain |   TATATT  |  GGAGGG  |           | GGAGGG | CATTAA | GGAGGG |
|           |   ATAATT  |  ATTAAT  |           | AAAAAA | TCCCAG | CATTAA |
|-----------

[[['CAATTA', 'TTCTCT', 'CTTCCT'],
  ['CCTCTT', 'CTTTTA', 'TTTGAA'],
  ['AATTAA', 'TTAAAA', 'TAATTA'],
  ['TTGTTG', 'TTTCTG', 'ATTTGC'],
  ['AATTAA', 'CTTTTA', 'TCATTA']],
 [['TATATT', 'TTTTTT', 'AGCTGC'],
  ['CTTTTA', 'ATTTTT', 'AAAAAG'],
  ['CTGCAG', 'ATATTT', 'AAAAAT'],
  ['AGCTGC', 'TCTTCC', 'AAAACC'],
  ['AGCTGC', 'TAAGCT', 'TATATT']],
 [['ATTTGT', 'TATATT', 'ATAATT'],
  ['ATTAAT', 'GGAGGG', 'TGCTTT'],
  ['CTTTTC', 'GGAGGG', 'AAAAAA'],
  ['GGAGGG', 'CATTAA', 'TCCCAG'],
  ['TGCTTT', 'GGAGGG', 'CATTAA']],
 [['TTTTTT', 'CAGAGA', 'TGTGGG'],
  ['CCTGGG', 'GAGAAG', 'TTGGTG'],
  ['TCTCCA', 'CCAGAG', 'CTGGAA'],
  ['TTTTTT', 'ACAGGC', 'TTCCTC'],
  ['TTTTTT', 'CAGACA', 'AAAAAC']],
 [['CATCTG', 'GAAATG', 'TTTATG'],
  ['AAAACA', 'GAAATG', 'ACCACA'],
  ['CTGGAA', 'CTGTTT', 'CTCCTC'],
  ['TTTATG', 'AAATTA', 'CCCCCC'],
  ['CTTTGA', 'CTTTAA', 'TTTACT']],
 [['ATTTAT', 'GAAATG', 'TCCAGG'],
  ['TTTCTT', 'CTTCCT', 'AAGGAA'],
  ['CCTCCT', 'TGTGTG', 'TTCTCC'],
  ['AAAAAT', 'AAATTA', 'AAAAAA'],
  ['TGTTT

In [87]:
print('            | forebrain | mindbrain | hindbrain | heart  |  limb  | others |')
print('|-----------|-----------|-----------|-----------|--------|--------|--------|')
print('|           |   CTTCCT  |  CTTCCT   |   CTTCCT  | CTTCCT | CTTCCT | CTTCCT |')
print('| forebrain |   CTTCCT  |  CTTCCT   |   CTTCCT  | CTTCCT | CTTCCT | CTTCCT |')
print('|           |   CTTCCT  |  CTTCCT   |   CTTCCT  | CTTCCT | CTTCCT | CTTCCT |')
print('|-----------|-----------|-----------|-----------|--------|--------|--------|')

            | forebrain | mindbrain | hindbrain | heart  |  limb  | others |
|-----------|-----------|-----------|-----------|--------|--------|--------|
|           |   CTTCCT  |  CTTCCT   |   CTTCCT  | CTTCCT | CTTCCT | CTTCCT |
| forebrain |   CTTCCT  |  CTTCCT   |   CTTCCT  | CTTCCT | CTTCCT | CTTCCT |
|           |   CTTCCT  |  CTTCCT   |   CTTCCT  | CTTCCT | CTTCCT | CTTCCT |
|-----------|-----------|-----------|-----------|--------|--------|--------|
