In [1]:
import pandas as pd

## Task

The best thing to do for the alignment gaps is what I outlined yesterday. As I was typing, I realized we already have reads mapped to S288c, so we just need the BAM files. For a given gap / chunk of S288c that we’re trying to assess:

1) Pull the mapped reads for the aneuploid corresponding to the gap we’re assessing (e.g. for a gap on Chr7, get reads for the Dis7 aneuploid).
2) We could look at the read pileup, I might need to see some before assessing if we need to get fancier - I bet for most regions, the sequence is actually there with coverage across the whole gap.  If so:
3) Calculate the median read count in that chunk (normalized to the length of the chunk) and compare it to the median read count from chunks (or it could be all genes) that are NOT on that chromosome. We expect that if that gap is in YPS1009 AND it’s on the chromosome in question, the read count should be ~2X that of genomic sequence NOT on that chromosome (since the chunk should be duplicated in this particular aneuploid strain).


In [14]:
# Load the RPKM table (tab-separated); the first column becomes the row index.
rpkm = pd.read_csv(
    'James_220427_KC5T7-RPKM.results',
    sep='\t',
    index_col=0,
)

In [15]:
# Show the raw column labels of the RPKM table.
rpkm.columns

Index(['Gasch-1240_S29', 'Gasch-1234_S23', 'Gasch-1235_S24', 'Gasch-1214_S3',
       'Gasch-1221_S10', 'Gasch-1218_S7', 'Gasch-1237_S26', 'Gasch-1222_S11',
       'Gasch-1215_S4', 'Gasch-1223_S12', 'Gasch-1220_S9', 'Gasch-1219_S8',
       'Gasch-1236_S25', 'Gasch-1224_S13', 'Gasch-1216_S5', 'Gasch-1249_S38',
       'Gasch-1225_S14', 'Gasch-1258_S47', 'Gasch-1231_S20', 'Gasch-1248_S37',
       'Gasch-1227_S16', 'Gasch-1232_S21', 'Gasch-1217_S6', 'Gasch-1259_S48',
       'Gasch-1233_S22', 'Gasch-1226_S15', 'Gasch-1250_S39', 'Gasch-1254_S43',
       'Gasch-1246_S35', 'Gasch-1230_S19', 'Gasch-1260_S49', 'Gasch-1255_S44',
       'Gasch-1228_S17', 'Gasch-1247_S36', 'Gasch-1257_S46', 'Gasch-1229_S18',
       'Gasch-1245_S34', 'Gasch-1256_S45', 'Gasch-1244_S33', 'Gasch-1212_S1',
       'Gasch-1243_S32', 'Gasch-1242_S31', 'Gasch-1251_S40', 'Gasch-1238_S27',
       'Gasch-1241_S30', 'Gasch-1252_S41', 'Gasch-1213_S2', 'Gasch-1253_S42',
       'Gasch-1239_S28'],
      dtype='object')

In [4]:
# Load the sample sheet (tab-separated).
samples = pd.read_csv(
    'YPS1009-Disomes_GenomicSeq.txt',
    sep='\t',
)

In [8]:
# Lookup from sequencing code to strain name. Spaces in the code are replaced
# by '-' (presumably so the keys match the 'Gasch-NNNN' prefix of the RPKM
# column labels -- verify against the sample sheet).
dic = {
    gasch_code.replace(' ', '-'): strain
    for gasch_code, strain in zip(samples['Next-Gen Gasch #'], samples['Strain'])
}

In [24]:
# Relabel the RPKM columns with strain names in a single pass.
# Each raw label looks like '<Gasch code>_<suffix>'; only the part before the
# first '_' is looked up in `dic`. Labels that are already strain names are
# kept as they are, so re-running this cell is a no-op instead of failing on
# the split. A raw label whose code is missing from `dic` still raises KeyError.
# Note: several samples can map to the same strain name, so the resulting
# column labels are not guaranteed to be unique.
known_strains = set(dic.values())
rpkm.columns = [
    c if c in known_strains else dic[c.split('_')[0]]
    for c in rpkm.columns
]

In [77]:
# Load the table of missing genes (tab-separated).
missing = pd.read_csv(
    'missing_genes_v5.txt',
    sep='\t',
)

In [85]:
# For every gene in `missing`, total its RPKM over all samples; a total of
# exactly 0 is flagged as True. A gene with no row in `rpkm` also totals 0.
rpkm_missing_val = [
    rpkm.loc[rpkm.index == gene_name].sum().sum()
    for gene_name in missing['Name']
]
# bool(...) keeps the flags as plain Python booleans.
rpkm_missing = [bool(total == 0) for total in rpkm_missing_val]

In [86]:
# Attach the per-gene results to the table: the RPKM total over all samples
# and the flag marking genes whose total is exactly 0.
missing['rpkm sum'] = rpkm_missing_val
missing['no illumina reads'] = rpkm_missing

In [88]:
# Count the genes flagged in rpkm_missing (each True counts as 1).
sum(rpkm_missing)

20

In [108]:
# For each gene in `missing`, compare its mean RPKM in the strains disomic for
# the gene's chromosome with its mean RPKM in all other strains.
#
# Columns are selected with a positional boolean mask rather than by label:
# several samples share the same strain label (e.g. 'Dis4 ssd1-/ssd1-'), and
# label-based selection on duplicated labels returns each duplicate once per
# requested label, which over-weights those samples in the mean.
g = list(missing['Name'])  # not used in this cell; kept in case other cells rely on it
col = list(rpkm)
avg_all_but = []
avg_dis = []
for ind, row in missing.iterrows():
    # The trailing space stops 'Dis1 ' from also matching 'Dis12 ...'.
    tag = 'Dis' + str(row['Chromosome']) + ' '
    is_disome = [tag in name for name in col]
    is_other = [not hit for hit in is_disome]
    tmp = rpkm[rpkm.index == row['Name']]
    if tmp.empty:
        # Gene has no row in the RPKM table: no average can be computed.
        avg_dis.append(float('nan'))
        avg_all_but.append(float('nan'))
        continue
    # .iloc[0] takes the first matching row by position; plain [0] on a
    # label-indexed Series relies on a deprecated positional fallback.
    # When no column matches `tag`, the disome mean is NaN.
    avg_dis.append(tmp.loc[:, is_disome].mean(axis=1).iloc[0])
    avg_all_but.append(tmp.loc[:, is_other].mean(axis=1).iloc[0])

In [109]:
# Store the two per-gene averages as new columns of `missing`.
# NOTE(review): 'rpmk' in the first column name looks like a typo for 'rpkm';
# left unchanged because the name is written to the Excel export.
missing['AVG rpmk Disome'] = avg_dis
missing['AVG rpkm all but Dis'] = avg_all_but

In [110]:
# Display the annotated table.
missing

Unnamed: 0.1,Unnamed: 0,seq_id,start,end,Name,gene,Alias,Note,Ontology_term,strand,phase,curie,orf_classification,display,Chromosome,Unnamed: 15,rpkm sum,no illumina reads,AVG rpmk Disome,AVG rpkm all but Dis
0,459,chrI,201467,201787,YAR047C,,,Dubious open reading frame%3B unlikely to enco...,"GO:0003674,GO:0005575,GO:0008150,SO:0000704",-,.,SGD:S000000083,Dubious,Dubious open reading frame,1,No hits,0.0,True,0.0,0.0
1,3257,chrIII,309070,310155,YCR105W,ADH7,"ADH7,ADHVII,NADP-dependent%20alcohol%20dehydro...",NADPH-dependent medium chain alcohol dehydroge...,"GO:0005575,GO:0006066,GO:0008106,SO:0000704",+,.,SGD:S000000702,Verified,NADPH-dependent medium chain alcohol dehydroge...,3,No hits,0.0,True,0.0,0.0
2,4509,chrIV,520516,520692,YDR034C-A,,,Putative protein of unknown function%3B contai...,"GO:0003674,GO:0005575,GO:0008150,SO:0000704",-,.,SGD:S000007233,Uncharacterized,Putative protein of unknown function,4,Hits,0.0,True,0.0,0.0
3,7947,chrV,449474,449578,YER138W-A,,,Putative protein of unknown function%3B YER138...,"GO:0003674,GO:0005575,GO:0008150,SO:0000704",+,.,SGD:S000007239,Uncharacterized,Putative protein of unknown function,5,Hits,56.037312,False,0.0,1.192283
4,8862,chrVI,269061,269516,YFR057W,,,Putative protein of unknown function,"GO:0003674,GO:0005575,GO:0008150,SO:0000704",+,.,SGD:S000001953,Uncharacterized,Putative protein of unknown function,6,No hits,0.0,True,,0.0
5,9420,chrVII,249869,252738,YGL137W,SEC27,"SEC27,coatomer%20subunit%20beta'",Essential beta'-coat protein of the COPI coato...,"GO:0006888,GO:0006890,GO:0008298,GO:0030126,GO...",+,.,SGD:S000003105,Verified,Essential beta'-coat protein of the COPI coatomer,7,No hits,1839.993915,False,75.560709,34.172246
6,9426,chrVII,252897,253859,YGL136C,MRM2,"MRM2,21S%20rRNA%20%28uridine2791-2'-O%29%20met...",Mitochondrial 2' O-ribose methyltransferase%3B...,"GO:0005739,GO:0008650,GO:0008650,GO:0008650,GO...",-,.,SGD:S000003104,Verified,Mitochondrial 2' O-ribose methyltransferase,7,No hits,2084.380818,False,58.191421,41.147003
7,9434,chrVII,255663,256964,YGL134W,PCL10,PCL10,Pho85p cyclin%3B recruits%2C activates%2C and ...,"GO:0000079,GO:0000307,GO:0016538,GO:0045719,GO...",+,.,SGD:S000003102,Verified,Pho85p cyclin,7,No hits,1916.747909,False,64.940069,36.821947
8,9438,chrVII,257707,261501,YGL133W,ITC1,ITC1,Subunit of ATP-dependent Isw2p-Itc1p chromatin...,"GO:0003674,GO:0005634,GO:0006348,GO:0008623,GO...",+,.,SGD:S000003101,Verified,Subunit of ATP-dependent Isw2p-Itc1p chromatin...,7,No hits,1882.690663,False,71.756721,35.459195
9,9441,chrVII,261580,261915,YGL132W,,,Dubious open reading frame%3B unlikely to enco...,"GO:0003674,GO:0005575,GO:0008150,SO:0000704",+,.,SGD:S000003100,Dubious,Dubious open reading frame,7,No hits,2015.775295,False,59.302206,39.523699


In [111]:
# Export the annotated table to Excel.
# NOTE(review): the table already carries 'Unnamed: 0' / 'Unnamed: 0.1'
# columns, which suggests earlier index round-trips; consider index=False
# here if the index is not needed -- confirm with downstream users.
missing.to_excel('missing_genes_v5_illuminaReads.xlsx')

## Gene that doesn't seem to be on the right chromosome

In [112]:
# Per-sample RPKM values for YNL338W.
rpkm.loc[rpkm.index == 'YNL338W']

Unnamed: 0_level_0,Dis16 wild-type,Dis13 wild-type,Dis13 ssd1-,Dis1 wild-type,Dis4 ssd1-/ssd1-,Dis3 wild-type,Dis14 ssd1-,Dis5 wild-type,Dis1 ssd1-,Dis5 ssd1-,...,Euploid wild-type,Dis4 ssd1-/ssd1-,Dis4 wild-type,Euploid ssd1-,Dis15 wild-type,Euploid CEN15 wild-type,Dis12 wild-type,Euploid ssd1-,Dis12 wild-type rDNA hemizygous clone #3,Dis15 ssd1-
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YNL338W,0.0,0.0,0.0,0.0,28.18245,0.0,0.0,0.0,9.249908,20.073339,...,17.45493,0.0,6.4652,15.930688,0.0,40.506276,0.0,0.0,0.0,0.0


# Chromosome 7

In [56]:
# Genes flagged as missing on chromosome 7, and their rows of the RPKM table.
# 'Chromosome' is compared as the string '7'.
c7 = missing.loc[missing['Chromosome'] == '7']
g7 = c7['Name'].tolist()
rpkm7 = rpkm.loc[rpkm.index.isin(g7)]

In [68]:
# Mean RPKM over the 'Dis7 wild-type' / 'Dis7 ssd1-' columns versus the mean
# over every other column, added to a copy so rpkm7 itself stays unchanged.
dis7_cols = ['Dis7 wild-type', 'Dis7 ssd1-']
rpkm7_copy = rpkm7.copy()
rpkm7_copy['avg Dis7'] = rpkm7[dis7_cols].mean(axis=1)
rpkm7_copy['avg al but dis7'] = rpkm7.drop(columns=dis7_cols).mean(axis=1)

In [69]:
# Display the chromosome 7 rows together with the two averages.
rpkm7_copy

Unnamed: 0_level_0,Dis16 wild-type,Dis13 wild-type,Dis13 ssd1-,Dis1 wild-type,Dis4 ssd1-/ssd1-,Dis3 wild-type,Dis14 ssd1-,Dis5 wild-type,Dis1 ssd1-,Dis5 ssd1-,...,Dis4 wild-type,Euploid ssd1-,Dis15 wild-type,Euploid CEN15 wild-type,Dis12 wild-type,Euploid ssd1-,Dis12 wild-type rDNA hemizygous clone #3,Dis15 ssd1-,avg Dis7,avg al but dis7
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YGL133W,41.183188,29.619028,38.016258,36.639278,31.290317,40.906661,34.801975,35.343815,39.91719,46.676463,...,35.213621,32.037684,35.615184,47.518825,29.617613,38.876403,31.078331,27.122084,71.756721,36.248272
YGL134W,32.318073,40.03801,50.466899,44.360671,29.253902,31.710785,44.137552,37.461187,35.017507,23.287848,...,41.845041,27.236338,20.533695,33.389678,37.302151,33.994458,22.318156,33.202681,64.940069,37.433211
YGL136C,41.197992,54.132388,47.466091,51.091291,37.225416,49.733574,46.064966,20.575936,29.017623,33.142895,...,59.777928,49.97571,37.016089,46.815665,37.464839,47.354123,35.499651,40.615533,58.191421,41.517534
YGL137W,28.367279,31.11662,33.169598,40.860553,36.922176,29.689846,41.158506,37.676904,40.761952,38.252115,...,25.025411,32.255168,26.701493,38.594744,38.978652,36.670495,30.088946,40.862925,75.560709,35.071995


The 4 missing genes on chromosome 7 have ~2x higher values in the Dis7 strains than in the other strains, so this is a gap in the assembly.

# Chromosome 12

In [70]:
# Chromosome 12: rows of the RPKM table for the genes flagged as missing there,
# plus the mean over the 'Dis12 wild-type' / 'Dis12 ssd1-' columns and the mean
# over all remaining columns.
# NOTE(review): 'Dis12 wild-type rDNA hemizygous clone #3' is counted with the
# remaining columns, not with Dis12 -- confirm this is intended.
dis12_cols = ['Dis12 wild-type', 'Dis12 ssd1-']
c12 = missing.loc[missing['Chromosome'] == '12']
g12 = c12['Name'].tolist()
rpkm12 = rpkm.loc[rpkm.index.isin(g12)]
rpkm12_copy = rpkm12.copy()
rpkm12_copy['avg Dis12'] = rpkm12[dis12_cols].mean(axis=1)
rpkm12_copy['avg al but dis12'] = rpkm12.drop(columns=dis12_cols).mean(axis=1)

In [71]:
# Display the chromosome 12 rows together with the two averages.
rpkm12_copy

Unnamed: 0_level_0,Dis16 wild-type,Dis13 wild-type,Dis13 ssd1-,Dis1 wild-type,Dis4 ssd1-/ssd1-,Dis3 wild-type,Dis14 ssd1-,Dis5 wild-type,Dis1 ssd1-,Dis5 ssd1-,...,Dis4 wild-type,Euploid ssd1-,Dis15 wild-type,Euploid CEN15 wild-type,Dis12 wild-type,Euploid ssd1-,Dis12 wild-type rDNA hemizygous clone #3,Dis15 ssd1-,avg Dis12,avg al but dis12
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YLR154W-C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YLR155C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YLR157C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YLR158C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YLR160C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Real gap: no reads whatsoever.

# Chromosome 14

In [72]:
# Chromosome 14: rows of the RPKM table for the genes flagged as missing there,
# plus the mean over the 'Dis14 wild-type' / 'Dis14 ssd1-' columns and the mean
# over all remaining columns.
dis14_cols = ['Dis14 wild-type', 'Dis14 ssd1-']
c14 = missing.loc[missing['Chromosome'] == '14']
g14 = c14['Name'].tolist()
rpkm14 = rpkm.loc[rpkm.index.isin(g14)]
rpkm14_copy = rpkm14.copy()
rpkm14_copy['avg Dis14'] = rpkm14[dis14_cols].mean(axis=1)
rpkm14_copy['avg al but dis14'] = rpkm14.drop(columns=dis14_cols).mean(axis=1)

In [73]:
# Display the chromosome 14 rows together with the two averages.
rpkm14_copy

Unnamed: 0_level_0,Dis16 wild-type,Dis13 wild-type,Dis13 ssd1-,Dis1 wild-type,Dis4 ssd1-/ssd1-,Dis3 wild-type,Dis14 ssd1-,Dis5 wild-type,Dis1 ssd1-,Dis5 ssd1-,...,Dis4 wild-type,Euploid ssd1-,Dis15 wild-type,Euploid CEN15 wild-type,Dis12 wild-type,Euploid ssd1-,Dis12 wild-type rDNA hemizygous clone #3,Dis15 ssd1-,avg Dis14,avg al but dis14
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YNL323W,33.797695,39.254134,29.830722,56.700895,25.194431,37.142116,79.359897,42.849053,61.428302,51.27166,...,38.806781,30.517824,40.561606,40.091452,49.041093,47.401108,20.594075,31.41587,70.50879,39.526513
YNL325C,41.440583,39.492038,35.710817,58.341004,41.585126,38.159605,82.106758,32.908908,46.239027,38.686798,...,46.336385,53.729866,18.565882,42.692694,27.332303,36.579161,34.315657,29.630877,80.223354,37.985641
YNL326C,43.998611,33.837762,38.148053,40.20201,33.24191,26.136461,73.79466,18.091419,33.458865,22.098542,...,42.704855,20.043358,19.098435,41.407751,37.058635,22.552928,45.649181,38.687199,80.417882,36.438283
YNL327W,39.997492,36.478969,38.840939,40.374612,33.686412,38.566644,87.725186,51.684394,43.755081,52.071242,...,31.56904,41.325,28.983207,46.35675,34.624415,37.328062,35.542237,34.243584,77.510704,37.787535
YNL328C,16.356902,36.939831,29.151687,111.566696,30.483058,18.724463,91.446223,58.756088,76.705016,50.661283,...,25.640897,28.718587,23.575723,51.115062,44.052064,51.702971,42.635806,42.011555,72.640463,41.927609
YNL329C,32.650344,36.868173,33.713412,34.58083,36.218958,41.647856,73.992944,28.581868,37.089348,35.084535,...,34.56468,49.136361,28.332028,41.645638,42.172053,48.133587,34.816171,25.956703,67.255645,35.866576
YNL330C,43.398555,41.289198,38.398728,52.57561,30.97472,32.979216,86.726418,42.143835,41.795089,28.190553,...,38.686925,58.36358,27.378259,43.282916,27.710169,38.114999,49.887644,36.364841,82.362725,40.234373


# Chromosome 15

In [74]:
# Chromosome 15: rows of the RPKM table for the genes flagged as missing there,
# plus the mean over the 'Dis15 wild-type' / 'Dis15 ssd1-' columns and the mean
# over all remaining columns.
dis15_cols = ['Dis15 wild-type', 'Dis15 ssd1-']
c15 = missing.loc[missing['Chromosome'] == '15']
g15 = c15['Name'].tolist()
rpkm15 = rpkm.loc[rpkm.index.isin(g15)]
rpkm15_copy = rpkm15.copy()
rpkm15_copy['avg Dis15'] = rpkm15[dis15_cols].mean(axis=1)
rpkm15_copy['avg al but dis15'] = rpkm15.drop(columns=dis15_cols).mean(axis=1)

In [75]:
# Display the chromosome 15 rows together with the two averages.
rpkm15_copy

Unnamed: 0_level_0,Dis16 wild-type,Dis13 wild-type,Dis13 ssd1-,Dis1 wild-type,Dis4 ssd1-/ssd1-,Dis3 wild-type,Dis14 ssd1-,Dis5 wild-type,Dis1 ssd1-,Dis5 ssd1-,...,Dis4 wild-type,Euploid ssd1-,Dis15 wild-type,Euploid CEN15 wild-type,Dis12 wild-type,Euploid ssd1-,Dis12 wild-type rDNA hemizygous clone #3,Dis15 ssd1-,avg Dis15,avg al but dis15
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YOR246C,29.056974,42.653787,50.347576,34.467998,33.844482,33.262793,39.596767,52.188187,57.763018,38.569919,...,47.619815,30.610023,34.402037,40.53687,34.935549,31.065887,25.820366,62.192333,64.314518,35.149529
YOR247W,9.496306,7.7206,18.052862,23.65591,21.237012,39.135016,22.298143,14.447413,11.617183,15.126355,...,14.615643,20.007736,39.888783,17.805484,15.34515,14.831993,8.100987,19.512476,28.56811,15.328433
YOR255W,31.599893,31.140675,29.012372,35.780623,32.121932,39.46231,38.545075,36.420598,31.628716,26.692504,...,40.52916,36.315117,70.980672,25.007907,24.868101,24.036486,18.379658,51.648615,58.68222,34.570987


Values are ~2x higher in the Dis15 strains than in the other strains, so this is a gap in the alignment.

# Mitochondria
One gene really missing

In [55]:
# Rows of the RPKM table for the genes flagged as missing where
# Chromosome == 'mitoch'.
cmito = missing.loc[missing['Chromosome'] == 'mitoch']
gmito = cmito['Name'].tolist()
rpkm.loc[rpkm.index.isin(gmito)]

Unnamed: 0_level_0,Dis16 wild-type,Dis13 wild-type,Dis13 ssd1-,Dis1 wild-type,Dis4 ssd1-/ssd1-,Dis3 wild-type,Dis14 ssd1-,Dis5 wild-type,Dis1 ssd1-,Dis5 ssd1-,...,Euploid wild-type,Dis4 ssd1-/ssd1-,Dis4 wild-type,Euploid ssd1-,Dis15 wild-type,Euploid CEN15 wild-type,Dis12 wild-type,Euploid ssd1-,Dis12 wild-type rDNA hemizygous clone #3,Dis15 ssd1-
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q0160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
