Skip to content

Commit

Permalink
bugs fixed
Browse files: browse the repository at this point in the history
  • Loading branch information
ploy-np committed Sep 14, 2020
1 parent 9c519b4 commit a3aabf0
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 35 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
name=__pkg_name__,
license="MIT",
description='xpore is a python package for Nanopore data analysis.',
version='v0.5.5',
version='v0.5.6',
long_description=README,
long_description_content_type='text/markdown',
url='https://github.com/GoekeLab/xpore',
Expand Down
73 changes: 39 additions & 34 deletions xpore/scripts/dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,49 +41,54 @@ def combine(read_name,eventalign_per_read,out_paths,locks):
eventalign_result = pd.DataFrame.from_records(eventalign_per_read)

cond_successfully_eventaligned = eventalign_result['reference_kmer'] == eventalign_result['model_kmer']
eventalign_result = eventalign_result[cond_successfully_eventaligned]

if cond_successfully_eventaligned.sum() != 0:

keys = ['read_index','contig','position','reference_kmer'] # for groupby
eventalign_result['length'] = pd.to_numeric(eventalign_result['end_idx'])-pd.to_numeric(eventalign_result['start_idx'])
eventalign_result['sum_norm_mean'] = pd.to_numeric(eventalign_result['event_level_mean']) * eventalign_result['length']
eventalign_result = eventalign_result[cond_successfully_eventaligned]

eventalign_result = eventalign_result.groupby(keys)
sum_norm_mean = eventalign_result['sum_norm_mean'].sum()
start_idx = eventalign_result['start_idx'].min()
end_idx = eventalign_result['end_idx'].max()
total_length = eventalign_result['length'].sum()
keys = ['read_index','contig','position','reference_kmer'] # for groupby
eventalign_result['length'] = pd.to_numeric(eventalign_result['end_idx'])-pd.to_numeric(eventalign_result['start_idx'])
eventalign_result['sum_norm_mean'] = pd.to_numeric(eventalign_result['event_level_mean']) * eventalign_result['length']

eventalign_result = pd.concat([start_idx,end_idx],axis=1)
eventalign_result['norm_mean'] = sum_norm_mean/total_length
eventalign_result = eventalign_result.groupby(keys)
sum_norm_mean = eventalign_result['sum_norm_mean'].sum()
start_idx = eventalign_result['start_idx'].min()
end_idx = eventalign_result['end_idx'].max()
total_length = eventalign_result['length'].sum()

eventalign_result.reset_index(inplace=True)
eventalign_result = pd.concat([start_idx,end_idx],axis=1)
eventalign_result['norm_mean'] = sum_norm_mean/total_length

# eventalign_result['transcript_id'] = [contig.split('.')[0] for contig in eventalign_result['contig']]
eventalign_result['transcript_id'] = eventalign_result['contig']
eventalign_result['transcriptomic_position'] = pd.to_numeric(eventalign_result['position']) + 2 # the middle position of 5-mers.
# eventalign_result = misc.str_encode(eventalign_result)
eventalign_result['read_id'] = [read_name]*len(eventalign_result)
eventalign_result.reset_index(inplace=True)

# features = ['read_id','transcript_id','transcriptomic_position','reference_kmer','norm_mean','start_idx','end_idx']
# features_dtype = np.dtype([('read_id', 'S36'), ('transcript_id', 'S15'), ('transcriptomic_position', '<i8'), ('reference_kmer', 'S5'), ('norm_mean', '<f8'), ('start_idx', '<i8'), ('end_idx', '<i8')])
features = ['read_id','transcript_id','transcriptomic_position','reference_kmer','norm_mean']
# eventalign_result['transcript_id'] = [contig.split('.')[0] for contig in eventalign_result['contig']]
eventalign_result['transcript_id'] = eventalign_result['contig']

df_events_per_read = eventalign_result[features]
# print(df_events_per_read.head())

# write to file.
df_events_per_read = df_events_per_read.set_index(['transcript_id','read_id'])

with locks['hdf5'], h5py.File(out_paths['hdf5'],'a') as hf:
for tx_id,read_id in df_events_per_read.index.unique():
df2write = df_events_per_read.loc[[(tx_id,read_id)],:].reset_index()
events = np.rec.fromrecords(misc.str_encode(df2write[features]),names=features) #,dtype=features_dtype
eventalign_result['transcriptomic_position'] = pd.to_numeric(eventalign_result['position']) + 2 # the middle position of 5-mers.
# eventalign_result = misc.str_encode(eventalign_result)
eventalign_result['read_id'] = [read_name]*len(eventalign_result)

hf_tx = hf.require_group('%s/%s' %(tx_id,read_id))
if 'events' in hf_tx:
continue
else:
hf_tx['events'] = events
# features = ['read_id','transcript_id','transcriptomic_position','reference_kmer','norm_mean','start_idx','end_idx']
# features_dtype = np.dtype([('read_id', 'S36'), ('transcript_id', 'S15'), ('transcriptomic_position', '<i8'), ('reference_kmer', 'S5'), ('norm_mean', '<f8'), ('start_idx', '<i8'), ('end_idx', '<i8')])
features = ['read_id','transcript_id','transcriptomic_position','reference_kmer','norm_mean']

df_events_per_read = eventalign_result[features]
# print(df_events_per_read.head())

# write to file.
df_events_per_read = df_events_per_read.set_index(['transcript_id','read_id'])

with locks['hdf5'], h5py.File(out_paths['hdf5'],'a') as hf:
for tx_id,read_id in df_events_per_read.index.unique():
df2write = df_events_per_read.loc[[(tx_id,read_id)],:].reset_index()
events = np.rec.fromrecords(misc.str_encode(df2write[features]),names=features) #,dtype=features_dtype

hf_tx = hf.require_group('%s/%s' %(tx_id,read_id))
if 'events' in hf_tx:
continue
else:
hf_tx['events'] = events

with locks['log'], open(out_paths['log'],'a') as f:
f.write('%s\n' %(read_name))
Expand Down

0 comments on commit a3aabf0

Please sign in to comment.