Skip to content

Commit

Permalink
Refactor prediction step to eliminate multiprocessing
Browse files Browse the repository at this point in the history
Previously, the prediction step used multiprocessing, so memory
consumption grew in proportion to the number of worker processes.
Replacing the process pool with a simple for loop significantly
reduces memory usage.
The change not only improves efficiency by eliminating the
virtual-memory overhead of the extra processes,
but also shortens the runtime.
  • Loading branch information
jylee-bcm committed Jul 9, 2024
1 parent 0c6e6f1 commit 6f74433
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 16 deletions.
14 changes: 5 additions & 9 deletions run/extraModel/generate_bivar_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
from glob import glob
from tqdm import tqdm, trange
import logging
from multiprocessing import Pool
from os.path import exists
import shutil

def process_sample(data_folder, sample_id,
default_pred, labeling=False, n_thread = 10):
default_pred, labeling=False):

# recessive_folder = f'{data_folder}/recessive_matrix'
# if not os.path.exists(recessive_folder):
Expand Down Expand Up @@ -62,14 +61,11 @@ def process_sample(data_folder, sample_id,
'gene': gene,
'varIDs': list(gene_dict[gene])} for gene in gene_dict
]
print(f"Now starting to generate recessive feature matrix for {len(gene_dict)} genes, {feature_df.shape[0]} variants using {n_thread} threads.")
p = Pool(processes=n_thread)
print(f"Now starting to generate recessive feature matrix for {len(gene_dict)} genes, {feature_df.shape[0]} variants.")

with tqdm(total=len(params)) as pbar:
for result in p.imap_unordered(process_gene, params):
pbar.update()
p.close()
p.join()
for param in tqdm(params):
process_gene(param)

print("Recessive features for each gene finished, now putting together...")

bivar_feature_mats = []
Expand Down
10 changes: 3 additions & 7 deletions run/extraModel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,11 @@
parser.add_argument('-id', metavar='I', type=str,
help = 'sample ID')

parser.add_argument('-n_cpu', type=int, default=10,
help = 'folders containing all extended final matrices')

args = parser.parse_args()

#st_time = time()
#prj_name = args.project
sample_id = args.id
n_cpu = args.n_cpu

out_folder = '/out/conf_4Model'

Expand Down Expand Up @@ -60,7 +56,7 @@ def assign_ranking(df):
return pred_df


def AIM(data_folder, sample_id, n_thread):
def AIM(data_folder, sample_id):
feature_fn = f'/out/final_matrix/{sample_id}.csv'

if not os.path.exists(feature_fn):
Expand Down Expand Up @@ -88,7 +84,7 @@ def AIM(data_folder, sample_id, n_thread):
generate_bivar_data.process_sample( data_folder = out_folder,
sample_id = sample_id,
default_pred = default_pred,
labeling=False, n_thread = n_thread)
labeling=False)

recessive_feature_file = f"{out_folder}/recessive_matrix/{sample_id}.csv"
if os.path.exists(recessive_feature_file):
Expand Down Expand Up @@ -117,4 +113,4 @@ def AIM(data_folder, sample_id, n_thread):
return

#for sample_id in tqdm(sample_folders):
AIM(out_folder, sample_id, n_thread = n_cpu)
AIM(out_folder, sample_id)

0 comments on commit 6f74433

Please sign in to comment.