Refactor prediction step to eliminate multiprocessing

Previously, the prediction step used multiprocessing, which increased memory consumption in proportion to the number of worker processes. Replacing the multiprocessing pool with a simple for loop significantly reduces memory usage. This change not only improves efficiency by eliminating the virtual-memory overhead of the worker processes but also speeds up the run. Because the work is now sequential, the `n_thread` parameter of `process_sample` and `AIM` and the `-n_cpu` command-line option have been removed.
LiuzLab · Jul 9, 2024 · 6f74433 · 6f74433
1 parent 0c6e6f1
commit 6f74433
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 16 deletions.
diff --git a/run/extraModel/generate_bivar_data.py b/run/extraModel/generate_bivar_data.py
@@ -4,12 +4,11 @@
 from glob import glob
 from tqdm import tqdm, trange
 import logging
-from multiprocessing import Pool
 from os.path import exists
 import shutil
 
 def process_sample(data_folder, sample_id,
-                   default_pred, labeling=False, n_thread = 10):
+                   default_pred, labeling=False):
 
     # recessive_folder = f'{data_folder}/recessive_matrix'
     # if not os.path.exists(recessive_folder):
@@ -62,14 +61,11 @@ def process_sample(data_folder, sample_id,
                 'gene': gene,
                'varIDs': list(gene_dict[gene])} for gene in gene_dict
                ]
-    print(f"Now starting to generate recessive feature matrix for {len(gene_dict)} genes, {feature_df.shape[0]} variants using {n_thread} threads.")
-    p = Pool(processes=n_thread)
+    print(f"Now starting to generate recessive feature matrix for {len(gene_dict)} genes, {feature_df.shape[0]} variants.")
 
-    with tqdm(total=len(params)) as pbar:
-        for result in p.imap_unordered(process_gene, params):
-            pbar.update()
-    p.close()
-    p.join()
+    for param in tqdm(params):
+        process_gene(param)
+
     print("Recessive features for each gene finished, now putting together...")
 
     bivar_feature_mats = []

diff --git a/run/extraModel/main.py b/run/extraModel/main.py
@@ -16,15 +16,11 @@
 parser.add_argument('-id', metavar='I', type=str,
                     help = 'sample ID')
 
-parser.add_argument('-n_cpu',  type=int, default=10,
-                    help = 'folders containing all extended final matrices')
-
 args = parser.parse_args()
 
 #st_time = time()
 #prj_name = args.project
 sample_id = args.id
-n_cpu = args.n_cpu
 
 out_folder = '/out/conf_4Model'
 
@@ -60,7 +56,7 @@ def assign_ranking(df):
     return pred_df
 
 
-def AIM(data_folder, sample_id, n_thread):
+def AIM(data_folder, sample_id):
     feature_fn = f'/out/final_matrix/{sample_id}.csv'
 
     if not os.path.exists(feature_fn):
@@ -88,7 +84,7 @@ def AIM(data_folder, sample_id, n_thread):
     generate_bivar_data.process_sample( data_folder = out_folder,
                                         sample_id = sample_id,
                                         default_pred = default_pred,                                           
-                                        labeling=False, n_thread = n_thread)
+                                        labeling=False)
 
     recessive_feature_file = f"{out_folder}/recessive_matrix/{sample_id}.csv"
     if os.path.exists(recessive_feature_file):
@@ -117,4 +113,4 @@ def AIM(data_folder, sample_id, n_thread):
     return
 
 #for sample_id in tqdm(sample_folders):
-AIM(out_folder, sample_id, n_thread = n_cpu)
+AIM(out_folder, sample_id)