In [39]:
import sys
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

parent_dir = os.path.abspath(os.path.join('../..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
from pathlib import Path
from typing import List
def get_mzxml_filepaths(dir_path: str) -> List[str]:
    
    dir = Path(dir_path)
    file_paths = [str(file) for file in dir.glob('*mzXML')]
    
    return file_paths

### Spectral_db_xmls

In [None]:
paths = get_mzxml_filepaths('../../data/raw/hrms_db_sample/spectral_db_xmls')

In [None]:
from src.utils import process_mzxml_files

In [None]:
df = process_mzxml_files(paths)

In [None]:
df.to_csv('../../data/production_ready_data/hrms_df_sample_specta_extracted.csv', index=False)

In [None]:
df

In [None]:
raw_extracted_spectra_info = []
for row in tqdm(df.iterrows(), desc="Spectra info extraction"):
    for spectra in tqdm(row[1]['spectra'], desc="Row processing"):
        raw_extracted_spectra_info.append(spectra)

In [None]:
print(f"Total number of row spectra: {len(raw_extracted_spectra_info)}")

In [None]:
test_df = pd.DataFrame(raw_extracted_spectra_info)

In [None]:
columns_drop = [
    'scan_number',
    'retention_time',
    'ms_level',
    'precursor_mz'
]
test_df = test_df.drop(columns_drop, axis=1,errors='ignore')

In [None]:
test_df.to_csv('../../data/production_ready_data/test/hrms_df_sample_raw.csv',index=False)

### Vinyl_spectra

In [None]:
paths = get_mzxml_filepaths('../../data/raw/hrms_db_sample/vinyl_spectra')

In [None]:
df = process_mzxml_files(paths)

In [None]:
raw_extracted_spectra_info = []
for row in tqdm(df.iterrows(), desc="Spectra info extraction"):
    for spectra in tqdm(row[1]['spectra'], desc="Row processing"):
        raw_extracted_spectra_info.append(spectra)

In [None]:
test_df = pd.DataFrame(raw_extracted_spectra_info)

In [None]:
columns_drop = [
    'scan_number',
    'retention_time',
    'ms_level',
    'precursor_mz'
]
test_df = test_df.drop(columns_drop, axis=1,errors='ignore')

In [None]:
test_df_part_0 = pd.read_csv('../../data/production_ready_data/test/hrms_df_sample_raw.csv')

In [None]:
test_df_part_0 

In [None]:
test_df

In [None]:
result_df = pd.concat([test_df_part_0, test_df],ignore_index=True)

In [45]:
result_df

Unnamed: 0,m/z_array,intensity_array
0,[ 44.97299379 44.97485003 44.97670631 ......,[0. 0. 0. ... 0. 0. 0.]
1,[ 44.97299387 44.97485011 44.97670639 ......,[0. 0. 0. ... 0. 0. 0.]
2,[ 44.97299387 44.97485011 44.97670639 ......,[0. 0. 0. ... 0. 0. 0.]
3,[ 44.97299322 44.97484947 44.97670575 ......,[0. 0. 0. ... 0. 0. 0.]
4,[ 44.97299322 44.97484947 44.97670575 ......,[0. 0. 0. ... 0. 0. 0.]
...,...,...
4972,"[153.57228867527624, 153.5723069411731, 153.57...","[9371.453125, 24352.67578125, 46222.875, 68108..."
4973,"[153.56166406032872, 153.56167321328883, 153.5...","[180930.375, 185572.75, 196301.203125, 205487...."
4974,"[46.068288349342595, 46.06829109522729, 46.068...","[12831.28125, 87252.1796875, 159637.625, 20744..."
4975,"[153.56072049806048, 153.5607296509658, 153.56...","[58876.125, 57722.87890625, 52862.515625, 4650..."


In [47]:
import hashlib

# Create hash representations directly from the string representations
result_df['combined_hash'] = result_df.apply(
    lambda row: hashlib.md5((str(row['m/z_array']) + str(row['intensity_array'])).encode()).hexdigest(),
    axis=1
)

In [48]:
# Remove duplicates based on the combined hash
result_df = result_df.drop_duplicates(subset=['combined_hash'])

# Remove the hash column
result_df = result_df.drop(columns=['combined_hash'])

In [50]:
result_df.to_csv('../../data/production_ready_data/test/hrms_df_sample_raw.csv',index=False)