In [8]:
from massspecgym.data.datasets import MSnDataset, MassSpecDataset
from massspecgym.data.transforms import MolFingerprinter, SpecTokenizer
from massspecgym.data import MassSpecDataModule
from massspecgym.featurize import SpectrumFeaturizer


import pandas as pd

In [2]:
file_mgf = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20241211_msn_library_pos_all_lib_MSn.mgf"
split_file = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20241211_split.tsv"

In [3]:
config = {
    'features': ['binned_peaks'],
    'feature_attributes': {
        'binned_peaks': {
            'max_mz': 1,
            'bin_width': 1.0,
        },
    },
}

featurizer = SpectrumFeaturizer(config, mode='torch')

In [4]:
# Init hyperparameters
n_peaks = 60
fp_size = 4096
batch_size = 12

msn_dataset = MSnDataset(
    pth=file_mgf,
    featurizer=featurizer,
    mol_transform=MolFingerprinter(fp_size=fp_size),
    max_allowed_deviation=0.005
)

In [36]:
all_smiles = msn_dataset.smiles

In [37]:
len(all_smiles)

16476

In [39]:

unique_smiles = list(set(all_smiles))
print(f"Unique SMILES entries: {len(unique_smiles)}")

Unique SMILES entries: 13984


In [40]:
# Define the maximum number of SMILES per file
max_per_file = 1000

# Calculate the number of files needed
num_files = (len(unique_smiles) + max_per_file - 1) // max_per_file
print(f"Number of TSV files to create: {num_files}")

Number of TSV files to create: 14


In [41]:
import os

In [42]:
# Define the output directory
output_dir = '/Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire'
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Split the SMILES list and save each chunk
for i in range(num_files):
    start_idx = i * max_per_file
    end_idx = start_idx + max_per_file
    chunk = unique_smiles[start_idx:end_idx]
    
    # Define the filename
    filename = f'classyfire_upload_{i+1}.tsv'
    filepath = os.path.join(output_dir, filename)
    
    # Save the chunk to TSV without headers and without index
    pd.Series(chunk).to_csv(filepath, sep='\t', index=False, header=False)
    
    print(f"Saved {len(chunk)} SMILES to {filepath}")

Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_1.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_2.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_3.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_4.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_5.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_6.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_7.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_8.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_9.tsv
Saved 1000 SMILES to /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/classyfire_upload_10.tsv
Saved 100

In [43]:
# Create a DataFrame with ID and SMILES columns
df_unique = pd.DataFrame({
    'SMILES': unique_smiles
})

# Display the first few entries to verify
df_unique.head()

Unnamed: 0,SMILES
0,COC1=C(C=CC(=C1)C(=O)O)O[C@H]2[C@@H]([C@H]([C@...
1,CCCCC(=O)O[C@H](CC(=O)O)C[N+](C)(C)C
2,CCN1C=C(C(=N1)C(=O)N)NC(=S)NC2=C(C=CC(=C2)Cl)OC
3,COC1=C(C=CC(=C1)/C=N\NC2=CC=CC=C2C(=O)O)OCC3=C...
4,CCOC(=O)CSC1=NN=C(N1C)C2=CN(N=C2OC)C


In [44]:
output_file = '/Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/unique_smiles.tsv'
# df_unique.to_csv(output_file, sep='\t', index=False)

print(f"TSV file saved as {output_file}")

TSV file saved as /Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/unique_smiles.tsv


In [45]:
output_file_no_header = '/Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/unique_valid_smiles_no_header.tsv'
df_unique.to_csv(output_file_no_header, sep='\t', index=False, header=False)
print(f"TSV file saved as '{output_file_no_header}' without headers.")

TSV file saved as '/Users/macbook/CODE/Majer:MassSpecGym/data/ClassyFire/unique_valid_smiles_no_header.tsv' without headers.


In [25]:
# Read the TSV file back to verify
df_check = pd.read_csv(output_file, sep='\t')
print(df_check.head())
print(f"Total entries in TSV: {len(df_check)}")

                                              SMILES
0  COC1=C(C=CC(=C1)C(=O)O)O[C@H]2[C@@H]([C@H]([C@...
1               CCCCC(=O)O[C@H](CC(=O)O)C[N+](C)(C)C
2    CCN1C=C(C(=N1)C(=O)N)NC(=S)NC2=C(C=CC(=C2)Cl)OC
3  COC1=C(C=CC(=C1)/C=N\NC2=CC=CC=C2C(=O)O)OCC3=C...
4               CCOC(=O)CSC1=NN=C(N1C)C2=CN(N=C2OC)C
Total entries in TSV: 100


TSV file size: 0.93 MB
