In [None]:
import os
import sys
from random import random
import gensim
import numpy as np
import pandas as pd
import custom_filtering
import molmass
import mass_spec_utils
from matplotlib import pyplot as plt
from matchms import Scores, Spectrum
from matchms.importing import load_from_json

# Repository root: the parent directory of the notebook's working directory.
ROOT = os.path.dirname(os.getcwd())
#path_data = os.path.join(ROOT, 'data')
# Data directory. The original machine-specific location stays as the default
# so existing runs are unchanged, but it can be overridden without editing the
# notebook by setting the SPECTRA_DATA_DIR environment variable.
path_data = os.environ.get('SPECTRA_DATA_DIR', 'C:\\Users\\User\\Data')
# Guard the insert so re-running this cell does not stack duplicate entries.
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

from matchms.importing import load_from_json

# Load the cleaned GNPS positive-ion-mode spectra from JSON.
filename = os.path.join(path_data,'gnps_positive_ionmode_cleaned_by_matchms_and_lookups.json')
spectrums = load_from_json(filename)

print("number of spectra:", len(spectrums))

# Peak count per spectrum, shown as a histogram with 20-wide bins from 0 to 2000.
number_of_peaks = [len(spec.peaks) for spec in spectrums]

plt.figure(figsize=(12,7))
hist = plt.hist(number_of_peaks, np.arange(0,2000,20))
plt.xlabel("number of peaks in spectrum")
plt.ylabel("number of spectra in respective bin")

In [None]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses
from matchms.filtering import reduce_to_number_of_peaks


def post_process(s):
    """Run one spectrum through the fixed matchms filter chain.

    Steps, in order: normalize intensities, keep peaks with m/z between 0 and
    1000, keep peaks with relative intensity between 0.01 and 1.0, then apply
    ``reduce_to_number_of_peaks`` with the positional arguments
    ``10, 1000, None`` (presumably n_required, n_max, ratio_desired -- confirm
    against the installed matchms version).

    Returns whatever the last filter returns; the calling cell treats a
    ``None`` result as "spectrum did not qualify".
    """
    normalized = normalize_intensities(s)
    within_mz_window = select_by_mz(normalized, mz_from=0, mz_to=1000)
    above_intensity_floor = select_by_relative_intensity(
        within_mz_window, intensity_from=0.01, intensity_to=1.0
    )
    return reduce_to_number_of_peaks(above_intensity_floor, 10, 1000, None)

from custom_filtering import get_parent_peak

# Apply post_process to every spectrum and drop the ones for which it
# returned None (i.e. that did not qualify for analysis).
spectrums = [s for s in map(post_process, spectrums) if s is not None]

# Of the survivors, keep only spectra for which get_parent_peak finds a peak.
spectrumswithpeak = [
    spec for spec in spectrums if get_parent_peak(spec) is not None
]

print(len(spectrumswithpeak))

from custom_filtering import find_chem_string


# Keep spectra for which find_chem_string returns something and whose
# 'adduct' metadata entry is exactly of type str.
spectrums_with_inchi = [
    spec
    for spec in spectrumswithpeak
    if find_chem_string(spec) is not None
    and type(spec.metadata['adduct']) is str
]
         
import csv

positive_adducts = []

# Each CSV row is split on spaces (with '|' as the quote character) and the
# resulting fields are re-joined with ', ' into one string per row.
with open(r'C:\Users\User\Data\positive_adducts.csv', newline='') as csvfile:
    positive_adducts.extend(
        ', '.join(fields)
        for fields in csv.reader(csvfile, delimiter=' ', quotechar='|')
    )
        
# Keep only the spectra whose adduct string is listed in positive_adducts.
spectrums_processed = [
    spec
    for spec in spectrums_with_inchi
    if spec.metadata['adduct'] in positive_adducts
]

print(len(spectrums_processed))
    

In [None]:
import glob
import json

json_data = []
# glob returns matches in file-system-dependent order; sorting makes the
# positions in json_data (e.g. json_data[1] further down) reproducible.
# The pattern is a raw string, so single backslashes are the correct
# separators, and recursive=True was dropped because it only affects '**'.
a = sorted(glob.glob(r'C:\Users\User\Data\Filtering_Data\trees\*.json'))

# Parse every tree file into a Python object, one list entry per file.
for filename in a:
    with open(filename,'r') as f:
        json_data.append(json.load(f))

In [None]:
# Peek at the top-level keys of one loaded tree.
print(json_data[1].keys())

# Collect the 'molecularFormula' of every fragment of every tree. A tree with
# a missing (or null) 'fragments' entry contributes nothing instead of
# crashing the loop.
chem_formulas = [
    frag.get('molecularFormula')
    for mol in json_data
    for frag in (mol.get('fragments') or [])
]

print(len(chem_formulas))

# De-duplicate in first-seen order. Empty strings are skipped as before;
# None (fragment without a 'molecularFormula' key) is skipped as well, since
# it is not a usable formula string. dict.fromkeys gives an order-preserving
# dedupe without the quadratic list-membership scan.
unique_chem_formulas = list(dict.fromkeys(chem for chem in chem_formulas if chem))

print(len(unique_chem_formulas))


In [None]:
from molmass import Formula

# Distinct adduct strings among the processed spectra, in first-seen order.
adducts = list(
    dict.fromkeys(spec.metadata['adduct'] for spec in spectrums_processed)
)

print(adducts)

# Demonstrate molmass on the first unique formula.
f = Formula(unique_chem_formulas[0])

print("For chem formula ", f.formula, " mass is ", f.mass)




In [None]:
# Quick inspection: show the peaks object of the first processed spectrum.
print(spectrums_processed[0].peaks)

In [None]:
from noise_filtering import AdductAndMassLibrary

# Library built from the observed adducts and the unique fragment formulas.
AML = AdductAndMassLibrary(adducts, unique_chem_formulas)

from matchms.similarity import CosineGreedy

noise_filtered_spectrums = []
chucked_spectrums = []
counter = 1

# Noise-filter every processed spectrum. A result with no peaks left goes to
# chucked_spectrums, everything else to noise_filtered_spectrums.
for spec in spectrums_processed:
    spec1 = AML.return_noise_filtered_spectrum(spec)
    (
        chucked_spectrums
        if len(spec1.peaks.mz) == 0
        else noise_filtered_spectrums
    ).append(spec1)
    print(counter)
    counter += 1


print("Processed: ", len(noise_filtered_spectrums), " chucked: ", len(chucked_spectrums))

# Cosine-greedy similarity scorer configured with a tolerance of 0.2.
cosine_greedy = CosineGreedy(tolerance = 0.2)

# Score a spectrum against itself, then an original spectrum against a
# noise-filtered one.
# NOTE(review): noise_filtered_spectrums omits chucked spectra, so index 0 of
# the two lists refers to the same spectrum only if no earlier spectrum was
# chucked -- confirm before interpreting this score.
print(cosine_greedy(spectrums_processed[1], spectrums_processed[1]))
print(cosine_greedy(spectrums_processed[0], noise_filtered_spectrums[0]))

# Plot the original and the noise-filtered spectrum for visual comparison.
spectrums_processed[0].plot()
noise_filtered_spectrums[0].plot()

In [None]:
# Before/after comparison at index 0: plots, m/z arrays and cosine score.
# NOTE(review): the two lists line up by index only while no spectrum before
# this position was chucked during noise filtering -- confirm.
spectrums_processed[0].plot()
noise_filtered_spectrums[0].plot()
print(spectrums_processed[0].peaks.mz)
print(noise_filtered_spectrums[0].peaks.mz)
print(cosine_greedy(spectrums_processed[0], noise_filtered_spectrums[0]))

In [None]:
# Before/after comparison at index 1: plots, m/z arrays and cosine score.
# NOTE(review): the two lists line up by index only while no spectrum before
# this position was chucked during noise filtering -- confirm.
spectrums_processed[1].plot()
noise_filtered_spectrums[1].plot()
print(spectrums_processed[1].peaks.mz)
print(noise_filtered_spectrums[1].peaks.mz)
print(cosine_greedy(spectrums_processed[1], noise_filtered_spectrums[1]))

# Rebind `spectrums` to the processed spectra that have fewer than 6 peaks.
spectrums = [spec for spec in spectrums_processed if len(spec.peaks.mz) < 6]

print(len(spectrums))
        

In [None]:
library_spectrums = []
query_spectrums = []

from custom_filtering import is_molecule_here

# Target sizes of the two sets (previously repeated as literals).
LIBRARY_SIZE = 3000
QUERY_SIZE = 1000

# Split the noise-filtered spectra:
#   * a spectrum whose molecule is not yet in the library goes to the library
#     (while there is room);
#   * otherwise, if its molecule IS in the library and not yet in the query
#     set, it goes to the query set (while there is room).
# Logical `and` replaces the bitwise `&` of the original, and the library
# lookup is done once per spectrum instead of twice; the library is not
# modified between the two places the result is used.
for spec in noise_filtered_spectrums:
    if len(library_spectrums) == LIBRARY_SIZE and len(query_spectrums) == QUERY_SIZE:
        break
    in_library = is_molecule_here(spec, library_spectrums)
    if in_library == 0 and len(library_spectrums) < LIBRARY_SIZE:
        library_spectrums.append(spec)
        print("library is ", len(library_spectrums), "and query is ", len(query_spectrums))
    elif (
        in_library == 1
        and len(query_spectrums) < QUERY_SIZE
        and is_molecule_here(spec, query_spectrums) == 0
    ):
        query_spectrums.append(spec)
        print("library is ", len(library_spectrums), "and query is ", len(query_spectrums))


print(len(library_spectrums))
print(len(query_spectrums))


from custom_analysis import look_for_inchi_and_precursor

# Count the query spectra whose molecule is_molecule_here reports as present
# in the library.
molecule_matches = sum(
    1
    for spec in query_spectrums
    if is_molecule_here(spec, library_spectrums) == 1
)

print("Molecule matches =", molecule_matches)

look_for_inchi_and_precursor(query_spectrums, library_spectrums)

In [None]:
from custom_fragment import FragmentPeak

all_fragments_list = []

# Turn every peak of every library spectrum into a FragmentPeak carrying the
# peak's m/z, its intensity and the spectrum's 'spectrum_id'.
for j, spec in enumerate(library_spectrums):
    spectrum_id = spec.get('spectrum_id')
    peaks = spec.peaks
    spectrum_fragments = [
        FragmentPeak(mz, intensity, spectrum_id)
        for mz, intensity in zip(peaks.mz, peaks.intensities)
    ]
    all_fragments_list.extend(spectrum_fragments)
    print("Full spectrum turned to ", len(spectrum_fragments), " fragments ")
    print(len(all_fragments_list), " frags in list")
    print(j, " of ", len(library_spectrums), " spectrums turned to fragment objects.")

In [None]:
print("Unsorted fragments lists:")

# Slicing (rather than indexing over range(20)) previews at most 20 fragments
# and does not raise IndexError when fewer than 20 exist.
for fragment in all_fragments_list[:20]:
    fragment.print_fragment()

# Ordering comes from FragmentPeak's own comparison methods (by m/z,
# judging from the message printed below -- confirm in custom_fragment).
all_fragments_list_sorted = sorted(all_fragments_list)

print("Sorted by MZs: ")

for fragment in all_fragments_list_sorted[:20]:
    fragment.print_fragment()

# NOTE(review): an earlier cell rebinds `spectrums` to the spectra with fewer
# than 6 peaks, so with 20-wide bins this histogram fills a single bin --
# confirm that this is the intended input.
number_of_peaks = [len(spec.peaks) for spec in spectrums]

print(spectrums[1].peaks.intensities)
print(spectrums[2].peaks.intensities)

plt.figure(figsize=(12,7))
hist = plt.hist(number_of_peaks, np.arange(0,2000,20))
plt.xlabel("number of peaks in spectrum")
plt.ylabel("number of spectra in respective bin")


In [None]:
from decoy_helpers import ispeakhere, masswithin5ppm
import random

def create_naive_decoy(s):
    """Build a naive decoy spectrum for the target spectrum ``s``.

    The decoy starts with the parent peak of ``s`` (element 0 of
    ``get_parent_peak(s)`` is used as m/z, element 1 as intensity). The
    remaining ``len(s.peaks.mz) - 1`` peaks are obtained by sampling that many
    distinct spectra from the module-level ``library_spectrums`` and taking
    one uniformly random peak from each.

    Parameters
    ----------
    s : spectrum object exposing ``peaks.mz`` and accepted by
        ``get_parent_peak``.

    Returns
    -------
    Spectrum
        New spectrum built from the decoy m/z and intensity arrays, sorted by
        m/z. No metadata is passed to it.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``s`` has no peaks or needs more donor
        spectra than ``library_spectrums`` contains.
    """
    # Look the parent peak up once instead of once per use.
    parent_peak = get_parent_peak(s)
    peaks_in_target = len(s.peaks.mz)

    random_spectrums = random.sample(library_spectrums, peaks_in_target - 1)

    picked_mz = []
    picked_intensity = []
    for donor in random_spectrums:
        donor_peaks = donor.peaks
        # Draw a random peak *position*. The previous value-based lookup
        # (np.where(mz == chosen_mz)) returned several positions when a
        # spectrum had duplicate m/z values, which appended several
        # intensities for a single m/z and desynchronised the two arrays.
        peak_index = random.randrange(len(donor_peaks.mz))
        picked_mz.append(donor_peaks.mz[peak_index])
        picked_intensity.append(donor_peaks.intensities[peak_index])

    # np.append flattens its inputs, matching how the arrays were built before.
    decoy_mz = np.append(np.array([parent_peak[0]]), picked_mz)
    decoy_intensity = np.append(np.array([parent_peak[1]]), picked_intensity)

    decoy_mz = np.asarray(decoy_mz, dtype=float)
    decoy_intensity = np.asarray(decoy_intensity, dtype=float)

    # Order both arrays by ascending m/z.
    inds = decoy_mz.argsort()
    sorted_intensities = decoy_intensity[inds]
    sorted_mzs = decoy_mz[inds]

    decoy = Spectrum(sorted_mzs, sorted_intensities)

    return decoy

In [None]:
naive_decoy_spectrums = []

# One naive decoy per library spectrum, with a running count printed.
for i, spec in enumerate(library_spectrums, start=1):
    naive_decoy_spectrums.append(create_naive_decoy(spec))
    print(i, " naive decoy created")


print( "Total processed peaks = ", len(naive_decoy_spectrums))

In [None]:
from decoy_helpers import random_sample_5_peaks, get_spectrums_with_peak, return_random_pick
import time

def create_spectrum_based_decoy_bisect(s):
    """Build a spectrum-based decoy for the target spectrum ``s``.

    The decoy starts with the parent peak of ``s`` (element 0 of
    ``get_parent_peak(s)`` is used as m/z, element 1 as intensity). Until the
    decoy has as many peaks as ``s``:

    1. ``get_spectrums_with_peak`` is queried with the most recently added
       m/z (initially the parent m/z) against ``all_fragments_list_sorted``;
    2. for every id it returns, ``random_sample_5_peaks`` contributes
       fragments to a candidate pool (the pool keeps growing across
       iterations; it is never cleared);
    3. ``return_random_pick`` draws one fragment from the pool, which is
       appended to the decoy and becomes the next seed mass.

    Relies on module-level state: ``all_fragments_list``,
    ``all_fragments_list_sorted`` and the list ``times_taken``, to which the
    wall-clock duration of this call is appended.

    Parameters
    ----------
    s : spectrum object exposing ``peaks.mz`` and accepted by
        ``get_parent_peak``.

    Returns
    -------
    Spectrum
        New spectrum built from the decoy arrays, sorted by m/z.
    """
    start = time.time()
    # Look the parent peak up once instead of once per use.
    parent_peak = get_parent_peak(s)
    parentmass = parent_peak[0]
    parentintensity = parent_peak[1]
    decoy_mz = np.array([parentmass])
    decoy_intensities = np.array([parentintensity])
    peaks_in_target = len(s.peaks.mz)
    candidate_fragments_list = []
    mass_for_loop_seeding = parentmass.copy()

    while len(decoy_mz) < peaks_in_target:

        id_list = get_spectrums_with_peak(mass_for_loop_seeding, all_fragments_list_sorted)

        for spectrum_id in id_list:
            random_peaks = random_sample_5_peaks(spectrum_id, all_fragments_list)
            candidate_fragments_list.extend(random_peaks)

        drawn_ion = return_random_pick(candidate_fragments_list, decoy_mz, parentmass)

        decoy_mz = np.append(decoy_mz, drawn_ion.mz)
        decoy_intensities = np.append(decoy_intensities, drawn_ion.intensity)

        # The peak just added seeds the next lookup.
        mass_for_loop_seeding = drawn_ion.mz

    decoy_mz = np.asarray(decoy_mz, dtype=float)
    decoy_intensities = np.asarray(decoy_intensities, dtype=float)
    # Order both arrays by ascending m/z.
    inds = decoy_mz.argsort()
    sorted_intensities = decoy_intensities[inds]
    sorted_masses = decoy_mz[inds]
    decoy = Spectrum(sorted_masses, sorted_intensities)

    end = time.time()
    timetaken = end-start
    times_taken.append(timetaken)
    # Report the peak count of the spectrum passed in. This previously read
    # the unrelated global `spec`, and a following progress print referenced
    # the undefined name `numdecoys`, raising NameError on every call before
    # the decoy could be returned; that print was removed.
    print("Time for decoy with", peaks_in_target, " peaks: ", timetaken)

    return decoy


In [None]:


decoyscreated = 1

times_taken = []

complex_decoy_spectrums = []

failed_to_create = []

# The one-off call on library_spectrums[176] that used to sit here was
# removed: its result was discarded, it added a stray entry to times_taken,
# and any error it raised aborted the cell before the loop below could run.

for spec in library_spectrums:
    start = time.time()
    try:
        # Keep the decoy; previously the return value was thrown away and
        # complex_decoy_spectrums stayed empty.
        complex_decoy_spectrums.append(create_spectrum_based_decoy_bisect(spec))
    except Exception as err:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # still stops a long run, and show what went wrong.
        failed_to_create.append(spec)
        print("failed to create:", repr(err))
    end = time.time()
    print("-----------------------------------------------------------------------------------------------------")
    print(" Decoy", decoyscreated, "of", len(library_spectrums), "created. Took ", end - start, "to do.")
    print("------------------------------------------------------------------------------------------------------")
    decoyscreated += 1
    # create_spectrum_based_decoy_bisect appends its own duration to
    # times_taken, so it is no longer appended a second time here.
    timetaken = end-start
    print("Time for decoy with", len(spec.peaks.mz), " peaks: ", timetaken)

In [None]:
import multiprocessing

# Number of CPUs reported by multiprocessing for this machine.
num_cores = multiprocessing.cpu_count()
print(num_cores)
