# Classificatore mediante lista

Importo le librerie necessarie per lavorare con i dati.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os.path
import re


## Carico il dataset

In [3]:
# Load the reference table; get_class reads its "Compound" and "Class" columns.
database = pd.read_excel("Database.xlsx")
# Bare last expression so the notebook renders the frame.
database


Unnamed: 0,Compound,Class
0,"1,2,4-Benzenetricarboxylic acid, 1,2-dimethyl ester",Acid
1,"1,3-Benzenediol, 4,5-dimethyl-",Acid
2,2-Propenoic acid,Acid
3,Acetic acid,Acid
4,Butanoic acid,Acid
...,...,...
281,N-Butyl-tert-butylamine,OAR
282,Piperoxan,OAR
283,"Propanamide, N-(aminocarbonyl)-",OAR
284,Pyridine,OAR


## Carico il file di input con i dati da classificare

In [4]:
# Load the measurements to classify.
# NOTE(review): the name `input` shadows the Python built-in of the same name
# for the rest of the session; it is kept because the get_class call further
# down passes it as `input=input`.
input = pd.read_excel("Input.xlsx")
# Bare last expression so the notebook renders the frame.
input


Unnamed: 0,RT,compound name,area,% area,Match factor,formula,area/area
0,3.6318,Vinyl crotonate,3789.7,0.000108,85.1,C6H8O2,0.000279
1,41.8694,"2'-Hydroxy-4'-methoxyacetophenone, acetate",6425.2,0.000184,71.6,C11H12O4,0.000474
2,7.0294,Piperoxan,8386.0,0.000240,71.1,C14H19NO2,0.000618
3,4.1265,Toluene,14020.0,0.000401,73.4,C7H8,0.001034
4,15.3945,"3-Hexen-1-ol, acetate, (Z)-",14866.7,0.000425,80.9,C8H14O2,0.001096
...,...,...,...,...,...,...,...
407,2.1124,Methyl propionate,9189.7,0.000018,84.6,C4H8O2,0.000633
408,1.8454,Acetic acid,1731167.5,0.003334,97.9,C2H4O2,0.119330
409,1.8426,"Oxetane, 3-(1-methylethyl)-",155616.0,0.000300,72.6,C6H12O,0.010727
410,1.7403,"Acetaldehyde, hydroxy-",169419.7,0.000326,84.4,C2H4O2,0.011678


In [63]:
def get_class(database, input, save_dropped_compounds):
    """Filter the input compounds and classify the survivors against the database.

    A row of ``input`` is discarded when its ``area`` is below 1e5, its
    ``Match factor`` is below 70, or its ``formula`` contains one of the
    excluded element markers (F, Cl, S, P, Si). The remaining compounds are
    looked up by exact name in ``database["Compound"]``: a match receives the
    ``database["Class"]`` value of the first matching row, anything else is
    labelled ``"Unknown"``.

    This function was written by M.Griot; the original version is available at
    https://github.com/MGriot/Utility

    Parameters
    ----------
    database : pandas.DataFrame
        Reference table with the columns ``Compound`` and ``Class``.
    input : pandas.DataFrame
        Data to classify, with the columns ``RT``, ``compound name``,
        ``area``, ``% area``, ``Match factor``, ``formula`` and ``area/area``.
    save_dropped_compounds : bool
        If true, the discarded rows are written to "Dropped_compounds.xlsx"
        (skipped when that file already exists). If false, only the index
        labels of the discarded rows are printed and no file is written.

    Returns
    -------
    None
        Progress and a per-class count are printed; the classified table is
        written to "Classified_compounds.xlsx" unless that file already exists.
    """
    # Message shown when the function starts.
    print("Starting the classification process...")

    # Acceptance thresholds and the element markers that disqualify a formula.
    min_area = 1e5
    min_match_factor = 70
    # NOTE(review): markers are matched as substrings of each element token,
    # so "S" also catches tokens such as "Se"/"Sn" and "Si" is already covered
    # by "S". It is listed anyway to document the intent. Confirm whether an
    # exact element match is wanted.
    excluded_markers = ("F", "Cl", "S", "P", "Si")
    output_columns = ["RT", "compound name", "area", "% area",
                      "Match factor", "formula", "area/area"]

    no_match_value = []   # labels of rows failing the area / match-factor test
    errate_formulae = []  # labels of rows whose formula has an excluded marker

    # Collect the rows to remove.
    print("I start removing compounds in the input that don't suit the parameters.")
    rows = zip(input.index, input["area"], input["Match factor"], input["formula"])
    for label, area, match_factor, formula in tqdm(rows, total=len(input)):
        if area < min_area or match_factor < min_match_factor:
            no_match_value.append(label)

        # Missing formulas (e.g. NaN) are not strings and are simply not checked.
        if isinstance(formula, str):
            # Strip the digits, then split into tokens that each start with
            # an upper-case letter (one token per element symbol).
            tokens = re.findall('[A-Z][^A-Z]*', re.sub(r'[0-9]+', '', formula))
            if any(marker in token for token in tokens for marker in excluded_markers):
                errate_formulae.append(label)

    # Unique, sorted labels of every row that failed either test.
    flagged = no_match_value + errate_formulae
    dropped_rows = np.unique(flagged) if flagged else np.array([], dtype=int)

    if save_dropped_compounds:
        # The table of discarded rows is only built and written on request, so
        # the other branch never references an undefined variable.
        if os.path.isfile('Dropped_compounds.xlsx'):
            print("File \"Dropped_compounds.xlsx\" alredy exists in this directory.\nIf you want to obtain a new one, delete it.")
        else:
            print("File \"Dropped_compounds.xlsx\" will be saved in this directory.")
            dropped_compounds = input.loc[dropped_rows, output_columns].reset_index(drop=True)
            dropped_compounds.to_excel("Dropped_compounds.xlsx")
    else:
        print(f"Eliminated rows in input: {dropped_rows}")
        print("Remember, to find the compounds in the input file you need to add +1 to the previous numbers.\n")

    # Input reduced to the rows that passed every test.
    new = input.drop(dropped_rows)

    # Name -> class lookup built once; the first database row wins on duplicate
    # names, and missing names are left out so they can never match.
    known = database.dropna(subset=["Compound"]).drop_duplicates(subset="Compound", keep="first")
    class_by_compound = dict(zip(known["Compound"], known["Class"]))

    print("\nI begin to classify the input compounds based on the database.")
    class_compounds = []
    for name in tqdm(new["compound name"]):
        # Compounds absent from the database are classified as unknown.
        class_compounds.append(class_by_compound.get(name, "Unknown"))

    # Classified table: the kept rows with "Class" placed after the name.
    classified_compound = new[output_columns].reset_index(drop=True)
    classified_compound.insert(2, "Class", class_compounds)

    print("Report of your input after the classification:")
    # Number of compounds per class.
    print(classified_compound["Class"].value_counts())

    # Never overwrite an existing result file.
    if os.path.isfile('Classified_compounds.xlsx'):
        print("\nFile \"Classified_compounds.xlsx\" alredy exists in this directory.\nIf you want to obtain a new one, delete it.")
    else:
        print("\nFile \"Classified_compounds.xlsx\" will be saved in this directory.")
        classified_compound.to_excel("Classified_compounds.xlsx")

    print("\nSee you next time ;)")



In [64]:
# Run the filtering and classification on the frames loaded above.
# save_dropped_compounds=True asks for the discarded rows to be written to
# "Dropped_compounds.xlsx" (skipped when that file already exists).
get_class(database=database, input=input, save_dropped_compounds=True)


Starting the classification process...
I start removing compounds in the input that don't suit the parameters.


100%|██████████| 412/412 [00:00<00:00, 45732.63it/s]


File "Dropped_compounds.xlsx" alredy exists in this directory.
If you want to obtain a new one, delete it.

I begin to classify the input compounds based on the database.


100%|██████████| 228/228 [00:00<00:00, 4749.83it/s]

Report of your input after the classification:
Unknown                 88
 Ketones and Etheres    49
Alkyl Phenols           28
Guaiacols               17
Aromatics               12
OAR                      9
Acid                     9
Syringol                 8
Furans                   3
Catechol                 3
Sugars                   1
Methoxybenzenes          1
Name: Class, dtype: int64


File "Output.xlsx" alredy exists in this directory.
If you want to obtain a new one, delete it.

See you next time ;)





# Faster and easier method

In [1]:
import pandas as pd

from get_class import get_class 

In [2]:
# Load the reference table and the measurements to classify.
database = pd.read_excel("Database.xlsx")
# NOTE(review): `input` shadows the Python built-in; the name is kept because
# the call below passes it as `input=input`.
input = pd.read_excel("Input.xlsx")


In [3]:
# Same call as in the first part of the notebook, here using the function
# imported from the get_class module instead of a definition in a cell.
get_class(database=database, input=input, save_dropped_compounds=True)


Starting the classification process...
I start removing compounds in the input that don't suit the parameters.


100%|██████████| 412/412 [00:00<00:00, 34318.77it/s]


File "Dropped_compounds.xlsx" alredy exists in this directory.
If you want to obtain a new one, delete it.

I begin to classify the input compounds based on the database.


100%|██████████| 228/228 [00:00<00:00, 3123.36it/s]

Report of your input after the classification:
Unknown                 88
 Ketones and Etheres    49
Alkyl Phenols           28
Guaiacols               17
Aromatics               12
OAR                      9
Acid                     9
Syringol                 8
Furans                   3
Catechol                 3
Sugars                   1
Methoxybenzenes          1
Name: Class, dtype: int64

File "Classified_compounds.xlsx" alredy exists in this directory.
If you want to obtain a new one, delete it.

See you next time ;)



