In [None]:
import subprocess
import glob
from pathlib import Path

import pandas as pd
import csv
from config import BColors, BASE_DIR
 
# Location of the mol2vec model file; sdf2vec passes it to the CLI's -m option.
MODEL_FILE_PATH = f'{BASE_DIR}/Model/model_300dim.pkl'


def sdf2vec(SDF_FILE,CSV_FILES_DIR):
    """
    Featurize one sdf file with the mol2vec command line tool.

    Routine takes a sdf file as input, executes a featurization according to
    https://mol2vec.readthedocs.io/en/latest/ and lets mol2vec write a csv file
    including smiles information and a 300d mol2vec representation of the compound.

    * for the mapping the model file MODEL_FILE_PATH (model_300dim.pkl) is used
      (per the original note it relies on about 19.9 million molecules)

    * requires RDKit for the mol2vec library: https://github.com/samoturk/mol2vec/

    * executes CLI routine: mol2vec featurize -i infile.sdf -o outfile.csv -m fullpath2model -r 1 --uncommon UNK

    input:
      SDF_FILE      - path of the sdf file to featurize
      CSV_FILES_DIR - directory receiving the csv file (trailing separator optional)
    output:
      None; the csv file is named after the part of the sdf file name in front
      of its first dot. A failed run is reported on the console, not raised.
    """
    # Historical naming rule: keep everything in front of the first dot.
    outfile = Path(SDF_FILE).name.split(".")[0]
    csv_path = str(Path(CSV_FILES_DIR) / (outfile + '.csv'))

    # Argument list + no shell: paths with spaces or shell metacharacters are
    # passed to mol2vec verbatim instead of being re-parsed by a shell.
    command = [
        'mol2vec', 'featurize',
        '-i', str(SDF_FILE),
        '-o', csv_path,
        '-m', MODEL_FILE_PATH,
        '-r', '1',
        '--uncommon', 'UNK',
    ]

    # The original working directory only exists on one machine; use it when it
    # is present and otherwise inherit the caller's working directory.
    legacy_cwd = "C:\\Users\\momin\\python\\project\\PubChem-ML-code\\"
    cwd = legacy_cwd if Path(legacy_cwd).is_dir() else None

    print(f'{BColors.WARNING}FEATURIZING SDF{BColors.ENDC}')
    try:
        return_code = subprocess.call(command, cwd=cwd)
    except OSError as error:
        # e.g. the mol2vec executable is not on PATH
        print(f'{BColors.WARNING}FEATURIZATION FAILED: could not start mol2vec ({error}){BColors.ENDC}')
        return
    if return_code != 0:
        print(f'{BColors.WARNING}FEATURIZATION FAILED (exit code {return_code}): {SDF_FILE}{BColors.ENDC}')
        return
    print(f'{BColors.OKGREEN}FEATURIZATION DONE{BColors.ENDC}')


def mass_featurize_sdf2vec(SDF_FILES_DIR,CSV_FILES_DIR):
    """
    Auxiliary routine to do a mapping from sdf to vec representation for all objects within a directory

    input:
      SDF_FILES_DIR - directory whose *.sdf files are featurized (trailing separator optional)
      CSV_FILES_DIR - directory handed to sdf2vec for the corresponding csv files
    output:
      None; sdf2vec is called once per sdf file, in sorted file name order
    """
    # Join through Path so the pattern is correct with or without a trailing
    # separator on SDF_FILES_DIR (plain '+' would yield e.g. 'dir*.sdf').
    pattern = str(Path(SDF_FILES_DIR) / '*.sdf')
    # glob order depends on the filesystem; sort for reproducible runs.
    for sdf_file in sorted(glob.glob(pattern)):
        print(f'{BColors.OKCYAN}PROCESSING FILE: ', sdf_file + f'{BColors.ENDC}')
        sdf2vec(sdf_file,CSV_FILES_DIR)


def molvec_extractor(INPUT_CSV):
    """
    Auxiliary routine which pulls the mol2vec columns out of one csv file
    in: csv file with smiles information and vector representation
    out: 2d array holding, per csv row, the values of every column whose
         name contains 'mol2vec'
    """
    return pd.read_csv(INPUT_CSV).filter(regex='mol2vec').values


def extract_vectorized_molecules(INPUT_DIR, OUTPUT_CSV_FILENAME):
    """
    Routine reads in all csv files with smiles and mol2vec information from a given directory

    input:
      INPUT_DIR           - directory holding the csv files (trailing separator optional)
      OUTPUT_CSV_FILENAME - output file name without extension; '.csv' is appended
    returns
    * dictionary with compound id as key and the list of vectors read from
      that compound's csv file as value; the compound id is the part of the
      csv file name in front of its first dot
    * as a side effect, a csv file with only the vectors (one vector per row,
      no header) which can then be used for autoenc-clustering
    """
    output_path = Path(OUTPUT_CSV_FILENAME + '.csv')
    molvec_dict = {}

    # Path join keeps the pattern valid without a trailing separator; sorting
    # makes the row order of the output file reproducible.
    pattern = str(Path(INPUT_DIR) / '*.csv')
    for csv_file in sorted(glob.glob(pattern)):
        csv_path = Path(csv_file)
        # An output file left in INPUT_DIR by an earlier run is not a compound file.
        if csv_path.resolve() == output_path.resolve():
            continue
        # get the compound id from the csv filename:
        comp_id = csv_path.name.split(".")[0]
        molvec_dict[comp_id] = list(molvec_extractor(csv_file))

    # routine to write all vectors to a csv file
    with open(output_path, 'w', newline='') as file:
        writer = csv.writer(file)
        for compound_vectors in molvec_dict.values():
            # each entry is one full vector, written as one csv row
            writer.writerows(compound_vectors)
    return molvec_dict