In [6]:
import pandas as pd
import json

In [7]:
df = pd.read_csv('Similarity_SMILES.csv')
similar_properties = df.merge(df[['CAN_SMILES', 'KS_gap']], left_on='Most Similar SMILES', right_on='CAN_SMILES', suffixes=('', '_similar'))

# Adding new columns to the original DataFrame
df['similar_gap'] = similar_properties['KS_gap_similar']
df.head()

Unnamed: 0,E_homo,E_lumo,KS_gap,CAN_SMILES,MW,Highest Tanimoto Score,Most Similar SMILES,similar_homo,similar_lumo,similar_gap
0,-5.17017,-2.28848,2.88169,C#Cc1[nH]ccc1c1csc2-c3c(C(=O)c12)ccs3,281.352,0.733333,CC#Cc1[nH]ccc1c1csc2-c3c(C(=O)c12)ccs3,-4.98513,-2.22317,2.76196
1,-5.29262,-2.47624,2.81638,O=C1c2c(-c3c1ccs3)scc2c1c2sccc2cc2c1ccs2,380.526,0.73913,Cc1csc2-c3c(C(=O)c12)c(cs3)c1c2sccc2cc2c1ccs2,-5.27357,-2.43542,2.83815
2,-5.30622,-2.40005,2.90617,Cc1sc2-c3sccc3C(=O)c2c1C1=CC(=C)c2c1csc2,338.466,0.777778,Cc1sc2-c3sccc3C(=O)c2c1C1=CC(=O)c2c1csc2,-5.56473,-2.5878,2.97693
3,-5.99195,-2.91162,3.08033,O=N(=O)c1cc2c(s1)c(sc2C(F)(F)F)c1csc2-c3c(C(=O...,443.463,0.714286,O=N(=O)c1cc2c(sc(c2cc1N(=O)=O)C(F)(F)F)c1csc2-...,-6.16882,-3.12659,3.04223
4,-5.31983,-2.44086,2.87897,Cc1cc2c(-c3c(C2=O)c(cs3)c2cc(ccc2N(=O)=O)c2csc...,409.501,0.759259,Cc1csc2-c3c(C(=O)c12)c(cs3)c1cc(ccc1N(=O)=O)c1...,-6.16882,-3.12659,3.04223


In [22]:
def create_diverse_templates(row):
    templates = []
    # Handle cases where HOMO or LUMO values need adjustment
    if row['E_homo'] < row['similar_homo']:
        templates.append({
            "input": f"I have a compound represented by this SMILES code {row['CAN_SMILES']} and its HOMO value is currently at {row['E_homo']}. Give me similar SMILES elevating its homo value.",
            "output": f"Explore the compound with SMILES {row['Most Similar SMILES']} which exhibits an augmented HOMO value of {row['similar_homo']} where we increased the homo value by {row['similar_homo'] - row['E_homo']}."
        })
    elif row['E_homo'] > row['similar_homo']:
        templates.append({
            "input": f"I possess a compound represented by this SMILES code {row['CAN_SMILES']} and its HOMO value from {row['E_homo']}. Recommend adjustments SMILES to diminish homo value.",
            "output": f"Examine the compound with SMILES {row['Most Similar SMILES']} showcasing a reduced HOMO value of {row['similar_homo']} where we reduced the homo value by {row['E_homo'] - row['similar_homo']}."
        })

    if row['E_lumo'] < row['similar_lumo']:
        templates.append({
            "input": f"I possess a compound represented by this SMILES code {row['CAN_SMILES']}. Propose changes to boost its LUMO value currently at {row['E_lumo']}.",
            "output": f"Explore the compound with SMILES {row['Most Similar SMILES']} which exhibits an augmented LUMO value of {row['similar_lumo']}."
        })
    elif row['E_lumo'] > row['similar_lumo']:
        templates.append({
            "input": f"I know a compound represented by this SMILES code {row['CAN_SMILES']} and its LUMO value is {row['E_lumo']}. Propose similar SMILES decreasing LUMO value.",
            "output": f"Examine the compound with SMILES {row['Most Similar SMILES']} showcasing a reduced LUMO value of {row['similar_lumo']} where we reduced lumo level by {row['E_lumo'] - row['similar_lumo']}."
        })

    # Handle complex scenarios where both values change
    if (row['E_homo'] < row['similar_homo']) and (row['E_lumo'] < row['similar_lumo']):
        templates.append({
            "input": f"Propose a compound similar to SMILES {row['CAN_SMILES']} with elevated HOMO and LUMO values.",
            "output": f"Consider the compound {row['Most Similar SMILES']} with enhanced HOMO value of ({row['similar_homo']}) and LUMO value of ({row['similar_lumo']}) levels."
        })
    if (row['E_homo'] > row['similar_homo']) and (row['E_lumo'] > row['similar_lumo']):
        templates.append({
            "input": f"Recommend a compound similar to {row['CAN_SMILES']} with lowered HOMO and LUMO values.",
            "output": f"Examine the compound {row['Most Similar SMILES']} with diminished HOMO value of ({row['similar_homo']}) and LUMO value of ({row['similar_lumo']}) levels."
        })
    if (row['E_homo'] < row['similar_homo']) and (row['E_lumo'] > row['similar_lumo']):
        templates.append({
            "input": f"Propose a compound similar to {row['CAN_SMILES']} with an elevated HOMO and a lowered LUMO.",
            "output": f"Consider the compound {row['Most Similar SMILES']} with an enhanced HOMO ({row['similar_homo']}) and diminished LUMO ({row['similar_lumo']})."
        })
    if (row['E_homo'] > row['similar_homo']) and (row['E_lumo'] < row['similar_lumo']):
        templates.append({
            "input": f"Recommend a compound similar to {row['CAN_SMILES']} with a lowered HOMO and an elevated LUMO.",
            "output": f"Examine the compound {row['Most Similar SMILES']} with a diminished HOMO value of ({row['similar_homo']}) and enhanced LUMO value of ({row['similar_lumo']})."
        })

    templates.append({
    "input": f"Identify a SMILES structure with a HOMO-LUMO gap similar to that of the molecule represented by {row['CAN_SMILES']}.",
    "output": f"The molecular gap between HOMO and LUMO for the molecule with SMILES {row['CAN_SMILES']} is {row['KS_gap']} eV. A closely related structure with a comparable gap of {row['similar_gap']} eV is represented by the SMILES {row['Most Similar SMILES']}."
})

    return templates

json_data = [item for sublist in df.apply(create_diverse_templates, axis=1) for item in sublist]

# Save to JSON file
import json
with open('prompt_dataset.jsonl', 'w') as f:
    json.dump(json_data, f, indent=4)

print("Conditional JSON templates with diversified language have been created and updated successfully.")


Conditional JSON templates with diversified language have been created and updated successfully.


In [11]:
print(df.head(1)['text'])

0    <s>[INST] Me gradué hace poco de la carrera de...
Name: text, dtype: object


In [24]:
import json
file_path = 'prompt_dataset_v2.jsonl'
with open(file_path, 'r') as file:
    data = json.load(file)
number_of_prompts = len(data)

print(f'There are {number_of_prompts} prompts in the JSON file.')

There are 376811 prompts in the JSON file.


In [20]:
def formatting_func(example):
    text = f"### Question: {example['input']}\n ### Answer: {example['output']}"
    return text
formatting_func(data[0])

'### Question: I possess a compound represented by this SMILES code C#Cc1[nH]ccc1c1csc2-c3c(C(=O)c12)ccs3. Propose changes to elevate its HOMO value currently at -5.17017.\n ### Answer: Explore the compound with SMILES CC#Cc1[nH]ccc1c1csc2-c3c(C(=O)c12)ccs3 which exhibits an augmented HOMO value of -4.98513.'

In [25]:
# Plotting function
def plot_data_lengths(tokenized_data):
    lengths = [len(x['input_ids']) for x in tokenized_data]

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=20, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')

    file_path = os.path.join(save_dir, 'input_ids_length_distribution.png')
    plt.show()
plot_data_lengths(tokenized_data)

Plot saved to ./plots\input_ids_length_distribution.png
