In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np
import sacrebleu
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


# Evaluation set shared by every experiment in this notebook. The evaluation
# functions below read its 'Input' and 'Output' columns row by row.
# The path is relative, so it is resolved against the kernel's working directory.
testing_file_path = 'files/Testing.xlsx'
testing_data = pd.read_excel(testing_file_path)

In [18]:


def find_nearest_neighbor_lev(query, vectorizer, X, data):
    """Return the row of ``data`` whose 'Input' has the smallest edit distance to ``query``.

    Args:
        query: String to look up.
        vectorizer: Not used by this function.
        X: Not used by this function.
        data: DataFrame with 'Input' and 'Output' columns.

    Returns:
        Tuple ``(nearest_input, nearest_output, edit_distance)``. On ties the
        first row with the minimal distance wins (``np.argmin`` semantics).
    """
    candidates = data['Input']
    # One edit distance per candidate, in row order.
    edit_distances = [nltk.edit_distance(query, candidate) for candidate in candidates]
    best = np.argmin(edit_distances)
    return candidates.iloc[best], data['Output'].iloc[best], edit_distances[best]

def nearest_neighbor_lev(testing_data, vectorizer, X, data):
    """Score the edit-distance nearest-neighbour predictor on ``testing_data``.

    Each test row's 'Input' is looked up with ``find_nearest_neighbor_lev`` and
    the retrieved output is compared to the row's 'Output' for exact equality.

    Returns:
        Tuple ``(accuracy, mean_distance, predictions)`` where ``accuracy`` is the
        fraction of exact matches, ``mean_distance`` is the average distance to
        the retrieved neighbour and ``predictions`` lists the retrieved outputs
        in test-row order.
    """
    predictions = []
    distances = []
    n_exact = 0

    for _, sample in testing_data.iterrows():
        question, expected = sample['Input'], sample['Output']
        _, guess, dist = find_nearest_neighbor_lev(question, vectorizer, X, data)
        predictions.append(guess)
        distances.append(dist)
        n_exact += 1 if guess == expected else 0

    n_samples = len(testing_data)
    exact_match_rate = n_exact / n_samples
    average_distance = sum(distances) / n_samples
    return exact_match_rate, average_distance, predictions

def find_nearest_neighbor_euc(query, vectorizer, X, data):
    """Return the row of ``data`` closest to ``query`` in Euclidean distance.

    Args:
        query: String to look up.
        vectorizer: Object whose ``transform`` maps a list of strings to vectors.
        X: Vectors to search, compared against the transformed query.
           # assumes row i of X corresponds to row i of ``data`` — confirm at call site
        data: DataFrame with 'Input' and 'Output' columns.

    Returns:
        Tuple ``(nearest_input, nearest_output, distance)``.
    """
    dist_row = euclidean_distances(vectorizer.transform([query]), X)
    closest = np.argmin(dist_row)
    return (
        data['Input'].iloc[closest],
        data['Output'].iloc[closest],
        dist_row[0, closest],
    )

def nearest_neighbor_euc(testing_data, vectorizer, X, data):
    """Score the Euclidean nearest-neighbour predictor on ``testing_data``.

    Each test row's 'Input' is looked up with ``find_nearest_neighbor_euc`` and
    the retrieved output is compared to the row's 'Output' for exact equality.

    Returns:
        Tuple ``(accuracy, mean_distance, predictions)`` where ``accuracy`` is the
        fraction of exact matches, ``mean_distance`` is the average distance to
        the retrieved neighbour and ``predictions`` lists the retrieved outputs
        in test-row order.
    """
    predictions = []
    distances = []
    n_exact = 0

    for _, sample in testing_data.iterrows():
        question, expected = sample['Input'], sample['Output']
        _, guess, dist = find_nearest_neighbor_euc(question, vectorizer, X, data)
        predictions.append(guess)
        distances.append(dist)
        n_exact += 1 if guess == expected else 0

    n_samples = len(testing_data)
    exact_match_rate = n_exact / n_samples
    average_distance = sum(distances) / n_samples
    return exact_match_rate, average_distance, predictions

def find_nearest_neighbor_cos(query, vectorizer, X, data):
    """Return the row of ``data`` most similar to ``query`` by cosine similarity.

    Args:
        query: String to look up.
        vectorizer: Object whose ``transform`` maps a list of strings to vectors.
        X: Vectors to search, compared against the transformed query.
           # assumes row i of X corresponds to row i of ``data`` — confirm at call site
        data: DataFrame with 'Input' and 'Output' columns.

    Returns:
        Tuple ``(nearest_input, nearest_output, similarity)``.
    """
    # The query must go through the same vectorizer that produced X.
    sims = cosine_similarity(vectorizer.transform([query]), X)
    # Highest similarity wins (argmax, unlike the distance-based finders).
    winner = np.argmax(sims)
    best_input = data['Input'].iloc[winner]
    best_output = data['Output'].iloc[winner]
    return best_input, best_output, sims[0, winner]

def nearest_neighbor_cosine(testing_data, vectorizer, X, data):
    """Score the cosine-similarity nearest-neighbour predictor on ``testing_data``.

    Each test row's 'Input' is looked up with ``find_nearest_neighbor_cos`` and
    the retrieved output is compared to the row's 'Output' for exact equality.

    Returns:
        Tuple ``(accuracy, mean_similarity, predictions)`` where ``accuracy`` is
        the fraction of exact matches, ``mean_similarity`` is the average
        similarity to the retrieved neighbour and ``predictions`` lists the
        retrieved outputs in test-row order.
    """
    predictions = []
    similarities = []
    n_exact = 0

    for _, sample in testing_data.iterrows():
        question, expected = sample['Input'], sample['Output']
        _, guess, sim = find_nearest_neighbor_cos(question, vectorizer, X, data)
        predictions.append(guess)
        similarities.append(sim)
        n_exact += 1 if guess == expected else 0

    n_samples = len(testing_data)
    exact_match_rate = n_exact / n_samples
    average_similarity = sum(similarities) / n_samples
    return exact_match_rate, average_similarity, predictions


def calculate_bleu_score(references, predictions):
    """Return the corpus-level BLEU score of ``predictions`` against ``references``.

    Both sequences are converted element-wise with ``str`` before scoring, and
    ``references`` is passed to sacrebleu as a single reference stream.

    Returns:
        The ``.score`` attribute of the sacrebleu result.
    """
    reference_texts = list(map(str, references))
    hypothesis_texts = list(map(str, predictions))
    return sacrebleu.corpus_bleu(hypothesis_texts, [reference_texts]).score



def find_nearest_neighbor_euc_fp(query, query_fp, data_fp, data=None):
    """Return the row of ``data`` whose precomputed vector is closest to ``query_fp``.

    Args:
        query: Not used by this function.
        query_fp: Precomputed feature vector of the query (1-D).
        data_fp: Precomputed feature vectors to search.
            # assumes row i of data_fp corresponds to row i of ``data`` — confirm at call site
        data: DataFrame with 'Input' and 'Output' columns. Optional only for
            backward compatibility: legacy three-argument calls fall back to a
            module-level ``data`` variable, as the previous implementation did.

    Returns:
        Tuple ``(nearest_input, nearest_output, distance)``.

    Raises:
        NameError: if ``data`` is omitted and no module-level ``data`` exists.
    """
    if data is None:
        # Legacy path: the old body read ``data`` straight from notebook globals.
        try:
            data = globals()['data']
        except KeyError:
            raise NameError("name 'data' is not defined; pass `data` explicitly") from None

    distances = euclidean_distances([query_fp], data_fp)
    nearest_neighbor_idx = np.argmin(distances)
    nearest_neighbor_input = data['Input'].iloc[nearest_neighbor_idx]
    nearest_neighbor_output = data['Output'].iloc[nearest_neighbor_idx]
    nearest_neighbor_distance = distances[0, nearest_neighbor_idx]
    return nearest_neighbor_input, nearest_neighbor_output, nearest_neighbor_distance

def nearest_neighbor_euc_fp(testing_data, testing_fp, datafp, data):
    """Score the Euclidean nearest-neighbour predictor using precomputed vectors.

    Args:
        testing_data: DataFrame with 'Input' and 'Output' columns.
        testing_fp: Precomputed query vectors, looked up with each test row's
            index label (``testing_fp[index]``).
            # assumes testing_data has a default 0..n-1 index when testing_fp is
            # a positional array — confirm at call site
        datafp: Precomputed vectors for the rows of ``data``.
        data: DataFrame with 'Input' and 'Output' columns to retrieve from.

    Returns:
        Tuple ``(accuracy, mean_distance, predicted_outputs)``.
    """
    correct_predictions = 0
    total_distance_score = 0
    predicted_outputs = []
    for index, row in testing_data.iterrows():
        query = row['Input']
        actual_output = row['Output']
        # Use this function's own arguments (``datafp``, ``data``); the previous
        # body referenced an undefined ``data_fp`` and never forwarded ``data``.
        _, predicted_output, distance_score = find_nearest_neighbor_euc_fp(
            query, testing_fp[index], datafp, data
        )
        predicted_outputs.append(predicted_output)
        total_distance_score += distance_score
        if predicted_output == actual_output:
            correct_predictions += 1
    accuracy = correct_predictions / len(testing_data)
    mean_distance_score = total_distance_score / len(testing_data)

    return accuracy, mean_distance_score, predicted_outputs

In [6]:
def generate_orderings(reaction):
    """Return two orderings of a ``reactants>>products`` reaction string.

    The first entry keeps the '.'-separated components of each side in their
    given order; the second reverses the component order on both sides.

    Raises:
        ValueError: if ``reaction`` does not contain exactly one '>>'.
    """
    lhs, rhs = reaction.split('>>')
    # Splitting on '.' and re-joining is an identity, so the forward ordering
    # is simply the two sides glued back together.
    forward = f"{lhs}>>{rhs}"
    backward = '>>'.join('.'.join(side.split('.')[::-1]) for side in (lhs, rhs))
    return [forward, backward]



In [9]:
# Try it out: Euclidean nearest-neighbour retrieval on the 50-example training file.
# The training frame, vectorizer and TF-IDF matrix are built in this cell (same
# settings as the sweep further down) so that it runs on a fresh kernel instead of
# depending on variables that are only assigned by a later cell.
data = pd.read_excel('files/50.xlsx')
vectorizer = TfidfVectorizer(lowercase=False, analyzer='char', norm='l2')
X = vectorizer.fit_transform(data['Input'])

# NOTE: for the Euclidean variant this value is a mean *distance*; the variable
# name and printed label are kept unchanged to match the recorded output.
accuracy, mean_similarity_score, predicted_outputs = nearest_neighbor_euc(testing_data, vectorizer, X, data)
print('Mean similarity score: ',mean_similarity_score)

Mean similarity score:  0.3066181108337127


In [10]:
# Sweep every retrieval metric over every training-set size and report, for each
# combination, the mean score returned by the evaluator plus corpus BLEU.
actual_outputs = testing_data['Output'].tolist()

# Printed label -> evaluation function; dict order fixes the reporting order.
evaluators = {
    'Euclidean:': nearest_neighbor_euc,
    'Cosine:': nearest_neighbor_cosine,
    'Levenstein:': nearest_neighbor_lev,
}

for ftype, evaluate in evaluators.items():
    print(ftype)
    for num in [50,150,500,1000]:
        file_path = f'files/{num}.xlsx'
        data = pd.read_excel(file_path)

        inputs = data['Input']
        outputs = data['Output']

        # Character-level TF-IDF, case-sensitive, L2-normalised rows.
        vectorizer = TfidfVectorizer(lowercase=False,analyzer='char',norm='l2')
        X = vectorizer.fit_transform(inputs)

        accuracy, mean_similarity_score, predicted_outputs = evaluate(testing_data, vectorizer, X, data)
        print('Mean similarity score: ',mean_similarity_score)
        bleu_score = sacrebleu.corpus_bleu(predicted_outputs, [actual_outputs])
        print(f'BLEU score: {bleu_score.score:.4f}')
        print(f'Individual n-gram precisions: {bleu_score.precisions}')
        print()
    print()

Euclidean:
Mean similarity score:  0.3066181108337127
BLEU score: 37.6851
Individual n-gram precisions: [66.44336175395858, 49.52341483630336, 35.27295810410495, 23.562472978815393]

Mean similarity score:  0.2588266806592981
BLEU score: 38.6473
Individual n-gram precisions: [64.41260744985674, 47.69230769230769, 33.74379344587885, 22.634245187436676]

Mean similarity score:  0.2065645370996079
BLEU score: 41.5899
Individual n-gram precisions: [64.7145488029466, 49.587242026266416, 36.539196940726576, 25.516569200779728]

Mean similarity score:  0.17324876964213054
BLEU score: 45.3780
Individual n-gram precisions: [67.41530975107618, 53.3091741369445, 40.36554540151663, 29.22863374975213]


Cosine:
Mean similarity score:  0.9477122437723299
BLEU score: 37.6851
Individual n-gram precisions: [66.44336175395858, 49.52341483630336, 35.27295810410495, 23.562472978815393]

Mean similarity score:  0.9606469236595777
BLEU score: 38.6473
Individual n-gram precisions: [64.41260744985674, 47.6923