In [58]:
import pandas as pd
from difflib import SequenceMatcher

# Read the data into dataframes.
# M1 supplies the names to look up (Source_Name); M2 is the lookup table whose
# Galaxy / Luminosity / Distance columns are read by the matching loop.
M1 = pd.read_csv("./data/MCPMaser2023_rudmingb.csv")
M2 = pd.read_csv("./data/masers_with_lum_2018.csv")
# M3 is the output table: an independent copy of M1 that gains three columns.
M3 = M1.copy(deep=True)

# Placeholder values for rows that never get a match.
# The numeric placeholders are floats (0.0, not 0) so the columns start as
# float64: writing a non-integral value into an int64 column through .loc
# forces a silent upcast, which pandas has deprecated.
# NOTE(review): assumes M2's Luminosity and Distance are floating-point --
# TODO confirm against the 2018 CSV.
M3["lum_from_2018_paper"] = 0.0
M3["Dist_from_2018_paper"] = 0.0
M3["Name_in_2018_paper_db"] = ""


# Define the similarity calculation function
def calculate_similarity(string1, string2):
    """Return how alike two strings are, as a percentage.

    The score is ``difflib.SequenceMatcher.ratio()`` scaled from the 0-1
    range to 0-100; no junk heuristic is supplied (``isjunk`` is ``None``).
    """
    return SequenceMatcher(None, string1, string2).ratio() * 100


# Sanity-check statistics gathered while matching.
matched_count = 0
unmatched_count = 0
compare_similarity = 100  # lowest similarity seen among accepted matches
compare_msg = ""          # report text of that weakest accepted match
# NOTE(review): compare2sim / compare2msg are never updated below; they are
# kept only because the optional diagnostics at the end refer to compare2msg.
compare2sim = 0
compare2msg = ""

# For each M1.Source_Name, pick the most similar M2.Galaxy name that also
# clears a length-dependent similarity threshold, then copy that M2 row's
# luminosity, distance and name into M3.
# Series.items() yields the real index labels, so the .loc lookups below stay
# correct even if a frame no longer has the default 0..n-1 index.
for m1_label, source_name in M1['Source_Name'].items():
    if not isinstance(source_name, str):
        # A missing name is read by pandas as NaN (a float); len() would raise.
        unmatched_count += 1
        continue

    source_len = len(source_name)
    max_similarity = 0
    matching_entry = None
    m2_label = None
    final_min_sim = 0
    final_avg_len = 0

    # Compare the source_name with each entry in M2.Galaxy column
    for candidate_label, galaxy in M2['Galaxy'].items():
        if not isinstance(galaxy, str):
            # Same NaN guard for the lookup table.
            continue

        # Minimum similarity required, as a function of the average length of
        # the two names: shorter names must match more closely. The curve was
        # fitted to datapoints, see
        # https://www.desmos.com/calculator/aj8aa5qwar
        lengthy = (source_len + len(galaxy)) / 2
        min_similarity_threshold = (114 / (lengthy + 2.4)) + 55

        similarity = calculate_similarity(source_name, galaxy)
        # Strict ">" keeps the first candidate when several tie on similarity.
        if similarity > max_similarity and similarity >= min_similarity_threshold:
            max_similarity = similarity
            matching_entry = galaxy
            final_min_sim = min_similarity_threshold
            final_avg_len = lengthy
            m2_label = candidate_label

    # No candidate cleared its threshold.
    if matching_entry is None:
        unmatched_count += 1
        continue

    ### YAY! the name matches!
    msg = (
        "\n"
        f"\nM1.Source_Name: {source_name}"
        f"\nMatching entry in M2.Galaxy: {matching_entry}"
        f"\nHighest similarity: {max_similarity}"
        f"\nWith Min similarity: {final_min_sim}"
        f"\nAnd avg len: {final_avg_len}"
        "\n----------------------------------"
    )
    print(msg)
    matched_count += 1

    # Remember the weakest accepted match for manual inspection.
    if max_similarity < compare_similarity:
        compare_similarity = max_similarity
        compare_msg = msg

    M3.loc[m1_label, "lum_from_2018_paper"] = M2.loc[m2_label, "Luminosity"]
    M3.loc[m1_label, "Dist_from_2018_paper"] = M2.loc[m2_label, "Distance"]
    M3.loc[m1_label, "Name_in_2018_paper_db"] = M2.loc[m2_label, "Galaxy"]

# Optional diagnostics; uncomment to print the summary statistics.
# print(f"Number of Matches: {matched_count}")
# print(f"Number Left unmatched: {unmatched_count}")
# print(f"Lowest datapoint matched: {compare_msg}")
# print(f"Highest datapoint left unmatched: {compare2msg}")

M3.to_csv("./cleansed_data/MCPMasers2023_combined_with_2018_lum.csv", index=False)




M1.Source_Name: NGC_23
Matching entry in M2.Galaxy: NGC23
Highest similarity: 90.9090909090909
With Min similarity: 69.43037974683544
And avg len: 5.5
----------------------------------


M1.Source_Name: NGC_17
Matching entry in M2.Galaxy: NGC17
Highest similarity: 90.9090909090909
With Min similarity: 69.43037974683544
And avg len: 5.5
----------------------------------


M1.Source_Name: J0011-0054
Matching entry in M2.Galaxy: 2MASXJ00114518-0054303
Highest similarity: 62.5
With Min similarity: 61.19565217391305
And avg len: 16.0
----------------------------------


M1.Source_Name: J0027+4544
Matching entry in M2.Galaxy: 2MASXJ00272528+4544279
Highest similarity: 62.5
With Min similarity: 61.19565217391305
And avg len: 16.0
----------------------------------


M1.Source_Name: IC10
Matching entry in M2.Galaxy: IC10
Highest similarity: 100.0
With Min similarity: 72.8125
And avg len: 4.0
----------------------------------


M1.Source_Name: NGC_235A
Matching entry in M2.Galaxy: NGC235A
