### Data
- Define initial lists of company names

In [1]:
list_A = ["International Business Machines", "Proctor & Gamble Co", "The Walt Disney Company"]
list_B = [
    "International Bus. Machines", "IBM", "Intl. Business Machines Corp.",
    "P&G Co.", "Procter and Gamble Company", "Proctor & Gamble Corporation",
    "P G Co.", "The Disney Co.", "Walt Disney Corporation", "Disney Inc.",
    "Walt Disney", "IBM Corporation", "International Business Machines Corp",
    "Disney Enterprises Inc.", "Procter & Gamble Inc."
]

### Similarity Metric
- I choose Jaro-Winkler similarity since it gives more attention to beginnings of strings
- I find it especially useful for company names, as thay often end with "Inc." or "Co." that rarely pose significant meaning, while beginning with the actual company name (aside from articles, which we can treat separately)

In [2]:
from difflib import SequenceMatcher

def jaro_winkler_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

### Standardization Function
- Many of common acronyms and abbreviations can be standardized for comparison.

In [3]:
import re

def standardize(name):
    name = re.sub(r"\bThe\b", "", name, flags=re.IGNORECASE).strip()      # Remove "The" at the start
    name = re.sub(r"&", "and", name, flags=re.IGNORECASE)                 # Replace "&" with "and"
    name = re.sub(r"\band\b", "&", name, flags=re.IGNORECASE)             # Replace "and" with "&" if needed
    name = re.sub(r"\bCompany\b", "Co", name, flags=re.IGNORECASE)        # "Company" to "Co"
    name = re.sub(r"\bCorporation\b", "Corp", name, flags=re.IGNORECASE)  # "Corporation" to "Corp"
    
    return name.strip()

### Applying Standardization and Similarity

In [4]:
results = {}
for name_A in list_A:
    name_A_std = standardize(name_A)
    
    similarity_scores = []
    for name_B in list_B:
        name_B_std = standardize(name_B)
        similarity_score = jaro_winkler_similarity(name_A_std, name_B_std)
        similarity_scores.append((name_B, similarity_score))
        
    top_matches = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[:3]
    results[name_A] = top_matches

### Final Results

In [5]:
for r in results.items():
    print(f"\nList A: {r[0]}")
    for match in r[1]:
        print(f"Score: {round(match[1], 3)}\t{match[0]}")


List A: International Business Machines
Score: 0.925	International Business Machines Corp
Score: 0.897	International Bus. Machines
Score: 0.733	Intl. Business Machines Corp.

List A: Proctor & Gamble Co
Score: 0.95	Proctor & Gamble Corporation
Score: 0.947	Procter and Gamble Company
Score: 0.8	Procter & Gamble Inc.

List A: The Walt Disney Company
Score: 0.933	Walt Disney Corporation
Score: 0.88	Walt Disney
Score: 0.75	The Disney Co.
