In [46]:
import re
import unicodedata
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from thefuzz import fuzz

In [47]:
def normalize_company_name(name: str) -> str:
    """Reduce a company name to a canonical form for fuzzy comparison.

    The name is lowercased, combining accent marks are stripped, common
    legal-form tokens (AG, GmbH, S.A., S.A.R.L., ...) are removed, punctuation
    is replaced by spaces, and whitespace is collapsed and trimmed.

    Args:
        name: Raw company name. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as missing.

    Returns:
        The normalized name, or ``''`` when the input is missing or nothing
        but legal-form tokens and punctuation was present.
    """
    # Missing cells arrive as None/NaN; treat them as an empty name instead of
    # failing on ``.lower()``.
    if not isinstance(name, str):
        return ''

    # Lowercase the name first
    name = name.lower()

    # Remove accents: decompose characters (NFD) and drop the combining marks
    name = ''.join(
        c for c in unicodedata.normalize('NFD', name)
        if unicodedata.category(c) != 'Mn'
    )

    # List of common company add-ons to remove (extendable).
    # Order matters: the dotted "s.a.r.l." form must be tried before "s.a.",
    # otherwise the shorter pattern eats the "s.a." prefix and leaves "r.l."
    # behind.
    common_addons = [
        r'\bag\b', r'\bgmbh\b', r'\binc\b', r'\bltd\b', r'\bllc\b',
        r'\bs\.?a\.?r\.?l\.?\b', r'\bs\.?a\.?\b', r'\bkg\b', r'\bsagl\b',
        r'\bklg\b',
        r'\bco\b', r'\bsrl\b', r'\bspa\b', r'\bsnc\b', r'\bohg\b',
        r'\bbv\b', r'\bnv\b', r'\bse\b', r'\bplc\b', r'\bcorp\b',
        r'\bcorporation\b', r'\bholding\b', r'\bholdings\b'
    ]

    # Remove common add-ons
    for addon in common_addons:
        name = re.sub(addon, '', name)

    # Replace common symbols with spaces
    name = re.sub(r'[&/]', ' ', name)

    # Remove punctuation except for allowed characters (alphanumeric and whitespace)
    name = re.sub(r'[^\w\s]', ' ', name)

    # Collapse whitespace runs and trim the ends so that removed add-ons do
    # not leave stray leading/trailing spaces in the result
    name = re.sub(r'\s+', ' ', name).strip()

    return name


def calculate_similarity_core(name1, name2):
    """Score how similar two (already normalized) company names are.

    The score is a weighted blend of three measures:

    * 0.2 x cosine similarity of character-level TF-IDF vectors, fitted on
      just the two names being compared,
    * 0.2 x ``fuzz.ratio`` (scaled from 0-100 down to 0-1),
    * 0.6 x ``fuzz.partial_ratio`` (scaled from 0-100 down to 0-1).

    Args:
        name1: First name to compare.
        name2: Second name to compare.

    Returns:
        The combined score; ``0.0`` when either name is empty or contains
        only whitespace.
    """
    # An empty name carries no evidence of a match, and fitting the
    # vectorizer on empty strings fails with an empty-vocabulary ValueError.
    # Whitespace-only names are treated the same way so that two blank names
    # are not reported as a perfect match.
    if not name1.strip() or not name2.strip():
        return 0.0

    # Character-level TF-IDF cosine similarity
    tfidf = TfidfVectorizer(analyzer='char')
    tfidf_matrix = tfidf.fit_transform([name1, name2])
    tfidf_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    # Levenshtein ratio
    lev_ratio = fuzz.ratio(name1, name2) / 100

    # Partial ratio
    part_ratio = fuzz.partial_ratio(name1, name2) / 100

    # Combine scores with weights
    combined_score = (
        0.2 * tfidf_sim +
        0.2 * lev_ratio +
        0.6 * part_ratio
    )

    return combined_score

In [48]:
# Load the merger relations; the CSV file is semicolon-delimited.
merger_df = pd.read_csv(
    './merger_relation.csv',
    sep=';',
)

In [49]:
# Add a normalized copy of each company-name column.
merger_df['name_acquirer_norm'] = merger_df['name_acquirer'].apply(normalize_company_name)
merger_df['name_acquiree_norm'] = merger_df['name_acquiree'].apply(normalize_company_name)

# Score every acquirer/acquiree pair row by row on the normalized names.
merger_df['score'] = merger_df.apply(
    lambda row: calculate_similarity_core(
        row['name_acquirer_norm'],
        row['name_acquiree_norm'],
    ),
    axis=1,
)

In [50]:
# Write the pairs to Excel, highest similarity score first, without the index.
(
    merger_df
    .sort_values(by='score', ascending=False)
    .to_excel('merger_score.xlsx', index=False)
)