In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
# Prepared training dataset with 620+ entries => "training_dataset.csv"
df = pd.read_csv('../input/training_dataset.csv')
df

Unnamed: 0,Company Name,Similar Name,Same/Different,Category,Country same or not,Parent same or not,Unnamed: 6
0,HARRIS COUNTY CSCD,HARRIS COUNTY,same,one word missing,same,,
1,HARRIS COUNTY CSCD,HARRIS COUNTY ITC,same,last word different,same,,
2,RICHLAND COUNTY EMERGENCY SERVICES,LANDER COUNTY EMERGENCY SERVICES,different,first word different,same,,
3,CS RECURSOS GEOTERMICO ICE,CS RECURSOS GEOTAARMICO ICE,same,mispell,same,,
4,HARRIS COUNTY ITC,HARRIS COUNTY,same,one word missing,same,,
...,...,...,...,...,...,...,...
616,CHIFENG JILONG GOLD MINING,SHANGHAI AUTOMOTIVE INDUSTRY DEVELOPING,different,,,,
617,FEDERAL AUTHORITY FOR IDENTITY AND CITIZENSHIP...,TOSHIBA ELEVATOR AND BUILDING SYSTEMS,different,,,,
618,POSTFINANCE AG,SERVICIO NACIONAL MIGRACION,different,,,,
619,VODACOM PTY,36TH DISTRICT COURT,different,,,,


In [3]:
# List the column labels of the freshly loaded frame.
df.columns

Index(['Company Name', 'Similar Name', 'Same/Different', 'Category',
       'Country same or not', 'Parent same or not', 'Unnamed: 6'],
      dtype='object')

In [4]:
from thefuzz import fuzz

In [5]:
def calculate_ratio(row):
    """Score the row's two names with thefuzz's simple ratio.

    Reads ``row['Company Name']`` and ``row['Similar Name']`` and returns
    whatever ``fuzz.ratio`` yields for that pair.
    """
    company_name = row['Company Name']
    similar_name = row['Similar Name']
    return fuzz.ratio(company_name, similar_name)

def calculate_partial_ratio(row):
    """Score the row's two names with thefuzz's partial ratio.

    ``fuzz.partial_ratio`` is evaluated with the names in both argument
    orders and the larger of the two scores is returned.
    """
    company_name = row['Company Name']
    similar_name = row['Similar Name']
    forward_score = fuzz.partial_ratio(company_name, similar_name)
    reverse_score = fuzz.partial_ratio(similar_name, company_name)
    return max(forward_score, reverse_score)


def calculate_token_sort_ratio(row):
    """Score the row's two names with thefuzz's token sort ratio.

    Reads ``row['Company Name']`` and ``row['Similar Name']`` and returns
    whatever ``fuzz.token_sort_ratio`` yields for that pair.
    """
    company_name = row['Company Name']
    similar_name = row['Similar Name']
    return fuzz.token_sort_ratio(company_name, similar_name)

def calculate_token_set_ratio(row):
    """Score the row's two names with thefuzz's token set ratio.

    Reads ``row['Company Name']`` and ``row['Similar Name']`` and returns
    whatever ``fuzz.token_set_ratio`` yields for that pair.
    """
    company_name = row['Company Name']
    similar_name = row['Similar Name']
    return fuzz.token_set_ratio(company_name, similar_name)

In [6]:
def calculate_word_matching_percentage(row):
    """Return the percentage of distinct words the two names have in common.

    ``row['Company Name']`` and ``row['Similar Name']`` are split on
    whitespace and reduced to sets of unique words. The score is
    ``len(intersection) / len(union) * 100``.

    Returns:
        A float between 0.0 and 100.0. When neither name contains any word
        (both empty or whitespace-only) the union is empty and 0.0 is
        returned instead of raising ZeroDivisionError.
    """
    words1 = set(row['Company Name'].split())
    words2 = set(row['Similar Name'].split())

    all_words = words1 | words2
    if not all_words:
        # Nothing to compare on either side; avoid dividing by zero.
        return 0.0

    common_words = words1 & words2
    return (len(common_words) / len(all_words)) * 100

In [7]:
def first_word_match(row):
    """Tell whether both names begin with the same whitespace-delimited word.

    Returns False when ``row['Company Name']`` or ``row['Similar Name']``
    contains no words at all; otherwise returns the result of comparing the
    first word of each name for exact equality.
    """
    company_words = row['Company Name'].split()
    similar_words = row['Similar Name'].split()

    # A name with zero words has no first word to compare.
    if not company_words or not similar_words:
        return False
    return company_words[0] == similar_words[0]

In [8]:
def last_word_match(row):
    """Tell whether both names end with the same whitespace-delimited word.

    Returns False when ``row['Company Name']`` or ``row['Similar Name']``
    contains no words at all; otherwise returns the result of comparing the
    last word of each name for exact equality.
    """
    company_words = row['Company Name'].split()
    similar_words = row['Similar Name'].split()

    # A name with zero words has no last word to compare.
    if not company_words or not similar_words:
        return False
    return company_words[-1] == similar_words[-1]

In [9]:
def character_matching_percentage(row):
    """Return the percentage of positions at which the two names agree.

    Characters of ``row['Company Name']`` and ``row['Similar Name']`` are
    compared position by position from the start (index 0 with index 0, and
    so on) up to the end of the shorter name. The number of equal positions
    is divided by the length of the longer name and multiplied by 100.

    Returns:
        A float between 0.0 and 100.0. When both names are empty there is no
        position to compare and 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    string1 = row['Company Name']
    string2 = row['Similar Name']

    max_length = max(len(string1), len(string2))
    if max_length == 0:
        # Both names are empty; avoid dividing by zero.
        return 0.0

    # zip() stops at the shorter name, so surplus characters never match.
    matching_characters = sum(c1 == c2 for c1, c2 in zip(string1, string2))
    return (matching_characters / max_length) * 100

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(row):
    """Return the cosine similarity of the two names' token-count vectors.

    Each name has its runs of whitespace collapsed to single spaces, then a
    default ``CountVectorizer`` is fitted on the pair to produce one count
    vector per name. The scalar similarity between those two vectors, as
    computed by ``cosine_similarity``, is returned.
    """
    # Normalise whitespace in both names before vectorising.
    documents = [
        " ".join(row['Company Name'].split()),
        " ".join(row['Similar Name'].split()),
    ]

    # Row 0 holds the counts for the company name, row 1 for the similar name.
    count_matrix = CountVectorizer().fit_transform(documents).toarray()
    company_vector = count_matrix[0].reshape(1, -1)
    similar_vector = count_matrix[1].reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix here; unwrap the single value.
    similarity_matrix = cosine_similarity(company_vector, similar_vector)
    return similarity_matrix[0, 0]

# # Example usage:
# string1 = "HARRIS COUNTY CSCD"
# string2 = "HARRIS COUNTY"

# cosine_similarity_value = calculate_cosine_similarity({'Company Name': string1, 'Similar Name': string2})

# print("String 1:", string1)
# print("String 2:", string2)
# print("Cosine Similarity:", cosine_similarity_value)


In [11]:
def get_ngrams(text, n):
    """
    Return every contiguous slice of length ``n`` from ``text``, left to right.

    Repeated slices are kept. The list is empty when ``text`` is shorter
    than ``n``.
    """
    window_count = len(text) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(text[start:start + n])
    return grams

def jaccard_similarity(set1, set2):
    """
    Return the Jaccard index of two sets: shared elements over total distinct
    elements.

    The union size is derived as ``len(set1) + len(set2) - shared``. When
    that size is zero (both sets empty) the result is 0.
    """
    shared = len(set1.intersection(set2))
    combined = len(set1) + len(set2) - shared
    if combined == 0:
        return 0
    return shared / combined

def calculate_ngrams_similarity(row):
    """Return the Jaccard similarity of the two names' character bigrams.

    The distinct 2-character slices of ``row['Company Name']`` and
    ``row['Similar Name']`` are collected with ``get_ngrams`` and compared
    with ``jaccard_similarity``.
    """
    company_bigrams = set(get_ngrams(row['Company Name'], 2))
    similar_bigrams = set(get_ngrams(row['Similar Name'], 2))
    return jaccard_similarity(company_bigrams, similar_bigrams)
    



In [12]:
def common_prefix_percentage(row):
    """
    Return the length of the shared leading run of characters as a percentage
    of the shorter name's length.

    Compares ``row['Company Name']`` and ``row['Similar Name']`` character by
    character from the start and stops at the first mismatch. Returns 0.0
    when either name is empty.
    """
    string1 = row['Company Name']
    string2 = row['Similar Name']

    shorter_len = min(len(string1), len(string2))
    if shorter_len == 0:
        # No characters to compare; also avoids dividing by zero.
        return 0.0

    matched = 0
    for left_char, right_char in zip(string1, string2):
        if left_char != right_char:
            break
        matched += 1

    return (matched / shorter_len) * 100

In [13]:
def common_suffix_percentage(row):
    """
    Return the length of the shared trailing run of characters as a
    percentage of the shorter name's length.

    Compares ``row['Company Name']`` and ``row['Similar Name']`` character by
    character from the end and stops at the first mismatch. Returns 0.0 when
    either name is empty.
    """
    string1 = row['Company Name']
    string2 = row['Similar Name']

    shorter_len = min(len(string1), len(string2))
    if shorter_len == 0:
        # No characters to compare; also avoids dividing by zero.
        return 0.0

    matched = 0
    # Walk both names backwards in lockstep; zip stops at the shorter one.
    for left_char, right_char in zip(reversed(string1), reversed(string2)):
        if left_char != right_char:
            break
        matched += 1

    return (matched / shorter_len) * 100

In [14]:
def common_prefix_suffix_percentage(row):
    """Return a combined common-prefix / common-suffix score as a percentage.

    For ``row['Company Name']`` and ``row['Similar Name']`` the function
    counts the shared leading characters (``prefix_len``) and the shared
    trailing characters (``suffix_len``) and returns::

        (prefix_len + suffix_len)
        / (len(name1) + len(name2) - prefix_len - suffix_len) * 100

    The suffix may only use characters of the shorter name that the prefix
    has not already consumed. Without that cap the two runs overlap: two
    identical names produced a zero denominator (and a score of 0), and
    pairs such as "aa" / "aaa" scored above 100. With the cap, identical
    names score 100.0 and the result never exceeds 100. Pairs whose prefix
    and suffix do not overlap score exactly as before.

    Returns:
        A float between 0.0 and 100.0; 0.0 when both names are empty.
    """
    string1 = row['Company Name']
    string2 = row['Similar Name']
    min_len = min(len(string1), len(string2))

    # Length of the common prefix.
    prefix_len = 0
    for char1, char2 in zip(string1, string2):
        if char1 != char2:
            break
        prefix_len += 1

    # Length of the common suffix, limited to the characters of the shorter
    # name that are not already part of the common prefix.
    suffix_budget = min_len - prefix_len
    suffix_len = 0
    for char1, char2 in zip(reversed(string1), reversed(string2)):
        if suffix_len >= suffix_budget or char1 != char2:
            break
        suffix_len += 1

    total_len = len(string1) + len(string2) - prefix_len - suffix_len
    if total_len == 0:
        # Only reachable when both names are empty.
        return 0.0
    return (prefix_len + suffix_len) / total_len * 100


In [15]:
# Similarity features to add, in output-column order, each paired with the
# row-wise function that computes it from 'Company Name' and 'Similar Name'.
feature_functions = {
    'Simple Ratio': calculate_ratio,
    'partial Ratio': calculate_partial_ratio,
    'Token Sort Ratio': calculate_token_sort_ratio,
    'Token Set Ratio': calculate_token_set_ratio,
    'word match percentage': calculate_word_matching_percentage,
    'first word match': first_word_match,
    'last word match': last_word_match,
    'character_matching_percentage': character_matching_percentage,
    'cosine similarity': calculate_cosine_similarity,
    'ngrams': calculate_ngrams_similarity,
    'common prefix': common_prefix_percentage,
    'common suffix': common_suffix_percentage,
    'common prefix suffix': common_prefix_suffix_percentage,
}

# Apply each function to every row and store the result under its column name.
for feature_name, feature_function in feature_functions.items():
    df[feature_name] = df.apply(feature_function, axis=1)

In [16]:
# Plain-text dump of the frame, now including the added feature columns.
print(df)

                                          Company Name  \
0                                   HARRIS COUNTY CSCD   
1                                   HARRIS COUNTY CSCD   
2                   RICHLAND COUNTY EMERGENCY SERVICES   
3                           CS RECURSOS GEOTERMICO ICE   
4                                    HARRIS COUNTY ITC   
..                                                 ...   
616                         CHIFENG JILONG GOLD MINING   
617  FEDERAL AUTHORITY FOR IDENTITY AND CITIZENSHIP...   
618                                     POSTFINANCE AG   
619                                        VODACOM PTY   
620                     KING ABDULLA MEDICAL CITY KAMC   

                                Similar Name Same/Different  \
0                              HARRIS COUNTY           same   
1                          HARRIS COUNTY ITC           same   
2           LANDER COUNTY EMERGENCY SERVICES      different   
3                CS RECURSOS GEOTAARMICO ICE       

In [17]:
# List the column labels again now that the similarity features are present.
df.columns

Index(['Company Name', 'Similar Name', 'Same/Different', 'Category',
       'Country same or not', 'Parent same or not', 'Unnamed: 6',
       'Simple Ratio', 'partial Ratio', 'Token Sort Ratio', 'Token Set Ratio',
       'word match percentage', 'first word match', 'last word match',
       'character_matching_percentage', 'cosine similarity', 'ngrams',
       'common prefix', 'common suffix', 'common prefix suffix'],
      dtype='object')

In [18]:
# Remove the annotation columns that are not part of the exported dataset.
columns_to_drop = ['Category', 'Country same or not', 'Parent same or not','Unnamed: 6']
df.drop(columns=columns_to_drop, inplace=True)

In [19]:
# Encode the label column: 'same' -> 1, 'different' -> 0.
# The nested mapping limits the replacement to 'Same/Different'; a flat
# mapping would also rewrite any cell equal to 'same' or 'different' in every
# other column of the frame (for example a name that is literally 'same').
df.replace({'Same/Different': {'same': 1, 'different': 0}}, inplace=True)
df['Same/Different']

0      1
1      1
2      0
3      1
4      1
      ..
616    0
617    0
618    0
619    0
620    0
Name: Same/Different, Length: 621, dtype: object

In [20]:
# Give the encoded label column its final name, 'Target'.
df.rename({'Same/Different': 'Target'}, axis='columns', inplace=True)

In [21]:
# Write the result to "training_dataset_24_1.csv" with only the 16 required columns:
# Company Name, Similar Name, Target, Simple Ratio, partial Ratio, Token Sort Ratio, Token Set Ratio,
# word match percentage, first word match, last word match, character_matching_percentage,
# cosine similarity, ngrams, common prefix, common suffix, common prefix suffix.
# index=False leaves the DataFrame index out of the file.
df.to_csv('../output/training_dataset_24_1.csv', index=False)