## 1. Setup Environment

### 1.1 Import necessary Libraries (Pandas, Fuzzywuzzy)

In [3]:
import pandas as pd
import json

# Install required package if missing
try:
    from fuzzywuzzy import fuzz, process
except ImportError:
    !pip install fuzzywuzzy python-Levenshtein
    from fuzzywuzzy import fuzz, process



### 1.2 Set File paths

In [5]:
crm_path = "crm_data.json"
financial_path = "financial_data.json"
excel_path = "excel_data.xlsx"
staging_path = "staging_table.xlsx"
intermediate_path = "intermediate_validation.xlsx"
master_ref_path = "master_reference_table.xlsx"

## 2. Create Sample Data for System A, System B & Excel Data

### 2.1 Data for System A (JSON Format)

In [8]:
import json
import random

crm_companies = [
    "RapidSoft Consulting", "SkyNet Systems", "ACME Pte Ltd","Novartis Pte Ltd","Leung Kai Fook Medical Co Pte Ltd",
    "Alpha Tech Holdings", "Sunrise Innovations", "Icon Technologies Pte Ltd","Katra Phytochem Pte Ltd",
    "NextGen Solutions Pte Ltd", "EverGreen Technologies", "Zentact Systems Pte Ltd", "Mc Donalds Ltd",
    "Blue Ocean Group", "Global-X Corporation", "Quantum Dynamics","Zenith Infotech","Maker Trading Pte Ltd"
]

crm_data = [{"CompanyID": i+1, "CompanyName": random.choice(crm_companies)} 
            for i in range(100)]

with open("crm_data.json", "w") as f:
    json.dump(crm_data, f, indent=4)


### 2.2 Data for System B(JSON Format)

In [10]:
finance_companies = [
    "RapidSoft Ltd.", "SkyNet Ltd.", "ACME Limited","Novartis Ltd","LKFM Co Pte Ltd",
    "Alpha Technologies", "Sunrise Inc.", "Icon Technologies","Katra",
    "NextGen Solutions Ltd", "EverGreen Tech Ltd","Zentact Systems","Maker Trading",
    "Blue Ocean Ltd", "Global X Corp.", "Quantum Dyn Ltd.","Zenith", "McDonalds"
]

finance_data = [{"CompanyID": i+1, "CompanyName": random.choice(finance_companies)} 
               for i in range(100)]

with open("financial_data.json", "w") as f:
    json.dump(finance_data, f, indent=4)

### 2.3 Data for Excel

In [12]:
import pandas as pd
import random

excel_companies = [
    "Rapid Soft", "Sky Net", "Acme Inc.", "ALPHA TECH","Novartis","LKFM",
    "Sunrise Innovations Pte", "NextGen Solutions","Icon Tech", "Katra",
    "EverGreen Tech", "Blue Ocean", "Global X", "Zentact", "Maker Trading Ltd",
    "Quantum Dynamics Inc.", "Zenith Systems Ltd", "McD"
]

excel_data = pd.DataFrame({
    "CompanyID": range(1, 101),
    "CompanyName": [random.choice(excel_companies) for _ in range(100)]
})

excel_data.to_excel("excel_data.xlsx", index=False)

### 2.4 Verify the generated data

In [14]:

# Verify CRM data
with open("crm_data.json") as f:
    print("CRM Records:", len(json.load(f)))  # Should show 100

# Verify Financial data
with open("financial_data.json") as f:
    print("Financial Records:", len(json.load(f)))  # Should show 100

# Verify Excel data
print("Excel Records:", len(pd.read_excel("excel_data.xlsx")))  # Should show 100


CRM Records: 100
Financial Records: 100
Excel Records: 100


## 3. Loading Data

In [16]:
# Load source data
with open(crm_path) as f:
    crm_data = pd.DataFrame(json.load(f))
with open(financial_path) as f:
    financial_data = pd.DataFrame(json.load(f))
excel_data = pd.read_excel(excel_path)

# Add System Names
crm_data["System Name"] = "CRM"
financial_data["System Name"] = "FIN"
excel_data["System Name"] = "EXCEL"


## 4. Validate the basic assumptions

In [18]:
def clean_company_name(name):
    if pd.isna(name): return ""
    return (str(name).lower()
            .replace(",", "").strip()
            .replace(" ltd", " ").replace(" pte", " ")
            .replace(" inc", " ").replace(" corporation", " ")
            .replace(" corp", " ").replace(" limited", " ")
            .replace(" co", " ").replace("  ", " ")
           )

In [19]:
for df in [crm_data, financial_data, excel_data]:
    df["Prepared Company Name"] = df["CompanyName"].apply(clean_company_name)

## 5. Create Staging Table

Create staging table by merging records from all 3 datasets

In [21]:
staging_table = pd.concat([
    crm_data[["System Name", "CompanyID", "CompanyName", "Prepared Company Name"]],
    financial_data[["System Name", "CompanyID", "CompanyName", "Prepared Company Name"]],
    excel_data[["System Name", "CompanyID", "CompanyName", "Prepared Company Name"]]
], ignore_index=True)


In [22]:
staging_table.to_excel(staging_path, index=False)
print(f"Staging table saved to {staging_path}")


Staging table saved to staging_table.xlsx


## 6. Fuzzy Matching Logic

Apply fuzzy matching across all records in the dataset.  Each record in the dataset is matching with every other record creating a cartesan product of matches.

In [24]:
ref = staging_table[["CompanyID", "Prepared Company Name", "System Name", "CompanyName"]]

In [25]:
matches = []
for _, row in staging_table.iterrows():
    result = process.extractOne(
        row["Prepared Company Name"],
        ref["Prepared Company Name"],
        scorer=fuzz.token_sort_ratio
    )
    
    if result:
        match_name, score, index = result
        matched = ref.iloc[index]
        if( (row["System Name"] == matched["System Name"]) and (row["CompanyID"] == matched["CompanyID"]) ): 
            continue

        matches.append({
            "System Name": row["System Name"],
            "System ID": row["CompanyID"],
            "Company Name": row["CompanyName"],
            "Prepared Company Name": row["Prepared Company Name"],
            "Matched System Name": matched["System Name"],
            "Matched System ID": matched["CompanyID"],
            "Matched Company Name": matched["CompanyName"],
            "Matched Prepared Name": match_name,
            "Confidence": score
        })


## 7. Intermediate Validation Output

Save the output of fuzzy logic into an intermediate dataset

In [27]:
intermediate_df = pd.DataFrame(matches)

In [28]:
intermediate_df.to_excel(intermediate_path, index=False)
print(f"Intermediate validation saved to {intermediate_path}")

Intermediate validation saved to intermediate_validation.xlsx


## 8. User Validation (Simulated threshold)

Technically this dataset should be created after user validation of the above output.

For the sake of this exercise, we are assuming records with confidence > 90 are good to go.

In [30]:
validated_df = intermediate_df[intermediate_df["Confidence"] >= 90]

## 9. Load Master Reference Table

Save the validated data into master reference tables for multiple data applications to refer to

In [32]:
master_ref_table = validated_df[[
    "System Name", "System ID", "Company Name", "Prepared Company Name",
    "Matched System Name", "Matched System ID", "Matched Company Name",
    "Matched Prepared Name", "Confidence"
]]

In [33]:
master_ref_table.to_excel(master_ref_path, index=False)
print(f"Master reference table saved to {master_ref_path}")

Master reference table saved to master_reference_table.xlsx


## 10. Printing Output

In [35]:
master_ref_table

Unnamed: 0,System Name,System ID,Company Name,Prepared Company Name,Matched System Name,Matched System ID,Matched Company Name,Matched Prepared Name,Confidence
0,CRM,10,Icon Technologies Pte Ltd,icon technologies,CRM,7,Icon Technologies Pte Ltd,icon technologies,100
1,CRM,12,NextGen Solutions Pte Ltd,nextgen solutions,CRM,9,NextGen Solutions Pte Ltd,nextgen solutions,100
2,CRM,13,Leung Kai Fook Medical Co Pte Ltd,leung kai fook medical,CRM,4,Leung Kai Fook Medical Co Pte Ltd,leung kai fook medical,100
3,CRM,14,SkyNet Systems,skynet systems,CRM,8,SkyNet Systems,skynet systems,100
4,CRM,17,SkyNet Systems,skynet systems,CRM,8,SkyNet Systems,skynet systems,100
...,...,...,...,...,...,...,...,...,...
259,EXCEL,96,Zentact,zentact,EXCEL,31,Zentact,zentact,100
260,EXCEL,97,ALPHA TECH,alpha tech,EXCEL,41,ALPHA TECH,alpha tech,100
261,EXCEL,98,NextGen Solutions,nextgen solutions,CRM,9,NextGen Solutions Pte Ltd,nextgen solutions,100
262,EXCEL,99,Global X,global x,CRM,18,Global-X Corporation,global-x,100
