In [26]:
# Core
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning / Stats
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [27]:
data = pd.read_csv('/Applications/WorkDataSets/DataStore/Data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4993 entries, 0 to 4992
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Account Number / Name (AR)  4993 non-null   object
 1   Account Type                4993 non-null   object
 2   Account Segmentation        4993 non-null   object
 3   Account Line of Business    4993 non-null   object
 4   Territory Level14           4993 non-null   object
 5   Employee Name               4943 non-null   object
 6   Citrix Email Address        4730 non-null   object
 7   Party Number                4993 non-null   object
 8   Ceo 3                       4929 non-null   object
 9   Ceo 2                       4943 non-null   object
 10  Citrix Email                4572 non-null   object
 11  Customer ID                 4993 non-null   object
dtypes: object(12)
memory usage: 468.2+ KB


In [28]:
PLURAL = pd.read_csv('/Applications/WorkDataSets/DataSets/PLURAL.csv')
Er.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17559 entries, 0 to 17558
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Emails             13729 non-null  object
 1   CustomerId_NAR     13729 non-null  object
 2   party_number       13729 non-null  object
 3   company_name       13729 non-null  object
 4   account_type       13729 non-null  object
 5   country            13729 non-null  object
 6   line_of_business   13729 non-null  object
 7   CustomerName_NAR   13729 non-null  object
 8   ats_owner          13729 non-null  object
 9   email_address_ats  13735 non-null  object
 10  function_code      17559 non-null  object
 11  ae_owner           13729 non-null  object
 12  GEO                13729 non-null  object
dtypes: object(13)
memory usage: 1.7+ MB


In [30]:
from rapidfuzz import process, fuzz
import pandas as pd

# Ensure both columns are strings and stripped
data['Employee Name'] = data['Employee Name'].astype(str).str.strip()
data['Citrix Email Address'] = data['Citrix Email Address'].astype(str).str.strip()
PLURAL['ats_owner'] = PLURAL['ats_owner'].astype(str).str.strip()

# Create a dictionary mapping Employee Name -> Email (first email if duplicate names exist)
name_to_email = (
    data[['Employee Name', 'Citrix Email Address']]
    .drop_duplicates(subset=['Employee Name'])
    .set_index('Employee Name')['Citrix Email Address']
    .to_dict()
)

# Fuzzy match function
def fuzzy_match(name, choices, score_cutoff=85):
    if pd.isna(name) or name.strip() == "":
        return None
    match = process.extractOne(name, choices, scorer=fuzz.WRatio)
    if match and match[1] >= score_cutoff:
        return match[0]
    return None

# List of employee names to match against
employee_names = list(name_to_email.keys())

# Apply fuzzy matching
PLURAL['matched_name'] = PLURAL['ats_owner'].apply(lambda x: fuzzy_match(x, employee_names))

# Map matched names to emails
PLURAL['ats_owner_email'] = PLURAL['matched_name'].map(name_to_email)

# Show results
PLURAL[['ats_owner', 'matched_name', 'ats_owner_email']].head(20)

Unnamed: 0,ats_owner,matched_name,ats_owner_email
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,Kuntesh Desai,Kuntesh Desai,Kuntesh.Desai@citrix.com
7,Roy Tokeshi,Roy Tokeshi,roy.tokeshi@citrix.com
8,Roy Tokeshi,Roy Tokeshi,roy.tokeshi@citrix.com
9,,,


In [31]:
# Show sample of rows where email was successfully matched
matched_rows = PLURAL[PLURAL['ats_owner_email'].notna()][['ats_owner', 'matched_name', 'ats_owner_email']].head(20)
matched_rows

Unnamed: 0,ats_owner,matched_name,ats_owner_email
6,Kuntesh Desai,Kuntesh Desai,Kuntesh.Desai@citrix.com
7,Roy Tokeshi,Roy Tokeshi,roy.tokeshi@citrix.com
8,Roy Tokeshi,Roy Tokeshi,roy.tokeshi@citrix.com
18,Thomas Hamann,Thomas Hamann,thomas.hamann@citrix.com
19,Thomas Hamann,Thomas Hamann,thomas.hamann@citrix.com
21,Daniele Vianini,Daniele Vianini,Daniele.Vianini@citrix.com
23,Daniele Vianini,Daniele Vianini,Daniele.Vianini@citrix.com
24,Daniele Vianini,Daniele Vianini,Daniele.Vianini@citrix.com
53,Nanno Ouwehand,Nanno Ouwehand,nanno.ouwehand@citrix.com
56,Rick Roetenberg,Rick Roetenberg,Rick.Roetenberg@citrix.com


In [32]:
# Only remove the helper column, keep order
if 'matched_name' in PLURAL.columns:
    PLURAL = PLURAL.drop(columns=['matched_name'])

# Show sample (to confirm alignment and blanks preserved)
PLURAL[['ats_owner', 'ats_owner_email']].head(20)

Unnamed: 0,ats_owner,ats_owner_email
0,,
1,,
2,,
3,,
4,,
5,,
6,Kuntesh Desai,Kuntesh.Desai@citrix.com
7,Roy Tokeshi,roy.tokeshi@citrix.com
8,Roy Tokeshi,roy.tokeshi@citrix.com
9,,


In [34]:
output_path = '/Applications/WorkDataSets/DataStore/PLURAL_enriched_for_paste.csv'
PLURAL.to_csv(output_path, index=False)
print(f"✅ File saved with preserved order and blanks: {output_path}")

✅ File saved with preserved order and blanks: /Applications/WorkDataSets/DataStore/PLURAL_enriched_for_paste.csv


In [35]:
import re
import pandas as pd
from rapidfuzz import process, fuzz

# --- Normalize names to improve matching ---
def normalize_name(s: str) -> str:
    if pd.isna(s): 
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[^\w\s]", " ", s)  # remove punctuation
    s = re.sub(r"\b[a-z]\b", " ", s)  # remove single-letter middle initials
    s = re.sub(r"\s+", " ", s)  # collapse whitespace
    return s.strip()

# Normalize both datasets
data['Employee Name'] = data['Employee Name'].astype(str).str.strip()
data['Citrix Email Address'] = data['Citrix Email Address'].astype(str).str.strip()
data['name_norm'] = data['Employee Name'].apply(normalize_name)

PLURAL['ats_owner'] = PLURAL['ats_owner'].astype(str).str.strip()
PLURAL['ats_norm'] = PLURAL['ats_owner'].apply(normalize_name)

# Build a dictionary of normalized name -> email
name_to_email = (
    data[['name_norm', 'Citrix Email Address']]
    .drop_duplicates(subset=['name_norm'])
    .set_index('name_norm')['Citrix Email Address']
    .to_dict()
)

# Exact matches first
PLURAL['email_exact'] = PLURAL['ats_norm'].map(name_to_email)

# Fuzzy match fallback
choices = list(name_to_email.keys())

def fuzzy_lookup(norm_name: str, choices, cutoff=87):
    if norm_name == "":
        return None
    match = process.extractOne(norm_name, choices, scorer=fuzz.WRatio)
    if match and match[1] >= cutoff:
        return name_to_email.get(match[0])
    return None

mask = PLURAL['email_exact'].isna()
PLURAL.loc[mask, 'email_fuzzy'] = PLURAL.loc[mask, 'ats_norm'].apply(lambda x: fuzzy_lookup(x, choices))

# Final aligned email column
PLURAL['corrected_email'] = PLURAL['email_exact'].fillna(PLURAL['email_fuzzy'])

# Show aligned result
PLURAL[['ats_owner', 'corrected_email']].head(20)

Unnamed: 0,ats_owner,corrected_email
0,,
1,,
2,,
3,,
4,,
5,,
6,Kuntesh Desai,Kuntesh.Desai@citrix.com
7,Roy Tokeshi,roy.tokeshi@citrix.com
8,Roy Tokeshi,roy.tokeshi@citrix.com
9,,


In [37]:
# Keep only relevant columns in final order
final_output = PLURAL[['ats_owner', 'corrected_email']].copy()

# Define export path
output_path = '/Applications/WorkDataSets/DataStore/PLURAL_corrected_emails.csv'

# Export to CSV
final_output.to_csv(output_path, index=False)

print(f"✅ Export complete! File saved to:\n{output_path}")

✅ Export complete! File saved to:
/Applications/WorkDataSets/DataStore/PLURAL_corrected_emails.csv


In [38]:
import re
from rapidfuzz import process, fuzz

# Step 1: Normalize ATS owner name
def normalize_name(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[^\w\s]", " ", s)  # remove punctuation
    s = re.sub(r"\b[a-z]\b", " ", s)  # remove single initials
    s = re.sub(r"\s+", " ", s)
    return s.strip()

# Step 2: Extract name from email
def name_from_email(email: str) -> str:
    if pd.isna(email):
        return ""
    # Only take portion before @
    local = email.split("@")[0]
    # Replace ., _, - with spaces
    local = re.sub(r"[._\-]", " ", local)
    return normalize_name(local)

# Create normalized columns
PLURAL['ats_norm'] = PLURAL['ats_owner'].apply(normalize_name)
PLURAL['email_norm'] = PLURAL['email_address_ats'].apply(name_from_email)

# Step 3: Build mapping by finding best fuzzy match of ATS name to email-derived name
name_to_email = {}
email_source = PLURAL[['email_norm', 'email_address_ats']].dropna().drop_duplicates()

email_choices = email_source['email_norm'].unique()

for ats_name in PLURAL['ats_norm'].unique():
    if ats_name:
        match = process.extractOne(ats_name, email_choices, scorer=fuzz.WRatio)
        if match and match[1] >= 85:  # threshold can be adjusted
            matched_email = email_source[email_source['email_norm'] == match[0]]['email_address_ats'].iloc[0]
            name_to_email[ats_name] = matched_email

# Step 4: Apply mapping back to original PLURAL frame
PLURAL['corrected_email'] = PLURAL['ats_norm'].map(name_to_email)

# Step 5: Preserve full dataset and export
output_path = '/Applications/WorkDataSets/DataStore/PLURAL_realigned.csv'
PLURAL.to_csv(output_path, index=False)

print(f"✅ Done! File saved at: {output_path}")
print(f"Matched {PLURAL['corrected_email'].notna().sum()} out of {len(PLURAL)} rows.")

✅ Done! File saved at: /Applications/WorkDataSets/DataStore/PLURAL_realigned.csv
Matched 10180 out of 17559 rows.
