In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt

df = pd.read_csv('/Applications/WorkDataSets/DataStore/Demandbase CDP 4.0/DemandbaseDataCDP2.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477720 entries, 0 to 477719
Data columns (total 21 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             477720 non-null  int64  
 1   Account Name           477720 non-null  object 
 2   First Name             5628 non-null    object 
 3   Last Name              5630 non-null    object 
 4   Title                  0 non-null       float64
 5   Buying Role            3893 non-null    object 
 6   Type                   477720 non-null  object 
 7   Details                477720 non-null  object 
 8   Engagement Minutes     477720 non-null  float64
 9   Activity Date          477720 non-null  object 
 10  Job Title              4042 non-null    object 
 11  Citrix Events Opt-Out  0 non-null       float64
 12  Ispartner              5630 non-null    object 
 13  Account Number         0 non-null       float64
 14  Party Number           5469 non-null

In [4]:
df2 = pd.read_csv('/Applications/WorkDataSets/Most Recent Updates.csv')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18631 entries, 0 to 18630
Data columns (total 8 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   Account Territory Account Number / Name (AR)  18631 non-null  object
 1   Account Territory Customer ID / Name          18631 non-null  object
 2   Tech Plays Tech Play Category                 18631 non-null  object
 3   Tech Plays Tech Play Type Label               18631 non-null  object
 4   Tech Play Status History New Value            18631 non-null  object
 5   Tech Play Status History Old Value            15171 non-null  object
 6   Tech Play Status History Date Start Date      18631 non-null  object
 7   Tech Play Status History Days In Status       18631 non-null  object
dtypes: object(8)
memory usage: 1.1+ MB


In [29]:
import pandas as pd
import re

# Step 1: Load both datasets fresh
print("=== STEP 1: Loading Datasets ===")
df = pd.read_csv('/Applications/WorkDataSets/DataStore/Demandbase CDP 4.0/DemandbaseDataCDP2.csv')
df2 = pd.read_csv('/Applications/WorkDataSets/Most Recent Updates.csv')

print(f"df (contacts): {df.shape}")
print(f"df2 (tech plays): {df2.shape}")

# Step 2: Find all F5 tech plays in df2
print("\n=== STEP 2: Finding F5 Tech Plays in df2 ===")

# Look for F5 in tech play columns
f5_plays = df2[
    (df2['Tech Plays Tech Play Type Label'].str.contains('F5', case=False, na=False)) |
    (df2['Tech Plays Tech Play Category'].str.contains('F5', case=False, na=False))
]

print(f"F5 tech play records found: {len(f5_plays)}")

# Step 3: Extract customer IDs from F5 plays  
print("\n=== STEP 3: Extracting Customer IDs from F5 Plays ===")

# Parse the Account Territory Customer ID / Name to get IDs
def extract_customer_id(account_string):
    if pd.isna(account_string):
        return None
    account_string = str(account_string).strip()
    # Pattern: H-CIT-[number] / [name]
    match = re.search(r'H-CIT-(\d+)', account_string)
    if match:
        return match.group(1)
    return None

f5_plays['Customer_ID'] = f5_plays['Account Territory Customer ID / Name'].apply(extract_customer_id)
f5_customer_ids = set(f5_plays['Customer_ID'].dropna())

print(f"Unique F5 customer IDs extracted: {len(f5_customer_ids)}")
print(f"Sample F5 customer IDs: {list(f5_customer_ids)[:10]}")

# Step 4: Find matching activities in df using those IDs
print("\n=== STEP 4: Finding Matching Activities in df ===")

# Parse Account Name in df to get customer IDs for matching
df['Customer_ID'] = df['Account Name'].apply(extract_customer_id)
df_customer_ids = set(df['Customer_ID'].dropna())

print(f"Customer IDs available in df: {len(df_customer_ids)}")

# Find matches
matching_ids = f5_customer_ids.intersection(df_customer_ids)
print(f"Matching customer IDs: {len(matching_ids)}")

if len(matching_ids) > 0:
    # Filter df for F5 account activities
    f5_activities = df[df['Customer_ID'].isin(matching_ids)]
    
    print(f"F5 account activities found: {len(f5_activities)}")
    print(f"Unique F5 accounts: {len(f5_activities['Account Name'].unique())}")
    
    # Show sample
    print(f"\nSample F5 activities:")
    sample_cols = ['Account Name', 'First Name', 'Last Name', 'Job Title']
    print(f5_activities[sample_cols].head(10))

else:
    print("No matching customer IDs found - let's check the formats...")
    print(f"Sample Account Names in df: {df['Account Name'].dropna().head(5).tolist()}")
    print(f"Sample Account Names in df2: {f5_plays['Account Territory Customer ID / Name'].dropna().head(5).tolist()}")

=== STEP 1: Loading Datasets ===
df (contacts): (477720, 21)
df2 (tech plays): (18631, 8)

=== STEP 2: Finding F5 Tech Plays in df2 ===
F5 tech play records found: 2651

=== STEP 3: Extracting Customer IDs from F5 Plays ===
Unique F5 customer IDs extracted: 2293
Sample F5 customer IDs: ['52036066', '45742293', '45311894', '45643196', '51409667', '25269600', '14160900', '51925419', '36977900', '50478071']

=== STEP 4: Finding Matching Activities in df ===
Customer IDs available in df: 0
Matching customer IDs: 0
No matching customer IDs found - let's check the formats...
Sample Account Names in df: ['Defense Logistics Agency', 'State Street Bank & Trust', 'Retail Business Services, LLC', 'The Christ Hospital', 'SoCura GmbH EU']
Sample Account Names in df2: ["H-CIT-46557659 / The Queen's Health Systems", 'H-CIT-39861700 / DNB Nor ASA', 'H-CIT-45612028 / Fred Hutchinson Cancer Center', 'H-CIT-48365577 / Bundesagentur f√ºr Arbeit', "H-CIT-45203307 / Seattle Children's Healthcare System"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f5_plays['Customer_ID'] = f5_plays['Account Territory Customer ID / Name'].apply(extract_customer_id)


In [34]:
# Step 3 (Fixed): Parse BOTH account columns in df2 to extract customer IDs
print("\n=== STEP 3 (FIXED): Parsing Both Account Columns in df2 ===")

def extract_customer_id_enhanced(account_string):
    if pd.isna(account_string):
        return None
    account_string = str(account_string).strip()
    # Pattern: H-CIT-[number] / [name] or just [number]
    match = re.search(r'H-CIT-(\d+)', account_string)
    if match:
        return match.group(1)
    # Try to find standalone numbers
    match = re.search(r'(\d{7,})', account_string)  # 7+ digits
    if match:
        return match.group(1)
    return None

# Parse both account columns
print("Parsing 'Account Territory Account Number / Name (AR)'...")
f5_plays['Customer_ID_Col0'] = f5_plays['Account Territory Account Number / Name (AR)'].apply(extract_customer_id_enhanced)

print("Parsing 'Account Territory Customer ID / Name'...")  
f5_plays['Customer_ID_Col1'] = f5_plays['Account Territory Customer ID / Name'].apply(extract_customer_id_enhanced)

# Combine IDs from both columns
all_f5_ids = set()
all_f5_ids.update(f5_plays['Customer_ID_Col0'].dropna())
all_f5_ids.update(f5_plays['Customer_ID_Col1'].dropna())

print(f"Total unique F5 customer IDs from both columns: {len(all_f5_ids)}")
print(f"Sample combined F5 customer IDs: {list(all_f5_ids)[:10]}")

# Step 4 (Updated): Check df formats and try multiple matching approaches
print("\n=== STEP 4 (UPDATED): Enhanced Matching with df ===")

# Check what format df Account Names have
print("Sample Account Names in df:")
sample_df_accounts = df['Account Name'].dropna().head(10)
for i, name in enumerate(sample_df_accounts):
    print(f"  {i}: {name}")

# Try multiple parsing approaches for df
df['Customer_ID_Method1'] = df['Account Name'].apply(extract_customer_id_enhanced)

# Also try parsing any numeric codes that might be embedded
def find_any_numeric_id(text):
    if pd.isna(text):
        return None
    # Look for any sequence of 7+ digits
    matches = re.findall(r'\d{7,}', str(text))
    return matches[0] if matches else None

df['Customer_ID_Method2'] = df['Account Name'].apply(find_any_numeric_id)

# Check what we got
method1_ids = set(df['Customer_ID_Method1'].dropna())
method2_ids = set(df['Customer_ID_Method2'].dropna())

print(f"Customer IDs found in df (Method 1): {len(method1_ids)}")
print(f"Customer IDs found in df (Method 2): {len(method2_ids)}")

# Try matching with both methods
matches1 = all_f5_ids.intersection(method1_ids)
matches2 = all_f5_ids.intersection(method2_ids)

print(f"Matches with Method 1: {len(matches1)}")
print(f"Matches with Method 2: {len(matches2)}")

all_matches = matches1.union(matches2)
print(f"Total unique matches: {len(all_matches)}")

if len(all_matches) > 0:
    print(f"Sample matching IDs: {list(all_matches)[:10]}")
    
    # Get F5 activities using all matching methods
    f5_activities = df[
        (df['Customer_ID_Method1'].isin(all_matches)) |
        (df['Customer_ID_Method2'].isin(all_matches))
    ]
    
    print(f"\nF5 account activities found: {len(f5_activities)}")
    
else:
    print("Still no matches. Let's examine the data more closely...")
    print(f"\nF5 IDs sample: {list(all_f5_ids)[:5]}")
    print(f"DF Method1 IDs sample: {list(method1_ids)[:5] if method1_ids else 'None'}")
    print(f"DF Method2 IDs sample: {list(method2_ids)[:5] if method2_ids else 'None'}")


=== STEP 3 (FIXED): Parsing Both Account Columns in df2 ===
Parsing 'Account Territory Account Number / Name (AR)'...
Parsing 'Account Territory Customer ID / Name'...
Total unique F5 customer IDs from both columns: 2621
Sample combined F5 customer IDs: ['52036066', '45742293', '45311894', '45643196', '48142562', '51409667', '25269600', '14160900', '51925419', '36977900']

=== STEP 4 (UPDATED): Enhanced Matching with df ===
Sample Account Names in df:
  0: Defense Logistics Agency
  1: State Street Bank & Trust
  2: Retail Business Services, LLC
  3: The Christ Hospital
  4: SoCura GmbH EU
  5: State Street Bank & Trust
  6: CDW Corporation
  7: Defense Logistics Agency
  8: Defense Logistics Agency
  9: The Christ Hospital


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f5_plays['Customer_ID_Col0'] = f5_plays['Account Territory Account Number / Name (AR)'].apply(extract_customer_id_enhanced)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f5_plays['Customer_ID_Col1'] = f5_plays['Account Territory Customer ID / Name'].apply(extract_customer_id_enhanced)


Customer IDs found in df (Method 1): 0
Customer IDs found in df (Method 2): 0
Matches with Method 1: 0
Matches with Method 2: 0
Total unique matches: 0
Still no matches. Let's examine the data more closely...

F5 IDs sample: ['52036066', '45742293', '45311894', '45643196', '48142562']
DF Method1 IDs sample: None
DF Method2 IDs sample: None


In [35]:
# Step 4 (FIXED): Normalize account numbers in df dataset
print("\n=== STEP 4 (FIXED): Finding and Normalizing Account Numbers in df ===")

# Check all columns in df that might contain account numbers
print("Checking all columns in df for potential account numbers...")
potential_id_columns = []

for col in df.columns:
    if any(keyword in col.lower() for keyword in ['account', 'number', 'id', 'customer']):
        potential_id_columns.append(col)

print(f"Potential ID columns in df: {potential_id_columns}")

# Function to normalize account numbers (remove letters, dashes, keep only digits)
def normalize_account_number(text):
    if pd.isna(text):
        return None
    # Convert to string and remove all non-digits
    normalized = re.sub(r'[^\d]', '', str(text))
    # Return only if it's a reasonable length (6+ digits)
    return normalized if len(normalized) >= 6 else None

# Apply normalization to potential ID columns
normalized_ids = set()

for col in potential_id_columns:
    print(f"\nChecking column: {col}")
    if col in df.columns:
        # Show sample values
        sample_vals = df[col].dropna().head(10)
        print(f"  Sample values: {sample_vals.tolist()}")
        
        # Normalize
        df[f'{col}_normalized'] = df[col].apply(normalize_account_number)
        col_ids = set(df[f'{col}_normalized'].dropna())
        
        print(f"  Normalized IDs found: {len(col_ids)}")
        if col_ids:
            print(f"  Sample normalized: {list(col_ids)[:5]}")
            normalized_ids.update(col_ids)

print(f"\nTotal normalized IDs from df: {len(normalized_ids)}")

# Also check if there are any embedded numbers in Account Name
print(f"\nAlso checking for embedded numbers in Account Name...")
df['Account_Name_numbers'] = df['Account Name'].apply(normalize_account_number)
account_name_ids = set(df['Account_Name_numbers'].dropna())
print(f"IDs from Account Name: {len(account_name_ids)}")

# Combine all normalized IDs
all_df_ids = normalized_ids.union(account_name_ids)
print(f"Total unique normalized IDs in df: {len(all_df_ids)}")

# Now try matching with F5 IDs
print(f"\n=== MATCHING ===")
print(f"F5 IDs: {len(all_f5_ids)}")
print(f"DF normalized IDs: {len(all_df_ids)}")

matches = all_f5_ids.intersection(all_df_ids)
print(f"MATCHES FOUND: {len(matches)}")

if len(matches) > 0:
    print(f"Matching IDs: {list(matches)}")
    
    # Find F5 activities
    mask = pd.Series(False, index=df.index)
    for col in df.columns:
        if col.endswith('_normalized'):
            mask = mask | df[col].isin(matches)
    
    f5_activities = df[mask]
    
    print(f"\n‚úÖ SUCCESS!")
    print(f"F5 account activities found: {len(f5_activities)}")
    print(f"Unique accounts: {len(f5_activities['Account Name'].unique())}")
    
    # Show sample
    sample_cols = ['Account Name', 'First Name', 'Last Name', 'Job Title']
    available_cols = [col for col in sample_cols if col in f5_activities.columns]
    print(f"\nSample F5 activities:")
    print(f5_activities[available_cols].head(10))
    
else:
    print("Still no matches after normalization")
    print(f"F5 sample: {list(all_f5_ids)[:5]}")
    print(f"DF sample: {list(all_df_ids)[:5] if all_df_ids else 'No normalized IDs found'}")


=== STEP 4 (FIXED): Finding and Normalizing Account Numbers in df ===
Checking all columns in df for potential account numbers...
Potential ID columns in df: ['Account Name', 'Account Number', 'Party Number', 'CustomerId_NAR', 'CustomerId_NAR_norm', 'Customer_ID', 'Customer_ID_Method1', 'Customer_ID_Method2']

Checking column: Account Name
  Sample values: ['Defense Logistics Agency', 'State Street Bank & Trust', 'Retail Business Services, LLC', 'The Christ Hospital', 'SoCura GmbH EU', 'State Street Bank & Trust', 'CDW Corporation', 'Defense Logistics Agency', 'Defense Logistics Agency', 'The Christ Hospital']
  Normalized IDs found: 0

Checking column: Account Number
  Sample values: []
  Normalized IDs found: 0

Checking column: Party Number
  Sample values: ['P_0034z00003G8kUFAAZ', 'P_0034z00002wuvMWAAY', 'P_0034z00003GAwNoAAL', 'P_003600000286zciAAA', 'P_0034z00002ocozdAAA', 'P_0034z00002wuvMWAAY', 'P_0034z00003G9mwkAAB', 'P_0034z00003G8kUFAAZ', 'P_0034z00003G8kUFAAZ', 'P_0034z000

In [37]:
# Extract all normalized IDs from df
print("=== EXTRACTING ALL NORMALIZED IDs FROM df ===")

# Combine all normalized ID sets
all_df_normalized_ids = set()

# From Party Number (normalized)
party_ids = set(df['Party Number_normalized'].dropna())
print(f"Party Number normalized IDs: {len(party_ids)}")

# From CustomerId_NAR (normalized) 
customer_nar_ids = set(df['CustomerId_NAR_normalized'].dropna())
print(f"CustomerId_NAR normalized IDs: {len(customer_nar_ids)}")

# From CustomerId_NAR_norm (already clean)
customer_nar_norm_ids = set(df['CustomerId_NAR_norm'].dropna().astype(str))
print(f"CustomerId_NAR_norm IDs: {len(customer_nar_norm_ids)}")

# Combine all
all_df_normalized_ids = party_ids.union(customer_nar_ids).union(customer_nar_norm_ids)
print(f"Total unique normalized IDs in df: {len(all_df_normalized_ids)}")

# Now match with F5 IDs
print(f"\n=== MATCHING F5 IDs WITH df IDs ===")
print(f"F5 customer IDs: {len(all_f5_ids)}")
print(f"DF normalized IDs: {len(all_df_normalized_ids)}")

# Convert F5 IDs to string for consistent matching
all_f5_ids_str = {str(id) for id in all_f5_ids}

matches = all_f5_ids_str.intersection(all_df_normalized_ids)
print(f"üéØ MATCHES FOUND: {len(matches)}")

if len(matches) > 0:
    print(f"Matching IDs: {sorted(list(matches))}")
    
    # Create filter mask for F5 activities
    f5_mask = (
        (df['Party Number_normalized'].isin(matches)) |
        (df['CustomerId_NAR_normalized'].isin(matches)) |
        (df['CustomerId_NAR_norm'].astype(str).isin(matches))
    )
    
    f5_activities = df[f5_mask]
    
    print(f"\n‚úÖ SUCCESS! F5 COMPETITIVE OPPORTUNITIES FOUND!")
    print(f"F5 account activities: {len(f5_activities)}")
    print(f"Unique F5 accounts: {len(f5_activities['Account Name'].unique())}")
    
    # Filter for records with contact names
    f5_contacts = f5_activities[
        (f5_activities['First Name'].notna()) & 
        (f5_activities['Last Name'].notna()) &
        (f5_activities['First Name'] != '') & 
        (f5_activities['Last Name'] != '')
    ]
    
    print(f"F5 activities with contact names: {len(f5_contacts)}")
    
    # Save the results
    output_dir = '/Applications/WorkDataSets/DataStore/'
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    filename = 'F5_competitive_opportunities_FINAL.csv'
    filepath = os.path.join(output_dir, filename)
    f5_contacts.to_csv(filepath, index=False)
    
    print(f"\nüìÅ Saved: {filename}")
    print(f"üìä Records: {len(f5_contacts)}")
    print(f"üè¢ Organizations: {len(f5_contacts['Account Name'].unique())}")
    
    # Show sample
    sample_cols = ['Account Name', 'First Name', 'Last Name', 'Job Title', 'Buying Role']
    print(f"\nüë§ Sample F5 competitive contacts:")
    print(f5_contacts[sample_cols].head(10))
    
    # Show top accounts
    print(f"\nüè¢ Top F5 accounts by activity:")
    print(f5_contacts['Account Name'].value_counts().head(15))

else:
    print("No matches found")
    print(f"F5 sample: {list(all_f5_ids_str)[:10]}")
    print(f"DF sample: {list(all_df_normalized_ids)[:10]}")

=== EXTRACTING ALL NORMALIZED IDs FROM df ===
Party Number normalized IDs: 333
CustomerId_NAR normalized IDs: 428
CustomerId_NAR_norm IDs: 450
Total unique normalized IDs in df: 783

=== MATCHING F5 IDs WITH df IDs ===
F5 customer IDs: 2621
DF normalized IDs: 783
üéØ MATCHES FOUND: 447
Matching IDs: ['107046', '107301', '110108', '119097', '12476500', '12554900', '13781200', '14100600', '14800500', '16198300', '16204100', '16204900', '16288300', '16480400', '16561500', '16566400', '1680', '18295600', '18345700', '20504100', '20552900', '20807800', '23140200', '23799400', '24729900', '26267500', '26853600', '27171300', '28772900', '29291600', '29379000', '29505700', '30000200', '31175800', '32887100', '34424100', '35373700', '35817200', '36448200', '36853200', '36977900', '37376900', '37885600', '38048600', '38312400', '38563400', '38630200', '38712500', '38765600', '38849700', '38914200', '39329100', '39438100', '39598900', '39731600', '40367', '40690200', '40827800', '41267', '416206

In [38]:
# Get F5 stage information from the original tech plays
print("=== F5 ACCOUNT STAGES ANALYSIS ===")

# Show what stage/status columns we have in f5_plays
stage_columns = [col for col in f5_plays.columns if 'status' in col.lower() or 'stage' in col.lower()]
print(f"Stage/Status columns available: {stage_columns}")

# Look at the current status values
if 'Tech Play Status History New Value' in f5_plays.columns:
    current_statuses = f5_plays['Tech Play Status History New Value'].value_counts()
    print(f"\nüìä Current F5 Account Stages:")
    print(current_statuses)

# Get organization names and their stages
f5_accounts_with_stages = f5_plays[[
    'Account Territory Customer ID / Name',
    'Customer_ID_Col0', 'Customer_ID_Col1',
    'Tech Plays Tech Play Category',
    'Tech Plays Tech Play Type Label',
    'Tech Play Status History New Value',
    'Tech Play Status History Old Value'
]].copy()

# Parse organization names from the account territory field
def extract_org_name(account_string):
    if pd.isna(account_string):
        return None
    account_string = str(account_string).strip()
    # Pattern: H-CIT-[number] / [organization name]
    match = re.search(r'H-CIT-\d+\s*/\s*(.+)', account_string)
    if match:
        return match.group(1).strip()
    return account_string

f5_accounts_with_stages['Organization_Name'] = f5_accounts_with_stages['Account Territory Customer ID / Name'].apply(extract_org_name)

# Show stages by organization
print(f"\nüè¢ F5 Account Stages by Organization:")
stage_summary = f5_accounts_with_stages.groupby(['Organization_Name', 'Tech Play Status History New Value']).size().unstack(fill_value=0)
print(stage_summary.head(15))

# Create a summary of accounts by stage
print(f"\nüìã F5 Accounts Summary by Stage:")
stage_counts = f5_accounts_with_stages.groupby('Tech Play Status History New Value')['Organization_Name'].nunique()
print(stage_counts)

# Save the stage information
output_dir = '/Applications/WorkDataSets/DataSets/DataStore/'
stage_filename = 'F5_accounts_by_stage.csv'
stage_filepath = os.path.join(output_dir, stage_filename)

f5_accounts_with_stages.to_csv(stage_filepath, index=False)

print(f"\nüíæ Saved F5 account stage information:")
print(f"üìÅ Filename: {stage_filename}")
print(f"üìä Records: {len(f5_accounts_with_stages)}")

# Show sample of stages
print(f"\nüìã Sample F5 account stages:")
sample_cols = ['Organization_Name', 'Tech Plays Tech Play Type Label', 'Tech Play Status History New Value']
print(f5_accounts_with_stages[sample_cols].drop_duplicates().head(15))

# Create a master list of F5 accounts with stage and contact count
print(f"\nüéØ Master F5 Account List with Stages and Contact Counts:")

# Get contact counts per account from our F5 activities
f5_contact_counts = f5_contacts['Account Name'].value_counts()

# Match organizations between datasets and create summary
f5_master_list = []
for org in f5_accounts_with_stages['Organization_Name'].unique():
    if pd.notna(org):
        # Get stage for this org
        org_stages = f5_accounts_with_stages[f5_accounts_with_stages['Organization_Name'] == org]['Tech Play Status History New Value'].unique()
        stage = ', '.join([str(s) for s in org_stages if pd.notna(s)])
        
        # Get contact count (try exact match first, then partial)
        contact_count = 0
        if org in f5_contact_counts:
            contact_count = f5_contact_counts[org]
        else:
            # Try partial matching
            for account_name in f5_contact_counts.index:
                if org.lower() in account_name.lower() or account_name.lower() in org.lower():
                    contact_count += f5_contact_counts[account_name]
        
        f5_master_list.append({
            'Organization': org,
            'Stage': stage,
            'Contact_Count': contact_count,
            'Has_Contacts': 'Yes' if contact_count > 0 else 'No'
        })

# Convert to DataFrame and save
f5_master_df = pd.DataFrame(f5_master_list)
f5_master_df = f5_master_df.sort_values('Contact_Count', ascending=False)

master_filename = 'F5_master_account_list_with_stages.csv'
master_filepath = os.path.join(output_dir, master_filename)
f5_master_df.to_csv(master_filepath, index=False)

print(f"\nüíæ Saved F5 Master Account List:")
print(f"üìÅ Filename: {master_filename}")
print(f"üìä Organizations: {len(f5_master_df)}")

print(f"\nüèÜ Top F5 Opportunities (with contacts and stages):")
print(f5_master_df[f5_master_df['Contact_Count'] > 0].head(20))

=== F5 ACCOUNT STAGES ANALYSIS ===
Stage/Status columns available: ['Tech Play Status History New Value', 'Tech Play Status History Old Value', 'Tech Play Status History Date Start Date', 'Tech Play Status History Days In Status']

üìä Current F5 Account Stages:
Tech Play Status History New Value
Play does not Apply               800
Qualification                     575
Rejected - Customer Rejection     339
Non-Dispositioned                 232
Customer - Deferred to Sy26       171
Presentation                      170
Customer Deferred to SY26         109
On-Hold - Need Exec Engagement     81
Rejected                           60
Proof of Concept                   41
Initial Adoption                   37
Mass Rollout                       25
Rejected - Solution Failure         9
On-Hold - Waiting on RFE            2
Name: count, dtype: int64

üè¢ F5 Account Stages by Organization:
Tech Play Status History New Value            Customer - Deferred to Sy26  \
Organization_Name        

In [40]:
# Create Account Name + Stage mapping
print("=== F5 ACCOUNT NAMES WITH THEIR STAGES ===")

# Get unique F5 account names from our contacts dataset
f5_account_names = f5_contacts['Account Name'].unique()
print(f"F5 accounts with contacts: {len(f5_account_names)}")

# Create mapping of account name to stage
account_stage_mapping = {}

for account_name in f5_account_names:
    # Try to find matching organization in the stage data
    account_stage = "Stage Unknown"
    
    # First try exact match
    matching_orgs = f5_accounts_with_stages[
        f5_accounts_with_stages['Organization_Name'] == account_name
    ]
    
    # If no exact match, try partial matching
    if len(matching_orgs) == 0:
        for org_name in f5_accounts_with_stages['Organization_Name'].dropna():
            if (account_name.lower() in org_name.lower()) or (org_name.lower() in account_name.lower()):
                matching_orgs = f5_accounts_with_stages[
                    f5_accounts_with_stages['Organization_Name'] == org_name
                ]
                break
    
    # Get the stage
    if len(matching_orgs) > 0:
        stages = matching_orgs['Tech Play Status History New Value'].dropna().unique()
        if len(stages) > 0:
            account_stage = ', '.join([str(s) for s in stages])
    
    account_stage_mapping[account_name] = account_stage

# Create clean output
print(f"\nüìã F5 ACCOUNT NAMES AND THEIR STAGES:")
print("-" * 100)
print(f"{'ACCOUNT NAME':<50} | {'STAGE':<30} | {'CONTACTS'}")
print("-" * 100)

# Get contact counts
contact_counts = f5_contacts['Account Name'].value_counts()

for account_name in sorted(account_stage_mapping.keys()):
    stage = account_stage_mapping[account_name]
    contact_count = contact_counts.get(account_name, 0)
    print(f"{account_name:<50} | {stage:<30} | {contact_count}")

# Also create a DataFrame for easy export
account_stage_df = pd.DataFrame([
    {
        'Account_Name': account_name,
        'Stage': stage,
        'Contact_Count': contact_counts.get(account_name, 0)
    }
    for account_name, stage in account_stage_mapping.items()
])

# Sort by contact count (highest first)
account_stage_df = account_stage_df.sort_values('Contact_Count', ascending=False)

# Save it
output_dir = '/Applications/WorkDataSets/DataSets/DataStore/'
stage_mapping_filename = 'F5_account_names_with_stages.csv'
stage_mapping_filepath = os.path.join(output_dir, stage_mapping_filename)

account_stage_df.to_csv(stage_mapping_filepath, index=False)

print(f"\nüíæ Saved account-stage mapping:")
print(f"üìÅ Filename: {stage_mapping_filename}")
print(f"üìä Accounts: {len(account_stage_df)}")

# Show summary by stage
print(f"\nüìä SUMMARY BY STAGE:")
stage_summary = account_stage_df['Stage'].value_counts()
for stage, count in stage_summary.items():
    print(f"{stage}: {count} accounts")

print(f"\nüèÜ TOP F5 ACCOUNTS (by contact count):")
print(account_stage_df.head(15).to_string(index=False))

=== F5 ACCOUNT NAMES WITH THEIR STAGES ===
F5 accounts with contacts: 240

üìã F5 ACCOUNT NAMES AND THEIR STAGES:
----------------------------------------------------------------------------------------------------
ACCOUNT NAME                                       | STAGE                          | CONTACTS
----------------------------------------------------------------------------------------------------
3M Company                                         | Stage Unknown                  | 28
50Hertz Transmission GmbH                          | Presentation                   | 7
ADT US                                             | Play does not Apply            | 14
AMAP : Association de Moyens Assurance des Personnes | Stage Unknown                  | 4
AbbVie Inc                                         | Rejected - Customer Rejection  | 10
Abbott Laboratories                                | Stage Unknown                  | 12
Adani Enterprises Ltd.                             | S

In [42]:
# Create and display the F5 account-stage CSV
print("=== F5 ACCOUNTS WITH STAGES - CSV FORMAT ===")

# Create the DataFrame with account names and stages
account_stage_df = pd.DataFrame([
    {
        'Account_Name': account_name,
        'Stage': stage,
        'Contact_Count': contact_counts.get(account_name, 0)
    }
    for account_name, stage in account_stage_mapping.items()
])

# Sort by contact count (highest first)
account_stage_df = account_stage_df.sort_values('Contact_Count', ascending=False)

# Print CSV format
print("Account_Name,Stage,Contact_Count")
for _, row in account_stage_df.iterrows():
    # Clean any commas in account names or stages for proper CSV format
    account_clean = str(row['Account_Name']).replace(',', ';')
    stage_clean = str(row['Stage']).replace(',', ';')
    print(f'"{account_clean}","{stage_clean}",{row["Contact_Count"]}')

# Save as CSV file
output_dir = '/Applications/WorkDataSets/DataStore/'
import os
os.makedirs(output_dir, exist_ok=True)

csv_filename = 'F5_account_stages.csv'
csv_filepath = os.path.join(output_dir, csv_filename)

account_stage_df.to_csv(csv_filepath, index=False)

print(f"\nüíæ CSV SAVED:")
print(f"üìÅ File: {csv_filename}")
print(f"üìç Path: {csv_filepath}")
print(f"üìä Total F5 accounts: {len(account_stage_df)}")
print(f"üè¢ Accounts with contacts: {len(account_stage_df[account_stage_df['Contact_Count'] > 0])}")

# Quick stats
print(f"\nüìà QUICK STATS:")
print(f"Total F5 accounts: {len(account_stage_df)}")
print(f"Total contacts across all accounts: {account_stage_df['Contact_Count'].sum()}")
print(f"Average contacts per account: {account_stage_df['Contact_Count'].mean():.1f}")

=== F5 ACCOUNTS WITH STAGES - CSV FORMAT ===
Account_Name,Stage,Contact_Count
"Defense Logistics Agency","Qualification",1255
"The Christ Hospital","Customer - Deferred to Sy26",938
"State Street Bank & Trust","Stage Unknown",198
"Benevis LLC","Stage Unknown",161
"IT-Dienstleistungszentrum Berlin","Stage Unknown",136
"Banco Bilbao Vizcaya Argentaria S.A.","Stage Unknown",94
"Genpact India Pvt. Ltd","Play does not Apply",89
"COMPUTACENTER PLC","Rejected - Customer Rejection",87
"Kyndryl/Care New England","Stage Unknown",87
"John Muir Medical Center","Customer Deferred to SY26",86
"AstraZeneca UK Non- R & D","Presentation",63
"Airbus Canada Limited Partnership","Customer - Deferred to Sy26",61
"City of Toronto","Qualification",57
"PSNI","On-Hold - Need Exec Engagement",53
"General Dynamics Information Technology; Inc.","Stage Unknown",44
"Statens vegvesen","Customer - Deferred to Sy26",44
"ITV Services Limited","Qualification",44
"Nationwide Insurance Allied Network Services","Stage Unkn