In [11]:
# Core
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning / Stats
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [13]:

import glob
import os

# Your directory
csv_directory = "/Applications/WorkDataSets/DataStore/Demandbase CDP 4.0/"

# Get all CSV files in the directory
csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))

print(f"üîç Found {len(csv_files)} CSV files:")
for file in csv_files:
    print(f"  - {os.path.basename(file)}")

if len(csv_files) == 0:
    print("‚ùå No CSV files found in the directory!")
else:
    # Combine all CSV files
    combined_data = []
    
    for file in csv_files:
        print(f"\nüìñ Loading: {os.path.basename(file)}")
        df = pd.read_csv(file)
        
        # Optional: Add source file column to track where each row came from
        df['source_file'] = os.path.basename(file)
        
        combined_data.append(df)
        print(f"   ‚úÖ Loaded {len(df):,} rows")
    
    # Concatenate all dataframes
    final_df = pd.concat(combined_data, ignore_index=True)
    
    # Results summary
    print(f"\nüéâ COMBINATION COMPLETE!")
    print(f"   Files combined: {len(csv_files)}")
    print(f"   Total rows: {len(final_df):,}")
    print(f"   Total columns: {len(final_df.columns)}")
    
    # Show breakdown by source file
    print(f"\nüìä Rows per file:")
    source_counts = final_df['source_file'].value_counts()
    for filename, count in source_counts.items():
        print(f"   {filename}: {count:,} rows")
    
    # Save the combined file
    output_file = "combined_all_data.csv"
    final_df.to_csv(output_file, index=False)
    
    print(f"\nüíæ SUCCESS! Combined file saved as: {output_file}")
    print(f"   Location: {os.path.abspath(output_file)}")

üîç Found 7 CSV files:
  - 251020_ENT_2900__activity (3).csv
  - 251020_ENT_2900__activity (2).csv
  - 251020_ENT_2900__activity (5).csv
  - 251020_ENT_2900__activity.csv
  - 251020_ENT_2900__activity (4).csv
  - 251020_ENT_2900__activity (7).csv
  - 251020_ENT_2900__activity (1).csv

üìñ Loading: 251020_ENT_2900__activity (3).csv
   ‚úÖ Loaded 309,673 rows

üìñ Loading: 251020_ENT_2900__activity (2).csv
   ‚úÖ Loaded 277,658 rows

üìñ Loading: 251020_ENT_2900__activity (5).csv
   ‚úÖ Loaded 387,102 rows

üìñ Loading: 251020_ENT_2900__activity.csv


  df = pd.read_csv(file)


   ‚úÖ Loaded 319,882 rows

üìñ Loading: 251020_ENT_2900__activity (4).csv
   ‚úÖ Loaded 267,546 rows

üìñ Loading: 251020_ENT_2900__activity (7).csv
   ‚úÖ Loaded 258,656 rows

üìñ Loading: 251020_ENT_2900__activity (1).csv
   ‚úÖ Loaded 156,881 rows

üéâ COMBINATION COMPLETE!
   Files combined: 7
   Total rows: 1,977,398
   Total columns: 19

üìä Rows per file:
   251020_ENT_2900__activity (5).csv: 387,102 rows
   251020_ENT_2900__activity.csv: 319,882 rows
   251020_ENT_2900__activity (3).csv: 309,673 rows
   251020_ENT_2900__activity (2).csv: 277,658 rows
   251020_ENT_2900__activity (4).csv: 267,546 rows
   251020_ENT_2900__activity (7).csv: 258,656 rows
   251020_ENT_2900__activity (1).csv: 156,881 rows

üíæ SUCCESS! Combined file saved as: combined_all_data.csv
   Location: /Users/jacobmarchand/Data Science Templates/combined_all_data.csv


In [14]:
combined_df = pd.read_csv("/Users/jacobmarchand/Data Science Templates/combined_all_data.csv")
combined_df.info()

  combined_df = pd.read_csv("/Users/jacobmarchand/Data Science Templates/combined_all_data.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1977398 entries, 0 to 1977397
Data columns (total 19 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Account Name           object 
 1   First Name             object 
 2   Last Name              object 
 3   Title                  object 
 4   Buying Role            object 
 5   Type                   object 
 6   Details                object 
 7   Engagement Minutes     float64
 8   Activity Date          object 
 9   Job Title              object 
 10  Citrix Events Opt-Out  float64
 11  Ispartner              object 
 12  Account Number         float64
 13  Party Number           object 
 14  Territory              float64
 15  Fatigue Level          object 
 16  Party Name             float64
 17  CustomerId_NAR         object 
 18  source_file            object 
dtypes: float64(5), object(14)
memory usage: 286.6+ MB


In [15]:
AcD = pd.read_csv('/Applications/WorkDataSets/DataStore/Account Details.csv')
AcD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5386 entries, 0 to 5385
Data columns (total 12 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Oracle Account Customer ID                5386 non-null   object 
 1   Oracle Account Customer Name              5386 non-null   object 
 2   Oracle Account Account Segmentation       5386 non-null   object 
 3   Oracle Account Business Unit              5386 non-null   object 
 4   Eloqua Accounts Account Engagement Score  5259 non-null   float64
 5   AE Person Name                            5386 non-null   object 
 6   AE Level14 Territory Name                 5386 non-null   object 
 7   ATS Team Person Name                      4358 non-null   object 
 8   Oracle Account Line of Business           5369 non-null   object 
 9   Oracle Account Country                    5386 non-null   object 
 10  Arr Total Arr                       

In [17]:
import re

# Create normalized versions (keep only digits)
combined_df['CustomerId_NAR_norm'] = combined_df['CustomerId_NAR'].astype(str).str.extract('(\d+)', expand=False)
AcD['Oracle_CustID_norm'] = AcD['Oracle Account Customer ID'].astype(str).str.extract('(\d+)', expand=False)

In [18]:
# Unique values of each
combined_unique_ids = combined_df['CustomerId_NAR_norm'].dropna().unique()
acd_unique_ids = AcD['Oracle_CustID_norm'].dropna().unique()

# Unique matches
unique_matches = set(combined_unique_ids) & set(acd_unique_ids)
print(f"‚úÖ Unique matching accounts: {len(unique_matches)}")

‚úÖ Unique matching accounts: 1689


In [19]:
# Filter activity rows that match AcD accounts
combined_matches_df = combined_df[combined_df['CustomerId_NAR_norm'].isin(unique_matches)]

# Total (non-unique) matches
total_activity_matches = len(combined_matches_df)
print(f"üìä Total matched activity rows (non-unique): {total_activity_matches}")

üìä Total matched activity rows (non-unique): 1585076


Total rows in AcD: 5386
Unique Oracle Account Customer IDs: 4087


In [None]:
database = pd.read_csv('/Applications/WorkDataSets/DataStore/Demandbase CDP 4.0/DemandbaseDataCDP')


FileNotFoundError: [Errno 2] No such file or directory: '/Applications/WorkDataSets/DataStore/Demandbase CDP 4.0/DemandbaseDataCDP'

In [29]:
unique_party_numbers = database['party_number'].nunique()
total_rows = len(database)

print(f"‚úÖ Unique party numbers: {unique_party_numbers}")
print(f"üìä Total rows: {total_rows}")
print(f"üîÅ Duplicate party numbers: {total_rows - unique_party_numbers}")

‚úÖ Unique party numbers: 770
üìä Total rows: 14235
üîÅ Duplicate party numbers: 13465


In [30]:
# Keep only digits from ID fields
combined_df['CustomerId_NAR_norm'] = combined_df['CustomerId_NAR'].astype(str).str.extract('(\d+)', expand=False)
database['party_number_norm'] = database['party_number'].astype(str).str.extract('(\d+)', expand=False)

In [31]:
valid_party_ids = database['party_number_norm'].dropna().unique()
print(f"Number of valid account IDs to match against: {len(valid_party_ids)}")

Number of valid account IDs to match against: 770


In [32]:
filtered_activity_df = combined_df[combined_df['CustomerId_NAR_norm'].isin(valid_party_ids)]

In [33]:
print(f"Original combined_df rows: {len(combined_df)}")
print(f"Filtered rows (only matched accounts): {len(filtered_activity_df)}")
print(f"Unique accounts in filtered activity: {filtered_activity_df['CustomerId_NAR_norm'].nunique()}")

Original combined_df rows: 1977398
Filtered rows (only matched accounts): 477720
Unique accounts in filtered activity: 450


In [46]:
filtered_activity_df.to_csv('/Applications/WorkDataSets/DataStore/Demandbase CDP 4.0/DemandbaseDataCDP2')