# Explore Plumber Contact Files

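This notebook loads several plumber contact files (one TXT batch file and three CSV exports), previews the first 10 lines or rows of each, compares their sizes and row counts, and runs a basic missing-value check on the active contacts.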

In [None]:
import pandas as pd
import os
from pathlib import Path

## 1. Active Plumbers Batch (TXT File)

In [None]:
# Preview the active plumbers batch text file and report its total line count.
print("=== ACTIVE PLUMBERS BATCH (active_plumbers_batch.txt) ===")
print("\nFirst 10 lines:")
print("-" * 50)

# Single pass over the file: print the first 10 lines while counting every
# line, instead of opening and reading the file a second time just to count.
# The encoding is pinned so decoding does not depend on the platform locale,
# and errors='replace' keeps this preview from crashing on a stray bad byte.
# NOTE(review): assumes the batch file is UTF-8 -- confirm against its producer.
total_lines = 0  # stays 0 when the file is empty (the loop body never runs)
with open('active_plumbers_batch.txt', 'r', encoding='utf-8', errors='replace') as f:
    for total_lines, line in enumerate(f, start=1):
        if total_lines <= 10:
            # strip() removes the trailing newline and any surrounding whitespace
            print(f"{total_lines}: {line.strip()}")

print(f"\nTotal lines in file: {total_lines}")

## 2. Plumber Contacts Active Only (CSV)

In [None]:
# Load the active-only contacts CSV, report its dimensions and column names,
# then leave the first 10 rows as the cell's displayed result.
print("\n=== PLUMBER CONTACTS ACTIVE ONLY (plumber_contacts_active_only_Claude.csv) ===")
df_active = pd.read_csv('plumber_contacts_active_only_Claude.csv')

# One print call with a newline separator emits the same four report lines.
print(
    f"\nShape: {df_active.shape}",
    f"Columns: {list(df_active.columns)}",
    "\nFirst 10 rows:",
    "-" * 50,
    sep="\n",
)
df_active.head(10)

## 3. Full Plumber Contacts (CSV)

In [None]:
# Load the full contacts CSV, report its dimensions and column names,
# then leave the first 10 rows as the cell's displayed result.
print("\n=== FULL PLUMBER CONTACTS (plumber_contacts_Claude.csv) ===")
df_full = pd.read_csv('plumber_contacts_Claude.csv')

# One print call with a newline separator emits the same four report lines.
print(
    f"\nShape: {df_full.shape}",
    f"Columns: {list(df_full.columns)}",
    "\nFirst 10 rows:",
    "-" * 50,
    sep="\n",
)
df_full.head(10)

## 4. Plumber Contacts for Sheets (CSV)

In [None]:
# Load the "for sheets" contacts CSV, report its dimensions and column names,
# then leave the first 10 rows as the cell's displayed result.
print("\n=== PLUMBER CONTACTS FOR SHEETS (plumber_contacts_for_sheets.csv) ===")
df_sheets = pd.read_csv('plumber_contacts_for_sheets.csv')

# One print call with a newline separator emits the same four report lines.
print(
    f"\nShape: {df_sheets.shape}",
    f"Columns: {list(df_sheets.columns)}",
    "\nFirst 10 rows:",
    "-" * 50,
    sep="\n",
)
df_sheets.head(10)

## 5. Compare File Structures

In [None]:
# Compare the input files by on-disk size, row count, and content equality.
# Relies on df_active, df_full and df_sheets loaded by the earlier cells.
print("\n=== FILE COMPARISON ===")

# Each file name is written once and used for both the label and the size
# lookup, so the printed name can never drift from the file actually measured.
print("\nFile sizes:")
for file_name in (
    'active_plumbers_batch.txt',
    'plumber_contacts_active_only_Claude.csv',
    'plumber_contacts_Claude.csv',
    'plumber_contacts_for_sheets.csv',
):
    print(f"- {file_name}: {os.path.getsize(file_name):,} bytes")

# Row counts come from the already-loaded frames, paired with their source file.
print("\nRecord counts:")
for file_name, frame in (
    ('plumber_contacts_active_only_Claude.csv', df_active),
    ('plumber_contacts_Claude.csv', df_full),
    ('plumber_contacts_for_sheets.csv', df_sheets),
):
    print(f"- {file_name}: {len(frame):,} rows")

# Check if the full and sheets versions are identical.
# DataFrame.equals is strict: shape, values and column dtypes must all match
# (NaNs in the same position count as equal), so "DIFFERENT" can also mean a
# dtype-only difference rather than different cell contents.
if df_full.equals(df_sheets):
    print("\n✓ plumber_contacts_Claude.csv and plumber_contacts_for_sheets.csv are IDENTICAL")
else:
    print("\n✗ plumber_contacts_Claude.csv and plumber_contacts_for_sheets.csv are DIFFERENT")

## 6. Data Quality Check

In [None]:
# Data quality check on the active contacts: missing values per column, plus
# value distributions for the Status and City columns when they exist.
# Relies on df_active loaded by an earlier cell.
print("\n=== DATA QUALITY CHECK (Active Contacts) ===")
print("\nMissing values per column:")
missing = df_active.isnull().sum()
# Percentage of rows missing in each column, rounded to two decimals.
missing_pct = (missing / len(df_active) * 100).round(2)
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
})

# Report only the columns that have gaps. When there are none, say so
# explicitly instead of printing pandas' raw "Empty DataFrame" repr.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]
if columns_with_missing.empty:
    print("No missing values found.")
else:
    print(columns_with_missing)

# Sample of unique values for key columns; each is guarded because the
# column may not be present in this export.
if 'Status' in df_active.columns:
    print("\nUnique Status values:")
    print(df_active['Status'].value_counts())

if 'City' in df_active.columns:
    print("\nTop 10 Cities:")
    print(df_active['City'].value_counts().head(10))