In [2]:
# Core
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning / Stats
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [3]:
import pandas as pd

# ========= 1) F5 schema (Iteration 1) =========
F5_SCHEMA = {
    "f5_core_adc": [
        "F5 Networks", "BIG-IP", "Viprion", "F5 SSL Orchestrator",
        "F5 Load Balancer", "F5 Application Traffic Insight"
    ],
    "f5_security": [
        "F5 ASM", "F5 APM", "F5 Bot Defense", "F5 Distributed Cloud WAAP",
        "F5 Silverline", "F5 Web Application Firewall", "F5 Advanced WAF"
    ],
    "f5_cloud_services": [
        "F5 Distributed Cloud", "F5 Global Server Load Balancing",
        "F5 DNS", "F5 WAF", "F5 Cloud Services"
    ],
    "complementary_cloud": [
        "Azure", "AWS", "Amazon Web Services", "Google Cloud",
        "Kubernetes", "Azure Virtual Machines", "AWS AppStream"
    ],
    "complementary_identity": [
        "Okta", "Azure AD", "Ivanti", "Zscaler", "Ping Identity", "CyberArk"
    ],
    "complementary_workspace": [
        "IGEL", "Unicon", "Thin Client", "Workspace ONE", "Parallels", "Nutanix Frame"
    ]
}

TACTICAL_LABELS = {
    "f5_core_adc": {"high": "High F5 Entrenchment", "medium": "Moderate F5 Presence", "low": "Minimal F5 Footprint"},
    "f5_security": {"high": "Advanced F5 Security Stack", "medium": "F5 Security Present", "low": "No Security Modules"},
    "f5_cloud_services": {"high": "Cloud-Ready F5 Footprint", "medium": "Some Cloud Adoption", "low": "On-Prem Focused"},
    "complementary_cloud": {"high": "Cloud-Aligned", "medium": "Cloud-Capable", "low": "Not Cloud-Aligned"},
    "complementary_identity": {"high": "Identity-Ready", "medium": "Basic Identity Signals", "low": "No Identity Signals"},
    "complementary_workspace": {"high": "Workspace-Enabled", "medium": "Some Workspace Signals", "low": "No Workspace Signals"}
}

def _label_for(category: str, n: int) -> str:
    return (
        TACTICAL_LABELS[category]["high"] if n >= 2 else
        TACTICAL_LABELS[category]["medium"] if n == 1 else
        TACTICAL_LABELS[category]["low"]
    )

def process_technographics(df: pd.DataFrame, tech_col: str = "Technographics") -> pd.DataFrame:
    # ensure string and handle missing
    tech_series = df[tech_col].fillna("").astype(str)

    # pre-split once for speed
    split_list = tech_series.apply(lambda s: [t.strip() for t in s.split("|") if t.strip()])

    for category, keywords in F5_SCHEMA.items():
        # matches per row (comma-separated names)
        matches = split_list.apply(
            lambda items: [t for t in items if any(k.lower() in t.lower() for k in keywords)]
        )
        df[f"{category}_matches"] = matches.apply(lambda lst: ", ".join(lst))
        df[f"{category}_summary"] = matches.apply(lambda lst: _label_for(category, len(lst)))

    return df

# ========= 2) Load → Process → Save =========
# Use your path:
in_path = "/Applications/WorkDataSets/Demandbase_techno.csv"
out_path = "/Applications/WorkDataSets/Demandbase_techno_F5_analysis.csv"

# robust CSV read (keeps all as strings, tolerates odd encodings)
df = pd.read_csv(in_path, dtype=str, keep_default_na=False, encoding="utf-8-sig", engine="python")

# sanity check the expected column
if "Technographics" not in df.columns:
    raise ValueError(f"'Technographics' column not found. Columns present: {list(df.columns)}")

df = process_technographics(df, tech_col="Technographics")

# write output
df.to_csv(out_path, index=False, encoding="utf-8-sig")
print(f"Done. Wrote: {out_path}")

# optional: quick peek at new columns
cols_to_show = [
    "f5_core_adc_matches","f5_core_adc_summary",
    "f5_security_matches","f5_security_summary",
    "f5_cloud_services_matches","f5_cloud_services_summary",
    "complementary_cloud_matches","complementary_cloud_summary",
    "complementary_identity_matches","complementary_identity_summary",
    "complementary_workspace_matches","complementary_workspace_summary"
]
print(df[cols_to_show].head(3))

Done. Wrote: /Applications/WorkDataSets/Demandbase_techno_F5_analysis.csv
                                 f5_core_adc_matches   f5_core_adc_summary  \
0  BIG-IP, BIG-IP Local Traffic Manager, F5 Netwo...  High F5 Entrenchment   
1  BIG-IP, F5 Application Traffic Insight, F5 Net...  High F5 Entrenchment   
2  BIG-IP Advanced WAF, F5 Networks, F5 SSL Orche...  High F5 Entrenchment   

             f5_security_matches  f5_security_summary  \
0  F5 Silverline DDoS Protection  F5 Security Present   
1  F5 Silverline DDoS Protection  F5 Security Present   
2                                 No Security Modules   

                           f5_cloud_services_matches  \
0  F5 Distributed Cloud Bot Defense, F5 Global Se...   
1                    F5 Global Server Load Balancing   
2                    F5 Global Server Load Balancing   

  f5_cloud_services_summary  \
0  Cloud-Ready F5 Footprint   
1       Some Cloud Adoption   
2       Some Cloud Adoption   

                         complemen

In [11]:
import pandas as pd
import csv
import re

# ---- F5 Schema and tactical labels (unchanged) ----
F5_SCHEMA = {
    "f5_core_adc": [
        "F5 Networks", "BIG-IP", "Viprion", "F5 SSL Orchestrator",
        "F5 Load Balancer", "F5 Application Traffic Insight"
    ],
    "f5_security": [
        "F5 ASM", "F5 APM", "F5 Bot Defense", "F5 Distributed Cloud WAAP",
        "F5 Silverline", "F5 Web Application Firewall", "F5 Advanced WAF"
    ],
    "f5_cloud_services": [
        "F5 Distributed Cloud", "F5 Global Server Load Balancing",
        "F5 DNS", "F5 WAF", "F5 Cloud Services"
    ],
    "complementary_cloud": [
        "Azure", "AWS", "Amazon Web Services", "Google Cloud",
        "Kubernetes", "Azure Virtual Machines", "AWS AppStream"
    ],
    "complementary_identity": [
        "Okta", "Azure AD", "Ivanti", "Zscaler", "Ping Identity", "CyberArk"
    ],
    "complementary_workspace": [
        "IGEL", "Unicon", "Thin Client", "Workspace ONE", "Parallels", "Nutanix Frame"
    ]
}

TACTICAL_LABELS = {
    "f5_core_adc": {"high": "High F5 Entrenchment", "medium": "Moderate F5 Presence", "low": "Minimal F5 Footprint"},
    "f5_security": {"high": "Advanced F5 Security Stack", "medium": "F5 Security Present", "low": "No Security Modules"},
    "f5_cloud_services": {"high": "Cloud-Ready F5 Footprint", "medium": "Some Cloud Adoption", "low": "On-Prem Focused"},
    "complementary_cloud": {"high": "Cloud-Aligned", "medium": "Cloud-Capable", "low": "Not Cloud-Aligned"},
    "complementary_identity": {"high": "Identity-Ready", "medium": "Basic Identity Signals", "low": "No Identity Signals"},
    "complementary_workspace": {"high": "Workspace-Enabled", "medium": "Some Workspace Signals", "low": "No Workspace Signals"}
}

# ---- Helper functions ----
def _label_for(category: str, n: int) -> str:
    return (
        TACTICAL_LABELS[category]["high"] if n >= 2 else
        TACTICAL_LABELS[category]["medium"] if n == 1 else
        TACTICAL_LABELS[category]["low"]
    )

def clean_technographics_field(text: str) -> list:
    if not isinstance(text, str):
        return []
    text = text.strip()
    text = re.sub(r'^[\|,;]+|[\|,;]+$', '', text)  # strip pipes/commas at ends
    text = re.sub(r'[^A-Za-z0-9\.\s\-_/]', ' ', text)  # remove ! and odd chars (keep .NET etc.)
    return [t.strip() for t in text.split("|") if t.strip()]

def process_technographics(df: pd.DataFrame, tech_col: str = "Technographics") -> pd.DataFrame:
    tech_series = df[tech_col].fillna("").astype(str)
    split_list = tech_series.apply(clean_technographics_field)

    for category, keywords in F5_SCHEMA.items():
        matches = split_list.apply(lambda items: [t for t in items if any(k.lower() in t.lower() for k in keywords)])
        df[f"{category}_matches"] = matches.apply(lambda lst: ", ".join(lst))
        df[f"{category}_summary"] = matches.apply(lambda lst: _label_for(category, len(lst)))

    return df

def clean_for_looker(df: pd.DataFrame) -> pd.DataFrame:
    new_cols = []
    for i, col in enumerate(df.columns):
        col_str = str(col).strip()
        if col_str == "" or col_str.lower().startswith("unnamed"):
            col_str = f"column_{i+1}"
        col_str = col_str.lower().replace(" ", "_").replace("-", "_").replace("/", "_").strip()
        new_cols.append(col_str)
    df.columns = new_cols

    df = df.dropna(axis=1, how="all")

    for col in df.columns:
        df[col] = df[col].astype(str).str.replace(r"[\r\n]+", " ", regex=True).str.strip()

    return df

def repair_csv_rows(input_path):
    with open(input_path, 'r', encoding='utf-8-sig', newline='') as f:
        sample = next(csv.reader(f))
        expected_columns = len(sample)
    repaired = []
    with open(input_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) > expected_columns:
                row = row[:expected_columns]
            elif len(row) < expected_columns:
                row += [""] * (expected_columns - len(row))
            repaired.append(row)
    return repaired, expected_columns

# ---- Main execution ----
in_path = "/Applications/WorkDataSets/Demandbase_techno.csv"
out_path = "/Applications/WorkDataSets/Demandbase_techno_F5_ready_for_looker101.csv"

rows, expected_cols = repair_csv_rows(in_path)
df = pd.DataFrame(rows[1:], columns=rows[0])

df = process_technographics(df, tech_col="Technographics")
df = clean_for_looker(df)

df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("✅ Done. File cleaned and ready:", out_path)
print(df.head(3).to_string())

✅ Done. File cleaned and ready: /Applications/WorkDataSets/Demandbase_techno_F5_ready_for_looker101.csv
                       account_name         journey_stage engaged_known_people                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [12]:
df_test = df.drop(columns=['technographics'], errors='ignore')
df_test.to_csv("/Applications/WorkDataSets/Demandbase_test_no_tech.csv", index=False, encoding="utf-8-sig")
print("✅ Exported test file without Technographics column")

✅ Exported test file without Technographics column


In [13]:
import csv
import re

input_path = "/Applications/WorkDataSets/Demandbase_techno.csv"
expected_columns = None

illegal_pattern = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F]")  # control chars

with open(input_path, "r", encoding="utf-8-sig", newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        if i == 0:
            expected_columns = len(row)
            print(f"Header detected with {expected_columns} columns.")
            continue
        
        issues = []
        
        # Check column count
        if len(row) != expected_columns:
            issues.append(f"Column count mismatch: {len(row)} columns")
        
        # Check each cell for illegal control characters or unescaped commas
        for col_index, value in enumerate(row):
            if illegal_pattern.search(value):
                issues.append(f"Illegal character in column {col_index+1}")
            if "," in value and not (value.startswith('"') and value.endswith('"')):
                issues.append(f"Unquoted comma in column {col_index+1}")
            if "\n" in value or "\r" in value:
                issues.append(f"Line break in column {col_index+1}")
        
        if issues:
            print(f"Row {i+1} issues: {issues}")
        
        if i > 20:  # stop after scanning first 20 rows to reduce noise
            break

Header detected with 7 columns.
Row 2 issues: ['Unquoted comma in column 4']
Row 3 issues: ['Unquoted comma in column 4']
Row 4 issues: ['Unquoted comma in column 1', 'Unquoted comma in column 4', 'Unquoted comma in column 6']
Row 5 issues: ['Unquoted comma in column 4']
Row 6 issues: ['Unquoted comma in column 1', 'Unquoted comma in column 4', 'Unquoted comma in column 6']
Row 7 issues: ['Unquoted comma in column 4']
Row 8 issues: ['Unquoted comma in column 4']
Row 9 issues: ['Unquoted comma in column 4']
Row 10 issues: ['Unquoted comma in column 4']
Row 11 issues: ['Unquoted comma in column 4']
Row 14 issues: ['Unquoted comma in column 1', 'Unquoted comma in column 4', 'Unquoted comma in column 6']
Row 15 issues: ['Unquoted comma in column 4']
Row 16 issues: ['Unquoted comma in column 4']
Row 17 issues: ['Unquoted comma in column 4']
Row 18 issues: ['Unquoted comma in column 4']
Row 20 issues: ['Unquoted comma in column 4']
Row 22 issues: ['Unquoted comma in column 4']


In [16]:
import pandas as pd
import re

# ========= Load raw file with no parsing to preserve exact data =========
in_path = "/Applications/WorkDataSets/Demandbase_techno.csv"
out_path = "/Applications/WorkDataSets/Demandbase_techno_F5_ready_for_looker1011.csv"

# Load CSV safely with python engine
df = pd.read_csv(in_path, dtype=str, keep_default_na=False, encoding="utf-8-sig", engine="python")

# ========= Clean Column Names =========
df.columns = [
    re.sub(r'[^A-Za-z0-9_]+', '_', col.strip().lower()) or f"column_{i+1}"
    for i, col in enumerate(df.columns)
]

# ========= Escape Commas and Clean Cells =========
def escape_cell(value):
    if pd.isna(value):
        return ""
    v = str(value).strip()
    v = re.sub(r"[\r\n]+", " ", v)  # remove line breaks
    # If comma exists and not already quoted, add quotes
    if "," in v and not (v.startswith('"') and v.endswith('"')):
        v = f'"{v}"'
    return v

df = df.applymap(escape_cell)

# ========= Process Technographics (F5 logic stays the same if column exists) =========
def clean_technographics_field(text: str) -> list:
    text = text.strip().strip('|," ')
    text = re.sub(r'[^A-Za-z0-9\.\s\-\|_/]', ' ', text)  # keep only safe chars
    return [t.strip() for t in text.split("|") if t.strip()]

from collections import defaultdict

if "technographics" in df.columns:
    split_list = df["technographics"].apply(lambda x: clean_technographics_field(x.strip('"')))
    
    for category, keywords in F5_SCHEMA.items():
        matches = split_list.apply(lambda items: [t for t in items if any(k.lower() in t.lower() for k in keywords)])
        df[f"{category}_matches"] = matches.apply(lambda lst: ", ".join(lst))
        df[f"{category}_summary"] = matches.apply(lambda lst: _label_for(category, len(lst)))

# ========= Save for Looker Studio =========
df.to_csv(out_path, index=False, encoding="utf-8-sig")
print(f"✅ Successfully cleaned and exported: {out_path}")

  df = df.applymap(escape_cell)


✅ Successfully cleaned and exported: /Applications/WorkDataSets/Demandbase_techno_F5_ready_for_looker1011.csv


In [17]:
import csv
import re

input_path = "/Applications/WorkDataSets/Demandbase_techno.csv"
illegal_pattern = re.compile(r'[^\x20-\x7E]')  # anything outside normal printable ASCII

with open(input_path, 'r', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    for i, row in enumerate(reader, start=2):
        for col_index, value in enumerate(row):
            if illegal_pattern.search(value):  # detect non-printable characters
                print(f"🚨 Row {i}, Column {header[col_index]} contains invalid character(s): {repr(value)}")
                raise SystemExit
        if len(row) != len(header):
            print(f"🚨 Row {i} has column mismatch: expected {len(header)}, got {len(row)}")
            print(row)
            raise SystemExit

print("✅ No illegal characters or column mismatches found in first pass.")

🚨 Row 15, Column Account Name contains invalid character(s): 'BPO Holdco Coöperatief U.A.'


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [19]:
import pandas as pd
import csv
import re
import unicodedata

# ========= 1) F5 schema and labeling logic =========
F5_SCHEMA = {
    "f5_core_adc": [
        "F5 Networks", "BIG-IP", "Viprion", "F5 SSL Orchestrator",
        "F5 Load Balancer", "F5 Application Traffic Insight"
    ],
    "f5_security": [
        "F5 ASM", "F5 APM", "F5 Bot Defense", "F5 Distributed Cloud WAAP",
        "F5 Silverline", "F5 Web Application Firewall", "F5 Advanced WAF"
    ],
    "f5_cloud_services": [
        "F5 Distributed Cloud", "F5 Global Server Load Balancing",
        "F5 DNS", "F5 WAF", "F5 Cloud Services"
    ],
    "complementary_cloud": [
        "Azure", "AWS", "Amazon Web Services", "Google Cloud",
        "Kubernetes", "Azure Virtual Machines", "AWS AppStream"
    ],
    "complementary_identity": [
        "Okta", "Azure AD", "Ivanti", "Zscaler", "Ping Identity", "CyberArk"
    ],
    "complementary_workspace": [
        "IGEL", "Unicon", "Thin Client", "Workspace ONE", "Parallels", "Nutanix Frame"
    ]
}

TACTICAL_LABELS = {
    "f5_core_adc": {"high": "High F5 Entrenchment", "medium": "Moderate F5 Presence", "low": "Minimal F5 Footprint"},
    "f5_security": {"high": "Advanced F5 Security Stack", "medium": "F5 Security Present", "low": "No Security Modules"},
    "f5_cloud_services": {"high": "Cloud-Ready F5 Footprint", "medium": "Some Cloud Adoption", "low": "On-Prem Focused"},
    "complementary_cloud": {"high": "Cloud-Aligned", "medium": "Cloud-Capable", "low": "Not Cloud-Aligned"},
    "complementary_identity": {"high": "Identity-Ready", "medium": "Basic Identity Signals", "low": "No Identity Signals"},
    "complementary_workspace": {"high": "Workspace-Enabled", "medium": "Some Workspace Signals", "low": "No Workspace Signals"}
}

def _label_for(category: str, n: int) -> str:
    return (
        TACTICAL_LABELS[category]["high"] if n >= 2 else
        TACTICAL_LABELS[category]["medium"] if n == 1 else
        TACTICAL_LABELS[category]["low"]
    )

# ========= Unicode Normalization =========
def normalize_unicode(df: pd.DataFrame) -> pd.DataFrame:
    def normalize_cell(value):
        if pd.isna(value):
            return ""
        value = unicodedata.normalize('NFKD', str(value)).encode('ascii', 'ignore').decode('ascii')
        return value.strip()
    return df.applymap(normalize_cell)

# ========= Clean Technographics =========
def clean_technographics_field(text: str) -> list:
    if not isinstance(text, str):
        return []
    text = text.strip().strip('|," ')
    text = re.sub(r'[^A-Za-z0-9\.\s\-\|_/]', ' ', text)
    return [t.strip() for t in text.split("|") if t.strip()]

def process_technographics(df: pd.DataFrame, tech_col: str = "Technographics") -> pd.DataFrame:
    if tech_col not in df.columns:
        return df
    split_list = df[tech_col].fillna("").astype(str).apply(clean_technographics_field)

    for category, keywords in F5_SCHEMA.items():
        matches = split_list.apply(lambda items: [t for t in items if any(k.lower() in t.lower() for k in keywords)])
        df[f"{category}_matches"] = matches.apply(lambda lst: ", ".join(lst))
        df[f"{category}_summary"] = matches.apply(lambda lst: _label_for(category, len(lst)))
    return df

# ========= Column Header and Content Cleaning =========
def clean_for_looker(df: pd.DataFrame) -> pd.DataFrame:
    new_cols = []
    for i, col in enumerate(df.columns):
        col_str = str(col).strip()
        if col_str == "" or col_str.lower().startswith("unnamed"):
            col_str = f"column_{i+1}"
        col_str = col_str.lower().replace(" ", "_").replace("-", "_").replace("/", "_")
        new_cols.append(col_str)
    df.columns = new_cols

    # Clean cells
    for col in df.columns:
        df[col] = df[col].astype(str).str.replace(r"[\r\n]+", " ", regex=True).str.strip()
        # Escape commas if present
        df[col] = df[col].apply(lambda v: f'"{v}"' if "," in v and not (v.startswith('"') and v.endswith('"')) else v)

    df = df.dropna(axis=1, how="all")
    return df

# ========= Repair CSV Rows =========
def repair_csv_rows(input_path):
    with open(input_path, 'r', encoding='utf-8-sig', newline='') as f:
        first_row = next(csv.reader(f))
        expected_cols = len(first_row)
    repaired_rows = [first_row]

    with open(input_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for row in reader:
            if len(row) > expected_cols:
                row = row[:expected_cols]
            elif len(row) < expected_cols:
                row += [""] * (expected_cols - len(row))
            repaired_rows.append(row)
    return repaired_rows, expected_cols

# ========= Main Execution =========
in_path = "/Applications/WorkDataSets/Demandbase_techno.csv"
out_path = "/Applications/WorkDataSets/Demandbase_techno_F5_ready_for_looker2222.csv"

rows, expected_cols = repair_csv_rows(in_path)
df = pd.DataFrame(rows[1:], columns=rows[0])

df = process_technographics(df, tech_col="Technographics")
df = clean_for_looker(df)
df = normalize_unicode(df)

df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("✅ FINAL OUTPUT READY FOR LOOKER:", out_path)
print(df.head(5).to_string())

✅ FINAL OUTPUT READY FOR LOOKER: /Applications/WorkDataSets/Demandbase_techno_F5_ready_for_looker2222.csv
                       account_name         journey_stage engaged_known_people                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

  return df.applymap(normalize_cell)


In [20]:
import csv
import re
import unicodedata

input_path = "/Applications/WorkDataSets/Demandbase_techno.csv"

illegal_char_pattern = re.compile(r'[^\x20-\x7E]')  # non-printable ASCII or special Unicode

with open(input_path, 'r', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    for i, row in enumerate(reader, start=2):
        for col_index, value in enumerate(row):
            if illegal_char_pattern.search(value):
                print(f"🚨 Row {i}, Column '{header[col_index]}' contains invalid character(s): {repr(value)}")
                break

🚨 Row 15, Column 'Account Name' contains invalid character(s): 'BPO Holdco Coöperatief U.A.'
🚨 Row 34, Column 'Technographics' contains invalid character(s): '|.NET|.NET 5|123 Reg!|3CX|3CX Phone System|6sense|8X8 Communication|AB Initio Data Quality Environment|ABAP|ABINIT|ADP|ADP Payroll|ADTRAN|AMI|AOL|AOMEI OneKey Recovery|APC|APCON|ASG-Cypress|ASM|ASP.NET|AT Internet|AT&T Global Network Client|AT&T IP Toll-Free|AT&T Managed Internet Service (MIS)|AT&T Managed Threat Detection and Response Service|AT&T Network-Based IP Remote Access|ATG Web Commerce|ATT|AWS App Runner|AWS Artifact|AWS Batch|AWS CloudFormation|AWS CloudTrail|AWS CodeBuild|AWS CodeDeploy|AWS CodePipeline|AWS Config|AWS Data Pipeline|AWS Direct Connect|AWS Elemental|AWS Fargate|AWS Glue|AWS GovCloud|AWS Guardrails|AWS IoT|AWS Key Management Service|AWS Lambda|AWS Managed Services|AWS Organizations|AWS Secrets Manager|AWS Security Hub|AWS Service Catalog|AWS Step Functions|AWS Transit Gateway|AWS Trusted Advisor|AZURE DA

In [21]:
import pandas as pd
import csv
import re
import unicodedata

# ========= Load raw file robustly =========
in_path = "/Applications/WorkDataSets/Demandbase_techno.csv"
out_path = "/Applications/WorkDataSets/Demandbase_techno_F5_READY_FINAL.csv"

# Load file as raw text and repair column lengths
with open(in_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
    sample = next(csv.reader(f))
    expected_cols = len(sample)

repaired = []
with open(in_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
    reader = csv.reader(f)
    for row in reader:
        if len(row) > expected_cols:
            row = row[:expected_cols]
        elif len(row) < expected_cols:
            row += [""] * (expected_cols - len(row))
        repaired.append(row)

# Convert to DataFrame
df = pd.DataFrame(repaired[1:], columns=repaired[0])

# ========= Normalize Unicode to ASCII safely =========
def normalize_ascii(value):
    if pd.isna(value):
        return ""
    # Remove non-ASCII characters
    value = unicodedata.normalize('NFKD', str(value)).encode('ascii', 'ignore').decode('ascii')
    # Remove leftover control characters
    value = re.sub(r'[\x00-\x1F\x7F]+', ' ', value)
    # Strip extra whitespace
    return value.strip()

df = df.applymap(normalize_ascii)

# ========= Clean Column Names =========
df.columns = [
    re.sub(r'[^A-Za-z0-9_]+', '_', col.strip().lower()) or f"column_{i+1}"
    for i, col in enumerate(df.columns)
]

# ========= Clean Technographics Field =========
def split_technographics(text):
    text = text.strip().strip('|," ')
    # Replace pipes with commas
    items = [t.strip() for t in text.split('|') if t.strip()]
    return ", ".join(items)

if "technographics" in df.columns:
    df["technographics"] = df["technographics"].apply(split_technographics)

# ========= Escape commas only when needed =========
for col in df.columns:
    df[col] = df[col].apply(lambda v: f'"{v}"' if "," in v and not (v.startswith('"') and v.endswith('"')) else v)

# ========= Final Save =========
df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("✅ CLEANED FILE READY:", out_path)
print("Preview:")
print(df.head(5).to_string())

  df = df.applymap(normalize_ascii)


✅ CLEANED FILE READY: /Applications/WorkDataSets/Demandbase_techno_F5_READY_FINAL.csv
Preview:
                       account_name         journey_stage engaged_known_people                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           