In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# =====================================================
# CONFIG
# =====================================================
# Dataset size: how many collection cases to simulate, and the inclusive
# range for the number of emails generated per case.
N_CASES = 250
MIN_EMAILS, MAX_EMAILS = 18, 25

# Earliest possible conversation start: 60 days before the moment this cell
# runs. Timestamps therefore follow the wall clock and differ between runs.
START_DATE = datetime.now() - timedelta(days=60)

# Seed both RNG libraries with the same fixed value so the random draws
# repeat from run to run.
random.seed(42)
np.random.seed(42)

# =====================================================
# CUSTOMER PROFILES
# =====================================================
# One row per customer archetype:
#   (profile key, sampling weight, tone label, intent label, email templates)
# Keeping the weight beside its profile keeps the two derived structures
# below in the same order.
_CUSTOMER_TABLE = [
    ("cooperative", 0.30, "cooperative", "willing_to_pay", [
        "I acknowledge the outstanding balance.",
        "I can pay $500 by Friday.",
        "Thanks for the reminder, arranging payment.",
    ]),
    ("distressed", 0.25, "frustrated", "partial_pay", [
        "I lost my job and cannot pay the full amount.",
        "Medical expenses have affected my finances.",
        "I can only make a partial payment right now.",
    ]),
    ("delayer", 0.20, "neutral", "delay", [
        "I am reviewing this internally.",
        "I will get back to you later.",
        "Please allow more time.",
    ]),
    ("hostile", 0.15, "hostile", "threaten", [
        "Stop harassing me immediately.",
        "I am not paying this amount.",
        "I will contact my lawyer if this continues.",
    ]),
    # The "ghost" archetype has a single empty template, i.e. an empty body.
    ("ghost", 0.10, "silent", "no_response", [""]),
]

# Profile key -> {"tone", "intent", "templates"}, in table order.
CUSTOMER_PROFILES = {
    key: {"tone": tone, "intent": intent, "templates": templates}
    for key, _weight, tone, intent, templates in _CUSTOMER_TABLE
}

# Sampling weights, positionally aligned with the keys of CUSTOMER_PROFILES.
CUSTOMER_WEIGHTS = [row[1] for row in _CUSTOMER_TABLE]

# =====================================================
# DCA PROFILES
# =====================================================
# One row per collection-agency archetype:
#   (profile key, sampling weight, tone label, intent label, email templates)
_DCA_TABLE = [
    ("professional", 0.40, "professional", "reminder", [
        "This is a polite reminder regarding your overdue invoice.",
        "Please let us know a suitable payment date.",
    ]),
    ("empathetic", 0.25, "empathetic", "negotiation", [
        "We understand your situation and are here to help.",
        "Let us know if a payment plan would help.",
    ]),
    ("aggressive", 0.20, "aggressive", "escalation", [
        "Immediate payment is required to avoid escalation.",
        "Failure to pay will result in legal action.",
    ]),
    ("inefficient", 0.15, "repetitive", "reminder", [
        "Reminder: payment pending.",
        "Following up again on the overdue amount.",
    ]),
]

# Profile key -> {"tone", "intent", "templates"}, in table order.
DCA_PROFILES = {
    key: {"tone": tone, "intent": intent, "templates": templates}
    for key, _weight, tone, intent, templates in _DCA_TABLE
}

# Sampling weights, positionally aligned with the keys of DCA_PROFILES.
DCA_WEIGHTS = [row[1] for row in _DCA_TABLE]

# =====================================================
# CONVERSATION PHASES
# =====================================================
# Ordered stages every simulated email thread walks through.
PHASES = [
    "initial_contact",
    "follow_up",
    "customer_response",
    "ptp_or_pushback",
    "resolution_or_escalation"
]

# Subject lines; one is drawn at random for every email.
SUBJECTS = [
    "Outstanding Invoice Follow-up",
    "Payment Reminder",
    "Immediate Attention Required",
    "Regarding Overdue Balance"
]

# Number of consecutive emails spent in each phase before moving to the next.
# The final phase absorbs every remaining email of the thread.
EMAILS_PER_PHASE = 4

# Phases in which the collection agency is the sender; in every other phase
# the customer is the sender.
DCA_PHASES = ("initial_contact", "follow_up")

# =====================================================
# GENERATION
# =====================================================
# Builds one dict per email in `rows`. Each case gets a customer id, one
# customer profile and one agency profile (fixed for the whole thread), and
# between MIN_EMAILS and MAX_EMAILS emails spaced one day apart.
rows = []

# Key lists are loop-invariant, so build them once instead of once per case.
customer_keys = list(CUSTOMER_PROFILES)
dca_keys = list(DCA_PROFILES)
last_phase_idx = len(PHASES) - 1

for case_idx in range(1, N_CASES + 1):
    case_id = f"CASE_{case_idx:05d}"
    cust_id = f"CUST_{random.randint(1, 900):04d}"

    customer_key = random.choices(customer_keys, CUSTOMER_WEIGHTS)[0]
    dca_key = random.choices(dca_keys, DCA_WEIGHTS)[0]

    customer = CUSTOMER_PROFILES[customer_key]
    dca = DCA_PROFILES[dca_key]

    n_emails = random.randint(MIN_EMAILS, MAX_EMAILS)
    # Each thread starts 0-10 days after START_DATE.
    base_time = START_DATE + timedelta(days=random.randint(0, 10))

    for i in range(n_emails):
        # One email per day, sent 1-6 hours after the thread's base time of day.
        timestamp = base_time + timedelta(days=i, hours=random.randint(1, 6))

        # The phase is a pure function of the email's position, so every phase
        # before the last spans exactly EMAILS_PER_PHASE emails. Reading the
        # phase before advancing a counter would give the first phase one
        # extra email and shift every later boundary by one.
        phase_idx = min(i // EMAILS_PER_PHASE, last_phase_idx)
        phase = PHASES[phase_idx]

        from_dca = phase in DCA_PHASES
        sender = "DCA" if from_dca else "Customer"
        speaker = dca if from_dca else customer

        # NOTE(review): the "ghost" customer profile has a single empty
        # template, so its customer-sent rows carry an empty email_body.
        body = random.choice(speaker["templates"])
        tone = speaker["tone"]
        intent = speaker["intent"]

        # Only the sender's label columns are filled; the other side's
        # columns are left as None for this row.
        rows.append({
            "email_id": f"EML_{case_id}_{i+1}",
            "case_id": case_id,
            "cust_id": cust_id,
            "sender_type": sender,
            "channel": "Email",
            "email_timestamp": timestamp,
            "conversation_phase": phase,
            "email_subject": random.choice(SUBJECTS),
            "email_body": body,
            "customer_behavior_type": None if from_dca else customer_key,
            "customer_tone_label": None if from_dca else tone,
            "customer_intent_label": None if from_dca else intent,
            "dca_behavior_type": dca_key if from_dca else None,
            "dca_tone_label": tone if from_dca else None,
            "dca_intent_label": intent if from_dca else None
        })

# Assemble the generated rows into a frame ordered by case, then by send
# time, and persist it without the index column.
email_df = pd.DataFrame(rows)
email_df = email_df.sort_values(by=["case_id", "email_timestamp"])
email_df.to_csv("email_interactions.csv", index=False)

# Short summary, followed by a preview of the first rows as the cell output.
print("✅ Final NLP-ready email dataset generated")
print("Cases:", email_df["case_id"].nunique())
print("Emails:", email_df.shape[0])
email_df.head(10)


✅ Final NLP-ready email dataset generated
Cases: 250
Emails: 5419


Unnamed: 0,email_id,case_id,cust_id,sender_type,channel,email_timestamp,conversation_phase,email_subject,email_body,customer_behavior_type,customer_tone_label,customer_intent_label,dca_behavior_type,dca_tone_label,dca_intent_label
0,EML_CASE_00001_1,CASE_00001,CUST_0655,DCA,Email,2025-11-14 07:35:27.157793,initial_contact,Outstanding Invoice Follow-up,Immediate payment is required to avoid escalat...,,,,aggressive,aggressive,escalation
1,EML_CASE_00001_2,CASE_00001,CUST_0655,DCA,Email,2025-11-15 10:35:27.157793,initial_contact,Outstanding Invoice Follow-up,Failure to pay will result in legal action.,,,,aggressive,aggressive,escalation
2,EML_CASE_00001_3,CASE_00001,CUST_0655,DCA,Email,2025-11-16 06:35:27.157793,initial_contact,Payment Reminder,Immediate payment is required to avoid escalat...,,,,aggressive,aggressive,escalation
3,EML_CASE_00001_4,CASE_00001,CUST_0655,DCA,Email,2025-11-17 07:35:27.157793,initial_contact,Payment Reminder,Immediate payment is required to avoid escalat...,,,,aggressive,aggressive,escalation
4,EML_CASE_00001_5,CASE_00001,CUST_0655,DCA,Email,2025-11-18 11:35:27.157793,initial_contact,Payment Reminder,Failure to pay will result in legal action.,,,,aggressive,aggressive,escalation
5,EML_CASE_00001_6,CASE_00001,CUST_0655,DCA,Email,2025-11-19 09:35:27.157793,follow_up,Outstanding Invoice Follow-up,Failure to pay will result in legal action.,,,,aggressive,aggressive,escalation
6,EML_CASE_00001_7,CASE_00001,CUST_0655,DCA,Email,2025-11-20 07:35:27.157793,follow_up,Immediate Attention Required,Failure to pay will result in legal action.,,,,aggressive,aggressive,escalation
7,EML_CASE_00001_8,CASE_00001,CUST_0655,DCA,Email,2025-11-21 08:35:27.157793,follow_up,Payment Reminder,Immediate payment is required to avoid escalat...,,,,aggressive,aggressive,escalation
8,EML_CASE_00001_9,CASE_00001,CUST_0655,DCA,Email,2025-11-22 08:35:27.157793,follow_up,Outstanding Invoice Follow-up,Immediate payment is required to avoid escalat...,,,,aggressive,aggressive,escalation
9,EML_CASE_00001_10,CASE_00001,CUST_0655,Customer,Email,2025-11-23 09:35:27.157793,customer_response,Immediate Attention Required,I acknowledge the outstanding balance.,cooperative,cooperative,willing_to_pay,,,


In [None]:
# Colab-only step: `google.colab` is available inside Google Colab runtimes.
# Hands the CSV written by the email-generation cell to the browser so it can
# be saved locally (presumably a no-op elsewhere would need a different
# export path — this import fails outside Colab).
from google.colab import files

files.download("email_interactions.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Dataset for the Call Transcripts

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# =====================================================
# CONFIG
# =====================================================
# Dataset size: how many collection cases to simulate, and the inclusive
# range for the number of calls generated per case.
N_CASES = 250
MIN_CALLS, MAX_CALLS = 6, 12

# Earliest possible first call: 60 days before the moment this cell runs.
# Timestamps therefore follow the wall clock and differ between runs.
START_DATE = datetime.now() - timedelta(days=60)

# Seed both RNG libraries with the same fixed value so the random draws
# repeat from run to run.
random.seed(42)
np.random.seed(42)

# =====================================================
# CUSTOMER PROFILES (SAME LOGIC AS EMAIL)
# =====================================================
# One row per customer archetype:
#   (profile key, sampling weight, tone label, intent label, call summaries)
# Keeping the weight beside its profile keeps the two derived structures
# below in the same order.
_CUSTOMER_TABLE = [
    ("cooperative", 0.30, "cooperative", "willing_to_pay", [
        "Customer acknowledged the balance and agreed to make payment.",
        "Customer confirmed they will pay 500 dollars by Friday.",
        "Customer was polite and confirmed payment intent.",
    ]),
    ("distressed", 0.25, "frustrated", "partial_pay", [
        "Customer explained job loss and financial hardship.",
        "Customer stated medical expenses are preventing full payment.",
        "Customer can only make a partial payment this month.",
    ]),
    ("delayer", 0.20, "neutral", "delay", [
        "Customer asked for more time and did not commit.",
        "Customer said they will call back later.",
        "Customer avoided giving a payment date.",
    ]),
    ("hostile", 0.15, "hostile", "threaten", [
        "Customer was rude and threatened legal action.",
        "Customer accused the agent of harassment.",
        "Customer refused to pay and raised their voice.",
    ]),
    ("ghost", 0.10, "silent", "no_response", [
        "Call went unanswered. Voicemail left.",
        "No response from customer.",
        "Call disconnected without conversation.",
    ]),
]

# Profile key -> {"tone", "intent", "templates"}, in table order.
CUSTOMER_PROFILES = {
    key: {"tone": tone, "intent": intent, "templates": templates}
    for key, _weight, tone, intent, templates in _CUSTOMER_TABLE
}

# Sampling weights, positionally aligned with the keys of CUSTOMER_PROFILES.
CUSTOMER_WEIGHTS = [row[1] for row in _CUSTOMER_TABLE]

# =====================================================
# DCA PROFILES
# =====================================================
# One row per collection-agency archetype:
#   (profile key, sampling weight, tone label, intent label, call summaries)
_DCA_TABLE = [
    ("professional", 0.40, "professional", "reminder", [
        "Agent politely reminded customer about overdue invoice.",
        "Agent requested confirmation of payment date.",
    ]),
    ("empathetic", 0.25, "empathetic", "negotiation", [
        "Agent acknowledged hardship and discussed payment plan.",
        "Agent offered flexibility and support.",
    ]),
    ("aggressive", 0.20, "aggressive", "escalation", [
        "Agent warned of escalation and legal action.",
        "Agent demanded immediate payment.",
    ]),
    ("inefficient", 0.15, "repetitive", "reminder", [
        "Agent repeated the same reminder.",
        "Agent provided no new information.",
    ]),
]

# Profile key -> {"tone", "intent", "templates"}, in table order.
DCA_PROFILES = {
    key: {"tone": tone, "intent": intent, "templates": templates}
    for key, _weight, tone, intent, templates in _DCA_TABLE
}

# Sampling weights, positionally aligned with the keys of DCA_PROFILES.
DCA_WEIGHTS = [row[1] for row in _DCA_TABLE]

# =====================================================
# CONVERSATION PHASES
# =====================================================
# Ordered stages every simulated call history walks through.
PHASES = [
    "initial_contact",
    "follow_up",
    "customer_response",
    "ptp_or_pushback",
    "resolution_or_escalation"
]

# Number of consecutive calls spent in each phase before moving to the next.
# The final phase absorbs every remaining call of the case.
CALLS_PER_PHASE = 2

# =====================================================
# GENERATION
# =====================================================
# Builds one dict per call in `rows`. Each case gets a customer id, one
# customer profile and one agency profile (fixed for the whole case), and
# between MIN_CALLS and MAX_CALLS calls spaced one day apart.
rows = []

# Key lists are loop-invariant, so build them once instead of once per case.
customer_keys = list(CUSTOMER_PROFILES)
dca_keys = list(DCA_PROFILES)
last_phase_idx = len(PHASES) - 1

for case_idx in range(1, N_CASES + 1):
    case_id = f"CASE_{case_idx:05d}"
    # NOTE(review): cust_id and both profile keys come from the shared
    # `random` stream, so their values for a given case_id depend on how many
    # draws earlier cases consumed — confirm before joining this output to
    # another generated dataset on case_id.
    cust_id = f"CUST_{random.randint(1, 900):04d}"

    customer_key = random.choices(customer_keys, CUSTOMER_WEIGHTS)[0]
    dca_key = random.choices(dca_keys, DCA_WEIGHTS)[0]

    customer = CUSTOMER_PROFILES[customer_key]
    dca = DCA_PROFILES[dca_key]

    n_calls = random.randint(MIN_CALLS, MAX_CALLS)
    # Each case's first call falls 0-10 days after START_DATE.
    base_time = START_DATE + timedelta(days=random.randint(0, 10))

    for i in range(n_calls):
        # One call per day, placed 1-6 hours after the case's base time of day.
        timestamp = base_time + timedelta(days=i, hours=random.randint(1, 6))

        # The phase is a pure function of the call's position, so every phase
        # before the last spans exactly CALLS_PER_PHASE calls. Reading the
        # phase before advancing a counter would give the first phase one
        # extra call and shift every later boundary by one.
        phase_idx = min(i // CALLS_PER_PHASE, last_phase_idx)
        phase = PHASES[phase_idx]

        # The agency initiates roughly 70% of calls, the customer the rest,
        # independently of the conversation phase.
        caller = random.choices(
            ["DCA", "Customer"],
            weights=[0.7, 0.3]
        )[0]
        from_dca = caller == "DCA"
        speaker = dca if from_dca else customer

        transcript = random.choice(speaker["templates"])
        tone = speaker["tone"]
        intent = speaker["intent"]

        # Only the caller's label columns are filled; the other side's
        # columns are left as None for this row.
        rows.append({
            "call_id": f"CALL_{case_id}_{i+1}",
            "case_id": case_id,
            "cust_id": cust_id,
            "caller_type": caller,
            "channel": "Call",
            "call_timestamp": timestamp,
            "call_duration_sec": random.randint(60, 900),
            "conversation_phase": phase,
            "call_transcript": transcript,
            "customer_behavior_type": None if from_dca else customer_key,
            "customer_tone_label": None if from_dca else tone,
            "customer_intent_label": None if from_dca else intent,
            "dca_behavior_type": dca_key if from_dca else None,
            "dca_tone_label": tone if from_dca else None,
            "dca_intent_label": intent if from_dca else None
        })

# Assemble the generated rows into a frame ordered by case, then by call
# time, and persist it without the index column.
call_df = pd.DataFrame(rows)
call_df = call_df.sort_values(by=["case_id", "call_timestamp"])
call_df.to_csv("call_transcripts.csv", index=False)

# Short summary, followed by a preview of the first rows as the cell output.
print("✅ Call transcript dataset generated")
print("Cases:", call_df["case_id"].nunique())
print("Calls:", call_df.shape[0])
call_df.head(10)


✅ Call transcript dataset generated
Cases: 250
Calls: 2235


Unnamed: 0,call_id,case_id,cust_id,caller_type,channel,call_timestamp,call_duration_sec,conversation_phase,call_transcript,customer_behavior_type,customer_tone_label,customer_intent_label,dca_behavior_type,dca_tone_label,dca_intent_label
0,CALL_CASE_00001_1,CASE_00001,CUST_0655,Customer,Call,2025-11-14 07:49:28.614228,818,initial_contact,Customer was polite and confirmed payment intent.,cooperative,cooperative,willing_to_pay,,,
1,CALL_CASE_00001_2,CASE_00001,CUST_0655,DCA,Call,2025-11-15 10:49:28.614228,92,initial_contact,Agent demanded immediate payment.,,,,aggressive,aggressive,escalation
2,CALL_CASE_00001_3,CASE_00001,CUST_0655,DCA,Call,2025-11-16 06:49:28.614228,577,initial_contact,Agent warned of escalation and legal action.,,,,aggressive,aggressive,escalation
3,CALL_CASE_00001_4,CASE_00001,CUST_0655,DCA,Call,2025-11-17 10:49:28.614228,793,follow_up,Agent warned of escalation and legal action.,,,,aggressive,aggressive,escalation
4,CALL_CASE_00001_5,CASE_00001,CUST_0655,Customer,Call,2025-11-18 11:49:28.614228,285,follow_up,Customer confirmed they will pay 500 dollars b...,cooperative,cooperative,willing_to_pay,,,
5,CALL_CASE_00001_6,CASE_00001,CUST_0655,DCA,Call,2025-11-19 09:49:28.614228,837,customer_response,Agent warned of escalation and legal action.,,,,aggressive,aggressive,escalation
6,CALL_CASE_00001_7,CASE_00001,CUST_0655,DCA,Call,2025-11-20 07:49:28.614228,344,customer_response,Agent demanded immediate payment.,,,,aggressive,aggressive,escalation
7,CALL_CASE_00002_1,CASE_00002,CUST_0160,DCA,Call,2025-11-12 09:49:28.614228,678,initial_contact,Agent demanded immediate payment.,,,,aggressive,aggressive,escalation
8,CALL_CASE_00002_2,CASE_00002,CUST_0160,Customer,Call,2025-11-13 08:49:28.614228,530,initial_contact,Customer was polite and confirmed payment intent.,cooperative,cooperative,willing_to_pay,,,
9,CALL_CASE_00002_3,CASE_00002,CUST_0160,DCA,Call,2025-11-14 10:49:28.614228,140,initial_contact,Agent demanded immediate payment.,,,,aggressive,aggressive,escalation
