In [1]:
import pandas as pd
from graphviz import Digraph, Graph
from collections import defaultdict

In [2]:
import os
# Show the working directory: the CSV locations used by this notebook are
# written relative to it.
current_path = os.getcwd()
print(f"Current path: {current_path}")

Current path: c:\Users\029at\Desktop\dhmy\Big Data\Seminar\Graphviz


In [3]:
# Locations of the CSV inputs, expressed relative to the working directory.
data_dir = "../Data Kaggle"
leads_file = f"{data_dir}/leads_basic_details.csv"
managers_file = f"{data_dir}/sales_managers_assigned_leads_details.csv"
no_interest_file = f"{data_dir}/leads_reasons_for_no_interest.csv"
interactions_file = f"{data_dir}/leads_interaction_details.csv"
demo_file = f"{data_dir}/leads_demo_watched_details.csv"

In [4]:
# Read each CSV input into its own DataFrame (pandas defaults throughout).
# The files are read in the same order as the names on the left-hand side.
(
    leads_df,
    managers_df,
    no_interest_df,
    interactions_df,
    demo_df,
) = map(
    pd.read_csv,
    (leads_file, managers_file, no_interest_file, interactions_file, demo_file),
)

In [5]:
# Merge the two spellings of the same demo-stage reason so they are counted
# as one value. Re-running this cell is harmless: once replaced, nothing matches.
no_interest_df["reasons_for_not_interested_in_demo"] = (
    no_interest_df["reasons_for_not_interested_in_demo"]
    .replace({"Can't afford": "Cannot afford"})
)

In [6]:
def _format_share(part, whole):
    """Format ``part / whole`` as a percentage with one decimal place.

    Returns ``'n/a'`` when ``whole`` is zero so that an empty table or an
    empty funnel stage does not raise ``ZeroDivisionError`` while the chart
    labels are being built.
    """
    if not whole:
        return 'n/a'
    return f'{part / whole:.1%}'


def _safe_node_id(name):
    """Return ``name`` as a node identifier that is safe to use in edges.

    graphviz's ``edge()`` splits tail/head names on ``:`` into
    node/port/compass, so an identifier built from free text that contains a
    colon would otherwise attach the edge to a different node than the one
    created with ``node()``.
    """
    return str(name).replace(':', '_')


def _collect_reason_categories(no_interest_df):
    """Map a display name for every reason column to ``{reason: count}``.

    The first column of ``no_interest_df`` is skipped; every remaining column
    is treated as one category whose values are tallied with
    ``value_counts()``.
    """
    reason_categories = {}
    for col in no_interest_df.columns[1:]:
        category = col.replace('reasons_for_not_interested_in_', '')
        category = category.replace('reasons_for_not_interested_to_', '')
        category = category.replace('_', ' ').title()
        reason_categories[category] = no_interest_df[col].value_counts().to_dict()
    return reason_categories


def generate_dynamic_flow_chart(leads_df, demo_df, interactions_df, no_interest_df):
    """Build a Graphviz flow chart of the customer-acquisition funnel.

    The chart has three clusters: the main stages (Lead, Awareness,
    Consideration, Conversion) annotated with counts and rates, the reasons
    for no interest grouped per category, and a call-interaction sequence.
    Dropout nodes hang off the first three stages.

    Args:
        leads_df: DataFrame of leads; must contain ``lead_gen_source``.
        demo_df: DataFrame of watched demos; must contain ``language``.
        interactions_df: DataFrame of interactions; must contain
            ``lead_stage`` and ``call_reason``.
        no_interest_df: DataFrame whose first column is skipped and whose
            remaining columns each hold the reasons of one category.

    Returns:
        The assembled ``Digraph``. Nothing is rendered or written here; the
        caller decides when and where to call ``render``.

    Percentages whose denominator is zero are shown as ``n/a`` instead of
    raising ``ZeroDivisionError``.
    """
    # Headline numbers.
    total_leads = len(leads_df)
    demo_watched = len(demo_df)
    source_count = leads_df["lead_gen_source"].nunique()
    language_counts = demo_df["language"].value_counts().to_dict()

    # NOTE(review): the three figures below count interaction ROWS per stage,
    # not distinct leads. If one lead can have several rows in a stage, the
    # numbers labelled "leads" are overstated and the dropout figures can even
    # become negative -- confirm against the data before relying on them.
    stage_column = interactions_df['lead_stage']
    lead_to_awareness = int((stage_column == 'awareness').sum())
    awareness_to_consideration = int((stage_column == 'consideration').sum())
    converted = int((stage_column == 'conversion').sum())
    consideration_to_conversion = converted

    # Dropouts are the difference between consecutive stage counts.
    dropout_after_lead = total_leads - lead_to_awareness
    dropout_after_awareness = lead_to_awareness - awareness_to_consideration
    dropout_after_consideration = awareness_to_consideration - consideration_to_conversion

    reason_categories = _collect_reason_categories(no_interest_df)

    # Create the flow chart
    flow = Digraph('Customer_Acquisition_Flow', filename='dynamic_customer_acquisition_flow.gv')
    flow.attr(rankdir='TB', size='16,12', compound='true')  # Increased size
    flow.attr('node', shape='box', style='rounded', fontname='Arial', fontsize='10')
    flow.attr('edge', fontname='Arial', fontsize='9')

    # Main stages with actual counts
    with flow.subgraph(name='cluster_main_stages') as c:
        c.attr(label='Customer Acquisition Key Stages (with actual counts)', style='filled',
               color='lightgrey', fontsize='12')

        c.node('Lead', f'Lead\nTotal: {total_leads}\nSources: {source_count}')
        c.node('Awareness', f'Awareness\nDemo Watched: {demo_watched}\n({language_counts})')
        c.node('Consideration', f'Consideration\nFollow-ups: {awareness_to_consideration}')
        c.node('Conversion',
               f'Conversion\nSuccessful: {converted}\nRate: {_format_share(converted, total_leads)}')

        # Connect main stages with transition counts
        c.edge('Lead', 'Awareness',
               label=f'{lead_to_awareness} ({_format_share(lead_to_awareness, total_leads)})')
        c.edge('Awareness', 'Consideration',
               label=f'{awareness_to_consideration} ({_format_share(awareness_to_consideration, lead_to_awareness)})')
        c.edge('Consideration', 'Conversion',
               label=f'{consideration_to_conversion} ({_format_share(consideration_to_conversion, awareness_to_consideration)})')

    # Dropout points with counts
    flow.node('Dropout1',
              f'Dropout\n{dropout_after_lead} leads\n({_format_share(dropout_after_lead, total_leads)})',
              shape='diamond', color='red')
    flow.node('Dropout2',
              f'Dropout\n{dropout_after_awareness} leads\n({_format_share(dropout_after_awareness, lead_to_awareness)})',
              shape='diamond', color='red')
    flow.node('Dropout3',
              f'Dropout\n{dropout_after_consideration} leads\n({_format_share(dropout_after_consideration, awareness_to_consideration)})',
              shape='diamond', color='red')

    flow.edge('Lead', 'Dropout1', style='dashed')
    flow.edge('Awareness', 'Dropout2', style='dashed')
    flow.edge('Consideration', 'Dropout3', style='dashed')

    # Reasons section - one header node per category with its reasons below it
    with flow.subgraph(name='cluster_reasons') as c:
        c.attr(label='Detailed Reasons for No Interest', style='filled',
               color='lightyellow', fontsize='12', rank='same')

        # Only categories that actually get a header node are chained later;
        # chaining a skipped category would make graphviz create a stray,
        # default-labelled node for it.
        header_ids = []
        for category, reasons in reason_categories.items():
            if not reasons:
                continue

            header_id = _safe_node_id(f'{category}_header')
            header_ids.append(header_id)
            c.node(header_id,
                   f"<<B>{category}</B>>",
                   shape='plaintext', fontsize='11')

            for reason, count in reasons.items():
                reason_text = f"{reason}: {count} leads"
                node_id = _safe_node_id(f"{category}_{reason}")
                c.node(node_id, reason_text, shape='box', style='filled',
                       color='lightcoral', fontsize='9')

                # Invisible edge: only used to place the reason under its header
                c.edge(header_id, node_id, style='invis')

        # Arrange categories horizontally with an invisible chain of headers
        if len(header_ids) > 1:
            c.attr(rank='same')
            for left_header, right_header in zip(header_ids, header_ids[1:]):
                c.edge(left_header, right_header, style='invis')

    # Call interaction patterns
    with flow.subgraph(name='cluster_call_flow') as c:
        c.attr(label='Common Call Interaction Patterns', style='filled', color='lightblue', fontsize='12')

        # Rows: lead_stage, columns: call_reason, values: number of rows
        call_reasons = interactions_df.groupby(['lead_stage', 'call_reason']).size().unstack().fillna(0)

        def get_call_count(stage, reason):
            """Return the count for (stage, reason), or 0 if that pair is absent."""
            try:
                return int(call_reasons.loc[stage, reason])
            except (KeyError, AttributeError):
                return 0

        intro_calls = get_call_count('lead', 'lead_introduction')
        demo_scheduled_calls = get_call_count('lead', 'demo_scheduled')
        followup_calls = get_call_count('awareness', 'after_demo_followup')
        converted_calls = get_call_count('conversion', 'successful_conversion')

        c.node('Start', 'Start Call')
        c.node('Intro', f"Introduction\n{intro_calls} calls")
        c.node('Demo', f"Demo Scheduled\n{demo_scheduled_calls} calls")
        c.node('Followup', f"Follow-up\n{followup_calls} calls")
        c.node('ConversionCall', f"Conversion Call\n{converted_calls} calls")

        c.edges([
            ('Start', 'Intro'),
            ('Intro', 'Demo'),
            ('Demo', 'Followup'),
            ('Followup', 'ConversionCall')
        ])

    # Connect the call flow to main stages; constraint='false' keeps these
    # edges from influencing the ranking of the main stages.
    flow.edge('Lead', 'Start', lhead='cluster_call_flow', constraint='false')
    flow.edge('ConversionCall', 'Conversion', ltail='cluster_call_flow', constraint='false')

    return flow

# Build the acquisition flow chart from the loaded frames, then render it to
# PDF and open it in the default viewer.
# NOTE(review): the output location is an absolute, machine-specific path, and
# it is not under the working directory printed earlier in this notebook --
# consider a path relative to the notebook (e.g. a local "result" folder).
dynamic_flow = generate_dynamic_flow_chart(
    leads_df=leads_df,
    demo_df=demo_df,
    interactions_df=interactions_df,
    no_interest_df=no_interest_df,
)
dynamic_flow.render(
    filename='C:/Users/029at/Desktop/Big Data/Seminar/Graphviz/result/ca_flow_ver_N5',
    format='pdf',
    view=True,
)

'C:\\Users\\029at\\Desktop\\Big Data\\Seminar\\Graphviz\\result\\ca_flow_ver_N5.pdf'

In [8]:
def create_filtered_sales_hierarchy(
    selected_snr_sm=None,
    managers=None,
    output_dir="C:/Users/029at/Desktop/Big Data/Seminar/Graphviz/result",
    view=True,
):
    """Render the senior manager -> junior manager -> lead hierarchy as a PNG.

    Args:
        selected_snr_sm: Senior sales manager ID to restrict the graph to.
            Any falsy value (the default ``None``) draws every manager.
            An ID that matches no rows produces an empty graph.
        managers: DataFrame with ``snr_sm_id``, ``jnr_sm_id`` and ``lead_id``
            columns. Defaults to the notebook-level ``managers_df``.
        output_dir: Directory the files are written to. The default is the
            location this notebook has always used.
            NOTE(review): that default is an absolute, machine-specific path;
            pass a directory of your own when running elsewhere.
        view: Whether to open the rendered image in the default viewer.

    Returns:
        The base path handed to ``render`` (``<output_dir>/sales_hierarchy_<id>``
        or ``.../sales_hierarchy_all``), without a file extension.
    """
    source_df = managers_df if managers is None else managers

    hierarchy = Digraph("FilteredSalesHierarchy")
    # NOTE(review): Graphviz documents rankdir as a dot-only attribute, so it
    # probably has no effect with the sfdp engine selected below. It is kept
    # so the generated DOT source stays unchanged.
    hierarchy.attr(rankdir="TB")

    # If a specific Senior Sales Manager is selected, filter the data
    if selected_snr_sm:
        filtered_managers_df = source_df[source_df["snr_sm_id"] == selected_snr_sm]
    else:
        filtered_managers_df = source_df  # Show all data if no filter

    # Extract filtered relationships. unique() is used for both mappings so a
    # pair that appears in several rows is drawn as one edge, not as
    # parallel duplicates.
    senior_managers = filtered_managers_df["snr_sm_id"].unique()
    junior_to_leads = filtered_managers_df.groupby("jnr_sm_id")["lead_id"].unique().to_dict()
    senior_to_junior = filtered_managers_df.groupby("snr_sm_id")["jnr_sm_id"].unique().to_dict()

    # Add Senior Managers
    for sm in senior_managers:
        hierarchy.node(sm, shape="box", style="filled", fillcolor="purple", fontcolor="white")

    # Add Junior Managers and connect them
    for sm, juniors in senior_to_junior.items():
        for jm in juniors:
            hierarchy.node(jm, shape="box", style="filled", fillcolor="lightblue")
            hierarchy.edge(sm, jm)

    # Add Leads and connect them
    for jm, leads in junior_to_leads.items():
        for lead in leads:
            hierarchy.node(lead, shape="ellipse", style="filled", fillcolor="pink")
            hierarchy.edge(jm, lead)

    # Save and render
    filename = f"{output_dir}/sales_hierarchy_{selected_snr_sm if selected_snr_sm else 'all'}"
    hierarchy.engine = "sfdp"  # Alternative: "neato", "fdp", "twopi"
    hierarchy.render(filename, format="png", view=view)
    return filename

# Example: draw the hierarchy of a single Senior Sales Manager only.
# Call with no argument to draw every manager instead.
create_filtered_sales_hierarchy(selected_snr_sm="SNR501MG")


6989.57s - Error inserting pydevd breaks.


Traceback (most recent call last):
  File "c:\Users\029at\AppData\Local\Programs\Python\Python310\lib\site-packages\debugpy\_vendored\pydevd\_pydevd_frame_eval\pydevd_modify_bytecode.py", line 328, in insert_pydevd_breaks
    for new_instruction in get_instructions_to_add(
  File "c:\Users\029at\AppData\Local\Programs\Python\Python310\lib\site-packages\debugpy\_vendored\pydevd\_pydevd_frame_eval\pydevd_modify_bytecode.py", line 102, in get_instructions_to_add
    Instr("LOAD_CONST", _pydev_stop_at_break, lineno=stop_at_line - 1),
  File "c:\Users\029at\AppData\Local\Programs\Python\Python310\lib\site-packages\debugpy\_vendored\pydevd\_pydevd_frame_eval\vendored\bytecode\instr.py", line 171, in __init__
    self._set(name, arg, lineno)
  File "c:\Users\029at\AppData\Local\Programs\Python\Python310\lib\site-packages\debugpy\_vendored\pydevd\_pydevd_frame_eval\vendored\bytecode\instr.py", line 239, in _set
    _check_lineno(lineno)
  File "c:\Users\029at\AppData\Local\Programs\Python\Pyth

KeyboardInterrupt: 