In [14]:
import pandas as pd
from pandas import DataFrame
import gspread
from google.oauth2.service_account import Credentials
from google.oauth2 import service_account
from pandas_gbq import to_gbq

## Important Functions
For production use only (example)

In [2]:
def get_google_sheet_as_df(sheet_id: str, sheet_name: str, creds_path: str) -> pd.DataFrame:
    """
    Retrieve data from a Google Sheet and return it as a Pandas DataFrame.

    :param sheet_id: The Google Sheet ID (found in the URL after /d/ and before /edit)
    :param sheet_name: The name of the sheet (tab) inside the document
    :param creds_path: Path to your service account credentials JSON file
    :return: Pandas DataFrame with the sheet data
    """
    scopes = ["https://www.googleapis.com/auth/spreadsheets.readonly"]
    credentials = Credentials.from_service_account_file(creds_path, scopes=scopes)
    client = gspread.authorize(credentials)

    sheet = client.open_by_key(sheet_id).worksheet(sheet_name)
    data = sheet.get_all_records()

    return pd.DataFrame(data)

def load_to_bigquery(df: DataFrame, table_name: str, credentials_path: str, if_exists: str = 'replace'):
    """
    Uploads a DataFrame to Google BigQuery using a service account credential file.

    Args:
        df (DataFrame): The pandas DataFrame to upload.
        table_name (str): Table name in the format 'project_id.dataset.table'.
        credentials_path (str): Path to the service account JSON credentials file.
        if_exists (str): What to do if the table exists: 'fail', 'replace', or 'append'. Default is 'replace'.

    Raises:
        ValueError: If the DataFrame is empty.
        Exception: If the upload fails.
    """
    if df.empty:
        raise ValueError("The DataFrame is empty. Nothing will be uploaded to BigQuery.")

    try:
        credentials = service_account.Credentials.from_service_account_file(credentials_path)
        to_gbq(df, table_name, credentials=credentials, if_exists=if_exists)
        print(f"✅ Data successfully uploaded to {table_name}.")
    except Exception as e:
        print(f"❌ Failed to upload data to BigQuery: {e}")


## Extraction

In [15]:
# In a production environment, it should use the function get_google_sheet_as_df using a google service account to extract this data
deals_meta_df = pd.read_csv('data/deals_meta.csv')
deals_snapshot_df = pd.read_csv('data/deals_snapshot.csv')
owners_df = pd.read_csv('data/owners.csv')
targets_df = pd.read_csv('data/targets.csv')

In [16]:
print(f"Log type: info\n\nDeals Meta\n{deals_meta_df.head()}\nDeals Snapshot\n{deals_snapshot_df.head()}\nOwners\n{owners_df.head()}\nTargets\n{targets_df.head()}")

Log type: info

Deals Meta
   deal_id owner_id created_date  close_date     deal_stage forecast_category  \
0  deal_31  owner_1   2024-12-21  2025-06-02    Closed Lost            Closed   
1  deal_34  owner_1   2024-05-29  2024-11-11    Prospecting          Pipeline   
2  deal_65  owner_1   2024-08-26  2025-02-04    Prospecting          Pipeline   
3  deal_78  owner_1   2024-02-21  2024-03-03     Closed Won            Closed   
4  deal_92  owner_1   2024-06-08  2024-11-24  Qualification          Pipeline   

  record_type          deal_type deal_order_type account_industry  \
0     Channel  Existing Business          Upsell             Tech   
1      Direct  Existing Business      Cross-Sell           Retail   
2      Direct  Existing Business      Cross-Sell       Healthcare   
3     Channel       New Business      New Client          Finance   
4     Channel       New Business      New Client       Healthcare   

  account_region account_size deal_source  deal_amount  
0          LAT

## Custom Open Source/Self Hosted AI Insights

In [17]:

#>>>>>>>>> Filter deals Closed Lost in Q1 <<<<<<<<<<
deals_meta_df['close_date'] = pd.to_datetime(deals_meta_df['close_date'])

# Define Q1 range (adjust year if needed)
start_q1 = "2025-01-01"
end_q1 = "2025-03-31"

lost_deals_q1 = deals_meta_df[
    (deals_meta_df['close_date'].between(start_q1, end_q1)) &
    (deals_meta_df['deal_stage'] == "Closed Lost")
]['deal_id'].unique()

print(f"Total lost deals in Q1: {len(lost_deals_q1)}")

#>>>>>>>>> Filter snapshots for those deals <<<<<<<<<<
lost_snapshots = deals_snapshot_df[deals_snapshot_df['deal_id'].isin(lost_deals_q1)].copy()

lost_snapshots['snapshot_date'] = pd.to_datetime(lost_snapshots['snapshot_date'])
lost_snapshots = lost_snapshots.sort_values(by=['deal_id', 'snapshot_date'])

# >>>>>>>>> Get first & last snapshot for each deal <<<<<<<<<<
lost_deals_df = (
    lost_snapshots.groupby('deal_id')
    .agg(
        old_stage=('stage', 'first'),
        last_stage=('stage', 'last'),
        old_forecast_category=('forecast_category', 'first'),
        last_forecast_category=('forecast_category', 'last'),
        owner_id=('owner_id', 'last'),
        amount=('amount', 'last')
    )
    .reset_index()
)

# ---- 4) Result ----
lost_deals_df


Total lost deals in Q1: 17


Unnamed: 0,deal_id,old_stage,last_stage,old_forecast_category,last_forecast_category,owner_id,amount
0,deal_114,Prospecting,Closed Lost,Pipeline,Closed,owner_11,239748
1,deal_147,Prospecting,Closed Lost,Pipeline,Closed,owner_1,30694
2,deal_271,Prospecting,Closed Lost,Pipeline,Closed,owner_7,58571
3,deal_296,Prospecting,Closed Lost,Pipeline,Closed,owner_13,54979
4,deal_299,Prospecting,Closed Lost,Pipeline,Closed,owner_8,257671
5,deal_314,Prospecting,Closed Lost,Pipeline,Closed,owner_3,31818
6,deal_401,Prospecting,Closed Lost,Pipeline,Closed,owner_9,48684
7,deal_410,Prospecting,Closed Lost,Pipeline,Closed,owner_7,27056
8,deal_47,Prospecting,Closed Lost,Pipeline,Closed,owner_5,14266
9,deal_487,Prospecting,Closed Lost,Pipeline,Closed,owner_9,59781


In [18]:
import importlib
import opensource

# Reload the opensource module to make sure latest code is used
importlib.reload(opensource)

# Sample query
query = """
Analyze the provided data and give me clear, concise insights and answer the main question.

Main question:
- We missed Q1 bookings by 18 %, what blew the forecast?

Specifically:
1. Identify which owner (sales rep) had the highest number of lost deals.
2. Determine which stage in the sales pipeline caused the most forecast prediction errors (deals marked as "Commit" or "Best Case" but ended as Lost).
3. Calculate the total lost amount (sum of amounts from Closed Lost deals).

Then, provide:
- A short textual summary explaining the key insights.
- Practical recommendations on how to reduce forecast errors and prevent future losses (e.g., process improvements, better stage qualification, or forecast adjustments).

Make the answer crisp and easy to understand for a CRO.
"""

# Run analysis asynchronously, non-blocking
opensource.analyze_async(lost_deals_df, query, do_in_the_end=opensource.sync_results_to_main_df, question="We missed Q1 bookings by 18 %, what blew the forecast?")


[97mDone analyzing!           [0m


In [19]:
ai_df = opensource.get_ai_df()
print(ai_df["answer"][0])

 Insights from the analysis of provided data:

1. The sales rep with the highest number of lost deals is owner_7, who has 2 out of 4 lost deals in Q1.

2. The stage "Prospecting" seems to cause the most forecast prediction errors as all the deals that ended as Lost started from this stage.

3. The total lost amount for Q1 is 100498 (sum of amounts from Closed Lost deals).

Short textual summary:
The Q1 bookings have missed the target by 18% due to a higher number of losses in the Prospecting stage, primarily caused by owner_7. To reduce forecast errors and prevent future losses, it is recommended to focus on improving qualification efforts during the Prospecting stage, enhance the sales process for owner_7, and potentially adjust the forecasts based on historical conversion rates from the Prospecting stage.


## Static analysis

In [7]:

def compute_stage_metrics(deals_snapshot_df: pd.DataFrame, stall_days_threshold: int = 30):
    """
    Given a full deals snapshot dataframe, adds columns:
    - 'stage_change': True when stage differs from previous for same deal
    - 'stage_start_date': first date stage started (forward filled)
    - 'days_in_stage': days spent in current stage
    - 'stalled': True if days_in_stage >= threshold
    Returns the full dataframe with these columns.
    """
    df = deals_snapshot_df.copy()
    df['snapshot_date'] = pd.to_datetime(df['snapshot_date'])
    df = df.sort_values(by=['deal_id', 'snapshot_date'])

    # Detect stage changes
    df['stage_change'] = df.groupby('deal_id')['stage'].transform(lambda x: x != x.shift())

    # Compute stage start date (ffill where stage_change is True)
    df['stage_start_date'] = df.groupby('deal_id')['snapshot_date'].transform(
        lambda dates: dates.where(df['stage_change']).ffill()
    )

    # Calculate days in current stage
    df['days_in_stage'] = (df['snapshot_date'] - df['stage_start_date']).dt.days

    # Mark stalled deals
    df['stalled'] = df['days_in_stage'] >= stall_days_threshold

    return df

def get_stalled_result_df(deals_snapshot_df: pd.DataFrame, stall_days_threshold: int = 30):
    """
    Returns a reduced dataframe with only latest snapshot per deal,
    filtered by deals whose snapshot_date is in the current year.
    Includes only selected columns plus stalled flag.
    """
    df = compute_stage_metrics(deals_snapshot_df, stall_days_threshold)

    # Get current year
    today = pd.Timestamp.today()
    current_year = today.year

    # Get latest snapshot per deal
    latest_snapshots = df.sort_values(by='snapshot_date').groupby('deal_id').tail(1)

    # Filter to only deals in current year based on snapshot_date
    filtered = latest_snapshots[
        (latest_snapshots['snapshot_date'].dt.year == current_year)
    ]

    # Select only relevant columns for AI analysis
    columns = ['deal_id', 'stage', 'days_in_stage', 'owner_id', 'forecast_category', 'amount', 'stalled']
    return filtered[columns]


# full_deals_df has all snapshots + metrics
full_deals_df = compute_stage_metrics(deals_snapshot_df, stall_days_threshold=30)

# stalled_result_df filtered by current quarter, latest snapshot per deal (for AI input)
stalled_result_df = get_stalled_result_df(deals_snapshot_df, stall_days_threshold=30)

print(len(stalled_result_df))
print(full_deals_df.head())


300
      deal_id snapshot_date        stage forecast_category  amount  \
7043  deal_10    2025-03-31  Prospecting          Pipeline   30935   
7044  deal_10    2025-04-07  Prospecting          Pipeline   30935   
7045  deal_10    2025-04-14  Prospecting          Pipeline   30935   
7046  deal_10    2025-04-21  Prospecting          Pipeline   30935   
7047  deal_10    2025-04-28  Prospecting          Pipeline   30935   

      close_date owner_id  stage_change stage_start_date  days_in_stage  \
7043  2025-07-22  owner_8          True       2025-03-31              0   
7044  2025-07-22  owner_8         False       2025-03-31              7   
7045  2025-07-22  owner_8         False       2025-03-31             14   
7046  2025-07-22  owner_8         False       2025-03-31             21   
7047  2025-07-22  owner_8         False       2025-03-31             28   

      stalled  
7043    False  
7044    False  
7045    False  
7046    False  
7047    False  


## Custom Open Source/Self Hosted AI Insights

In [8]:
query = """
Analyze the provided data and give me clear, concise insights and answer the main question.

Main question:
- Where are deals stalling, and why?

Specifically:
1. Identify which owner has the highest number of stalled deals.
2. Determine which stage in the sales pipeline caused the most stalled deals.
3. Give me a list of 5 most important stalled deals based on how much time is stalled and the amount.

Then, provide:
- A short textual summary explaining the key insights.
- Practical recommendations on how to reduce forecast errors and prevent future losses (e.g., process improvements, better stage qualification, or forecast adjustments).

Make the answer crisp and easy to understand for a CRO.
"""
opensource.analyze_async(stalled_result_df, query, do_in_the_end=opensource.sync_results_to_main_df, question="Where are deals stalling, and why?")




[97mDone analyzing!           [0m


## Static Analysis

In [9]:
import pandas as pd
from pandas.tseries.offsets import QuarterEnd

def build_pacing_table(owners_df, full_deals_df, targets_df, stall_days_threshold=30):
    today = pd.Timestamp.today()
    next_quarter = (today.month - 1) // 3 + 2
    next_year = today.year + (1 if next_quarter == 5 else 0)
    if next_quarter == 5:
        next_quarter = 1

    # --- 1) Active owners ---
    owners_active = owners_df[
        (owners_df['role_end_date'].isna()) |
        (pd.to_datetime(owners_df['role_end_date'], errors='coerce') >= pd.Timestamp(f"{next_year}-01-01"))
    ]

    # --- 2) Prepare deals ---
    df = full_deals_df.copy()
    df['snapshot_date'] = pd.to_datetime(df['snapshot_date'], errors='coerce')
    df['stage'] = df['stage'].replace({'Discovery': 'Prospecting'})
    current_year = today.year
    deals_this_year = df[df['snapshot_date'].dt.year == current_year]

    # --- 3) Owner completion rate ---
    total_deals = df.groupby("owner_id")["deal_id"].nunique()
    closed_deals = df[df['stage'].str.contains("Closed Won", case=False, na=False)] \
                      .groupby("owner_id")["deal_id"].nunique()
    completion_rate = (closed_deals / total_deals).fillna(0).clip(0, 1)

    # --- 4) STATIC stage & forecast weights ---
    stage_prob = {
        'Proposal': 0.7,
        'Negotiation': 0.8,
        'Contract Sent': 0.9,
        'Qualification': 0.4,
        'Prospecting': 0.3,
        'Closed Won': 1.0,
        'Closed Lost': 0.0
    }
    forecast_weight = {
        'Commit': 0.9,
        'Best Case': 0.7,
        'Pipeline': 0.3,
        'Closed': 1.0
    }

    # --- 5) Filter valid open deals ---
    open_deals = deals_this_year[
        ~deals_this_year['stage'].str.contains("Closed", case=False, na=False)
    ]
    filtered_deals = open_deals[
        ~((open_deals['stalled'] == True) & (open_deals['days_in_stage'] > stall_days_threshold))
    ]

    # Latest snapshot per deal
    latest_deals = (
        filtered_deals
        .sort_values('snapshot_date')
        .groupby('deal_id')
        .tail(1)
        .set_index('deal_id')
    )

    # --- 6) Apply static probabilities ---
    latest_deals['stage_prob'] = latest_deals['stage'].map(stage_prob).fillna(0.2)
    latest_deals['forecast_weight'] = latest_deals['forecast_category'].map(forecast_weight).fillna(0.1)
    latest_deals['completion_rate'] = latest_deals['owner_id'].map(completion_rate).fillna(0)

    latest_deals['adjusted_prob'] = (
        latest_deals['stage_prob'] * latest_deals['forecast_weight'] * latest_deals['completion_rate']
    )
    latest_deals['estimated_value'] = latest_deals['amount'] * latest_deals['adjusted_prob']

    # --- 7) Aggregate by owner ---
    expected_pipeline = latest_deals.groupby('owner_id')['estimated_value'].sum().reset_index()
    open_deals_count = latest_deals.groupby('owner_id').size().reset_index(name='open_deals_count')

    # Targets for the next quarter
    target_quarter_str = f"{next_year} Q{next_quarter}"
    targets_next_q = targets_df[targets_df['quarter'] == target_quarter_str]

    # --- 8) Merge everything ---
    pacing_df = (
        owners_active
        .merge(targets_next_q[['owner_id', 'target_amount']], on='owner_id', how='left')
        .merge(expected_pipeline, on='owner_id', how='left')
        .merge(open_deals_count, on='owner_id', how='left')
    )

    pacing_df['estimated_value'] = pacing_df['estimated_value'].fillna(0)
    pacing_df['open_deals_count'] = pacing_df['open_deals_count'].fillna(0).astype(int)
    pacing_df['pacing'] = pacing_df['estimated_value'] / pacing_df['target_amount']
    pacing_df['gap'] = pacing_df['estimated_value'] - pacing_df['target_amount']

    # --- 9) Simplified classification (Yes, Probably Yes, Probably No, No) ---
    end_of_quarter = today + QuarterEnd(0)
    days_remaining_in_quarter = (end_of_quarter - today).days

    def classify_hit(row):
        if row["estimated_value"] >= row["target_amount"]:
            return "Yes"
        # Heuristic based only on pacing & deal count
        if row["pacing"] >= 0.5 and row["open_deals_count"] > 0:
            return "Probably Yes"
        if row["pacing"] < 0.5 and row["open_deals_count"] > 0:
            return "Probably No"
        return "No"

    pacing_df["will_hit_target"] = pacing_df.apply(classify_hit, axis=1)

    return pacing_df[['owner_id', 'name', 'segment', 'target_amount',
                      'estimated_value', 'pacing', 'gap',
                      'open_deals_count', 'will_hit_target']]


In [10]:


pacing_df = build_pacing_table(owners_df, full_deals_df, targets_df)
pacing_df["pacing"] = pacing_df["pacing"].round(2).astype(float)

pacing_df

Unnamed: 0,owner_id,name,segment,target_amount,estimated_value,pacing,gap,open_deals_count,will_hit_target
0,owner_1,Christopher Moore,SMB,111721,30266.732075,0.27,-81454.267925,25,Probably No
1,owner_2,Amber Jenkins,SMB,111721,79201.42898,0.71,-32519.57102,24,Probably Yes
2,owner_3,Mark Lopez,SMB,111721,47120.025455,0.42,-64600.974545,16,Probably No
3,owner_4,Kristen Mann,SMB,111721,54769.281538,0.49,-56951.718462,22,Probably No
4,owner_5,Dawn Alexander,SMB,111721,92643.665068,0.83,-19077.334932,34,Probably Yes
5,owner_6,Kelly Lin,Mid-Market,138321,109736.456923,0.79,-28584.543077,21,Probably Yes
6,owner_7,Dr. Kimberly Brennan,Mid-Market,138321,86818.062857,0.63,-51502.937143,19,Probably Yes
7,owner_8,Allen Gilmore,Mid-Market,138321,70248.582326,0.51,-68072.417674,18,Probably Yes
8,owner_9,Barbara Robinson,Mid-Market,138321,87188.4972,0.63,-51132.5028,25,Probably Yes
9,owner_10,Anthony Jackson,Enterprise,156941,31806.725385,0.2,-125134.274615,14,Probably No


## Custom Open Source/Self Hosted AI Insights

In [11]:
query = """
You will receive a pacing table generated from sales data. This table summarizes, for each sales owner, the following key metrics:

- owner_id, name, and segment identifying the sales rep and their market segment.
- target_amount: the sales target for the next quarter.
- estimated_value: the estimated pipeline value for the next quarter, calculated by weighting deal amounts based on the deal’s forecast category and stage probability.
- pacing: the ratio of estimated pipeline to target amount, showing how close the owner is to reaching their goal, needs to multiply by 100 to get the real percentage.
- gap: the difference between estimated pipeline and target.
- open_deals_count: number of currently open deals for the owner in the current year that are not stalled more than 30 days.

The estimated_value is calculated by multiplying each deal's amount by three factors:

1. A static stage probability, assigned based on the deal’s current sales stage (e.g., 'Proposal' = 0.7, 'Negotiation' = 0.8, 'Prospecting' = 0.3, 'Closed Won' = 1.0, etc.).
2. A forecast weight, bas red on the forecast category of the deal (e.g., 'Commit' = 0.9, 'Best Case' = 0.7, 'Pipeline' = 0.3, 'Closed' = 1.0).
3. An owner-specific historical completion rate, which is the ratio of closed won deals over total deals for that owner in past periods.

This approach provides a realistic projection of deal closure likelihood, adjusting for deal stage, forecast confidence, and each owner's historical performance.

Your task:

Analyze the pacing table and answer the main question and the following:

Main question: Given today’s pipe, are we on track for next quarter?

- Identify the top 5 owners with the highest risk of falling behind target (“out of track”) based on pacing, deals and gap.
- Explain how you arrived at these estimations, describing your reasoning and methodology.
- Provide a concise summary of key insights.
- Suggest practical recommendations to reduce forecast errors and improve sales outcomes, such as better deal qualification, stage management, or forecast adjustments.

Make the response clear and easy to understand for a Chief Revenue Officer (CRO).
"""

opensource.analyze_async(pacing_df, query, do_in_the_end=opensource.export_data, question="Given today’s pipe, are we on track for next quarter?")




[97mDone analyzing!           [0m
Exported


## Clean

In [12]:
full_deals_df = full_deals_df.merge(
    pacing_df[["owner_id", "name"]],
    on="owner_id",
    how="left"
)

pacing_df[["estimated_value", "gap", "target_amount"]] = pacing_df[["estimated_value", "gap", "target_amount"]].round(0).astype(float)
pacing_df["pacing"] = pacing_df["pacing"].round(2).astype(float)

pacing_df

Unnamed: 0,owner_id,name,segment,target_amount,estimated_value,pacing,gap,open_deals_count,will_hit_target
0,owner_1,Christopher Moore,SMB,111721.0,30267.0,0.27,-81454.0,25,Probably No
1,owner_2,Amber Jenkins,SMB,111721.0,79201.0,0.71,-32520.0,24,Probably Yes
2,owner_3,Mark Lopez,SMB,111721.0,47120.0,0.42,-64601.0,16,Probably No
3,owner_4,Kristen Mann,SMB,111721.0,54769.0,0.49,-56952.0,22,Probably No
4,owner_5,Dawn Alexander,SMB,111721.0,92644.0,0.83,-19077.0,34,Probably Yes
5,owner_6,Kelly Lin,Mid-Market,138321.0,109736.0,0.79,-28585.0,21,Probably Yes
6,owner_7,Dr. Kimberly Brennan,Mid-Market,138321.0,86818.0,0.63,-51503.0,19,Probably Yes
7,owner_8,Allen Gilmore,Mid-Market,138321.0,70249.0,0.51,-68072.0,18,Probably Yes
8,owner_9,Barbara Robinson,Mid-Market,138321.0,87188.0,0.63,-51133.0,25,Probably Yes
9,owner_10,Anthony Jackson,Enterprise,156941.0,31807.0,0.2,-125134.0,14,Probably No


## Load

In [13]:
# full_deals_df.to_csv("full_deals.csv", index=False)
# pacing_df.to_csv("pacing.csv", index=False)