In [None]:
!pip install pandas matplotlib openai streamlit python-dotenv


In [None]:
import pandas as pd
import json
import os
import openai
from dotenv import load_dotenv

# --------------------------------
# CONFIG
# --------------------------------
# Let python-dotenv populate the process environment first, then hand the key
# (None when the variable is unset) to the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# --------------------------------
# STEP 1: SCHEMA INSPECTION
# --------------------------------
def inspect_schema(df):
    """Summarise every column of ``df`` as a list of plain-Python dicts.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        One dict per column, in column order, with the keys:
        ``column`` (the column label), ``dtype`` (dtype as a string),
        ``missing_pct`` (percentage of missing values, rounded to 2 decimals,
        as a built-in ``float``) and ``unique_values`` (count of distinct
        non-null values, as a built-in ``int``).
    """
    schema = []
    for col in df.columns:
        series = df[col]
        schema.append({
            "column": col,
            "dtype": str(series.dtype),
            # float() mirrors the int() cast below: the schema is later formatted
            # into prompt text, and a NumPy scalar would show up there with its
            # NumPy repr (e.g. "np.float64(33.33)" on NumPy 2) instead of "33.33".
            "missing_pct": float(round(series.isna().mean() * 100, 2)),
            "unique_values": int(series.nunique()),
        })
    return schema

In [None]:
# --------------------------------
# STEP 2: ANALYSIS PLANNER (SELF-DECIDING)
# --------------------------------
def plan_analysis(schema):
    """Ask the chat model to classify the dataset and plan the analysis.

    Args:
        schema: Column summaries (e.g. the output of ``inspect_schema``); the
            value is interpolated verbatim into the prompt.

    Returns:
        The model's reply parsed with ``json.loads``. The prompt requests the
        keys ``dataset_type``, ``target_variable``, ``recommended_analyses`` and
        ``skip_analyses``, but the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after a
            surrounding Markdown code fence has been removed.
    """
    prompt = f"""
You are a senior data analyst.

Based on the dataset schema below:
1. Identify dataset type
2. Identify target variable (if any)
3. Decide analyses to perform
4. Decide analyses to skip

Schema:
{schema}

Respond ONLY in JSON with:
dataset_type, target_variable, recommended_analyses, skip_analyses
"""

    # openai>=1.0 removed ``openai.ChatCompletion``; use the current entry point
    # when it exists and fall back to the legacy one on older installs.
    if hasattr(openai, "chat"):
        create_chat = openai.chat.completions.create
    else:
        create_chat = openai.ChatCompletion.create

    response = create_chat(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    # Attribute access works on both SDK generations; subscripting the message
    # (``message["content"]``) only works on the legacy one.
    reply = (response.choices[0].message.content or "").strip()

    # Models often wrap JSON in a Markdown fence (```json ... ```) despite the
    # instruction; strip it so json.loads only sees the payload.
    if reply.startswith("```"):
        reply = reply.split("\n", 1)[-1]   # drop the opening fence line
        reply = reply.rsplit("```", 1)[0]  # drop the closing fence

    return json.loads(reply)

In [None]:
# --------------------------------
# STEP 3: HYPOTHESIS GENERATION
# --------------------------------
def generate_hypotheses(schema, target):
    """Ask the chat model for testable hypotheses about the dataset.

    Args:
        schema: Column summaries (e.g. the output of ``inspect_schema``); the
            value is interpolated verbatim into the prompt.
        target: Name of the target variable, interpolated verbatim into the
            prompt.

    Returns:
        The model's reply parsed with ``json.loads``. The prompt requests a JSON
        array of objects with the keys ``hypothesis``, ``test_method`` and
        ``required_columns``, but the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after a
            surrounding Markdown code fence has been removed.
    """
    prompt = f"""
Generate 3 to 5 testable hypotheses for this dataset.

Schema:
{schema}

Target variable:
{target}

Return JSON array with:
hypothesis, test_method, required_columns
"""

    # openai>=1.0 removed ``openai.ChatCompletion``; use the current entry point
    # when it exists and fall back to the legacy one on older installs.
    if hasattr(openai, "chat"):
        create_chat = openai.chat.completions.create
    else:
        create_chat = openai.ChatCompletion.create

    response = create_chat(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    # Attribute access works on both SDK generations; subscripting the message
    # (``message["content"]``) only works on the legacy one.
    reply = (response.choices[0].message.content or "").strip()

    # Models often wrap JSON in a Markdown fence (```json ... ```) despite the
    # instruction; strip it so json.loads only sees the payload.
    if reply.startswith("```"):
        reply = reply.split("\n", 1)[-1]   # drop the opening fence line
        reply = reply.rsplit("```", 1)[0]  # drop the closing fence

    return json.loads(reply)


In [None]:
# --------------------------------
# STEP 4: DETERMINISTIC TESTS
# --------------------------------
def test_age_churn(df, target):
    df = df.dropna(subset=["Age", target])
    df["age_bucket"] = pd.cut(df["Age"], bins=[0,30,45,60,100])
    return df.groupby("age_bucket")[target].mean()

def test_correlation(df, col, target):
    df = df[[col, target]].dropna()
    return df.corr().iloc[0,1]

In [None]:
# --------------------------------
# STEP 5: HYPOTHESIS EVALUATION
# --------------------------------
def evaluate_hypotheses(df, hypotheses, target, spread_threshold=0.1, corr_threshold=0.3):
    """Run a deterministic test for each hypothesis and record the outcome.

    Exactly one result is produced per hypothesis, in input order.

    Dispatch, based on the lower-cased ``test_method``:
      * starts with ``"compare"`` and a required column named "age" (any case):
        ``test_age_churn``; accepted when the gap between the highest and lowest
        bucket mean exceeds ``spread_threshold``.
      * starts with ``"correlation"``: ``test_correlation`` on the first required
        column that is not the target; accepted when the absolute coefficient
        exceeds ``corr_threshold``.
      * anything else: recorded as ``"Skipped"``.

    Args:
        df: DataFrame the tests run against.
        hypotheses: Iterable of dicts with the keys ``hypothesis``,
            ``test_method`` and ``required_columns``.
        target: Name of the target column.
        spread_threshold: Minimum bucket-mean gap for acceptance (default 0.1).
        corr_threshold: Minimum absolute correlation for acceptance (default 0.3).

    Returns:
        A list of dicts with the keys ``hypothesis``, ``result`` (one of
        ``"Accepted"``, ``"Rejected"``, ``"Inconclusive"``, ``"Skipped"``,
        ``"Error"``) and ``evidence`` (human-readable text).
    """
    results = []

    for h in hypotheses:
        # Resolve the label up front and without indexing, so a malformed
        # hypothesis cannot make the error handler below raise as well.
        label = h.get("hypothesis", "<unnamed hypothesis>") if isinstance(h, dict) else str(h)

        try:
            method = h["test_method"].lower()
            required = h["required_columns"]
            if isinstance(required, str):
                # Tolerate a single column given as a bare string.
                required = [required]
            # Column names keep their original casing in the schema (e.g. "Age"),
            # so the age check must be case-insensitive to ever match.
            lowered = [str(c).lower() for c in required]

            if method.startswith("compare") and "age" in lowered:
                output = test_age_churn(df, target)
                spread = output.max() - output.min()
                if pd.isna(spread):
                    # No usable bucket means: NaN must not be read as a rejection.
                    decision = "Inconclusive"
                else:
                    decision = "Accepted" if spread > spread_threshold else "Rejected"
                evidence = output.to_string()

            elif method.startswith("correlation"):
                # Correlating the target with itself is trivially 1.0, so use the
                # first listed column that is not the target.
                candidates = [c for c in required if c != target]
                if not candidates:
                    raise ValueError("required_columns lists no column other than the target")
                corr = test_correlation(df, candidates[0], target)
                if pd.isna(corr):
                    decision = "Inconclusive"
                else:
                    decision = "Accepted" if abs(corr) > corr_threshold else "Rejected"
                evidence = f"Correlation = {round(corr, 3)}"

            else:
                # Report unsupported methods instead of silently dropping them.
                decision = "Skipped"
                evidence = f"No deterministic test implemented for test_method {h['test_method']!r}"

        except Exception as e:
            decision = "Error"
            evidence = str(e)

        results.append({
            "hypothesis": label,
            "result": decision,
            "evidence": evidence
        })

    return results

In [None]:
# --------------------------------
# STEP 6: INTERPRET RESULTS
# --------------------------------
def interpret_results(results):
    """Ask the chat model to explain the hypothesis test results in business terms.

    Args:
        results: The test outcomes (e.g. the list returned by
            ``evaluate_hypotheses``); the value is interpolated verbatim into
            the prompt.

    Returns:
        The text content of the model's reply.
    """
    prompt = f"""
Interpret the following hypothesis test results.
Explain insights in business terms.

Results:
{results}
"""

    # openai>=1.0 removed ``openai.ChatCompletion``; use the current entry point
    # when it exists and fall back to the legacy one on older installs.
    if hasattr(openai, "chat"):
        create_chat = openai.chat.completions.create
    else:
        create_chat = openai.ChatCompletion.create

    response = create_chat(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    # Attribute access works on both SDK generations; subscripting the message
    # (``message["content"]``) only works on the legacy one.
    return response.choices[0].message.content


In [None]:
# --------------------------------
# MAIN AGENT RUNNER
# --------------------------------
def run_agent(csv_path):
    """Run the whole pipeline on one CSV file and print each stage's output.

    Stages: load the CSV, inspect its schema, let the model plan the analysis,
    generate hypotheses, evaluate them deterministically, and ask the model for
    a business interpretation. When the planner names no target variable, or one
    that is not a column of the data, hypothesis evaluation and interpretation
    are skipped with a message, since every test needs the target column.

    Args:
        csv_path: Path of the CSV file to analyse.

    Returns:
        None. All output is printed.
    """
    df = pd.read_csv(csv_path)

    schema = inspect_schema(df)
    plan = plan_analysis(schema)
    # The plan comes from a model reply that is not validated, so read keys
    # defensively; a missing key prints as None instead of aborting the run.
    target = plan.get("target_variable")

    print("\n===== AGENT DECISION =====")
    print("Dataset Type:", plan.get("dataset_type"))
    print("Target Variable:", target)
    print("Analyses to Run:", plan.get("recommended_analyses"))
    print("Analyses Skipped:", plan.get("skip_analyses"))

    hypotheses = generate_hypotheses(schema, target)
    print("\n===== GENERATED HYPOTHESES =====")
    for h in hypotheses:
        print("-", h["hypothesis"])

    # The planner is asked for a target "if any", so it may return null or a
    # name that does not exist. list() keeps the membership test safe even if
    # the model returned an unhashable value such as a list.
    if target is None or target not in list(df.columns):
        print("\n===== HYPOTHESIS RESULTS =====")
        print(f"Skipped: target variable {target!r} is not a column of the dataset.")
        return

    results = evaluate_hypotheses(df, hypotheses, target)
    print("\n===== HYPOTHESIS RESULTS =====")
    for r in results:
        print(f"\n{r['hypothesis']}")
        print("Decision:", r["result"])
        print("Evidence:\n", r["evidence"])

    interpretation = interpret_results(results)
    print("\n===== BUSINESS INTERPRETATION =====")
    print(interpretation)


In [None]:
# --------------------------------
# ENTRY POINT
# --------------------------------
# Kick off the full pipeline on the sample file when this cell/script is executed
# as the main module.
if __name__ == "__main__":
    run_agent(csv_path="sample.csv")