In [None]:
# Confirmation banner for the notebook's first cell
print("Setup complete.")

# AI-Powered Data Pipelines

## Learning Objectives
- See AI transform raw data into business insights automatically
- Watch intelligent data processing, cleaning, and analysis
- Understand AI-driven data quality and validation
- Learn about scalable, self-healing data workflows

## The Demo: Intelligent Data Processing

We'll demonstrate how AI can:
1. **Analyze** raw data and understand its structure
2. **Clean** and standardize messy datasets
3. **Validate** data quality and detect anomalies
4. **Transform** data for business analysis
5. **Generate** insights and recommendations

In [None]:
# Setup and imports
# IPython shell escape: installs the asksageclient and pip_system_certs
# packages into the notebook environment before they are imported below.
!pip install asksageclient pip_system_certs
# Mount the user's Google Drive under /content/drive (Colab-only API).
# NOTE(review): none of the cells visible in this notebook read from the
# mounted path — confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

import json
import os
import re
import time
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
import tiktoken

# Import our AskSage client
from asksageclient import AskSageClient

# Read the AskSage credentials from the Colab secrets store
from google.colab import userdata
api_key, email = (
    userdata.get(secret_name)
    for secret_name in ('ASKSAGE_API_KEY', 'ASKSAGE_EMAIL')
)

# Create the API client first, then the GPT-4 tokenizer
client = AskSageClient(api_key=api_key, email=email)
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Report readiness, one status message per line
for status_line in (
    "AskSage client initialized successfully",
    "Ready to showcase AI capabilities...",
):
    print(status_line)

## Sample Dataset: Messy Sales Data

Let's start with a realistic messy dataset that needs intelligent processing:

In [None]:
# Create sample messy sales data.
# The records deliberately mix date formats, letter casing, product spellings
# and amount formats, and include empty and invalid values, so the later
# cells have something realistic to analyze and clean.
messy_data = [
    {"date": "2024-01-15", "customer": "ACME Corp", "product": "Widget A", "amount": "$1,250.00", "region": "North", "sales_rep": "John Smith"},
    {"date": "01/16/2024", "customer": "acme corp", "product": "widget-a", "amount": "1250", "region": "NORTH", "sales_rep": "J. Smith"},
    {"date": "2024-1-17", "customer": "Beta Inc.", "product": "Widget B", "amount": "$2,500.50", "region": "South", "sales_rep": "Sarah Johnson"},
    {"date": "invalid_date", "customer": "Gamma LLC", "product": "Widget C", "amount": "ERROR", "region": "East", "sales_rep": "Mike Davis"},
    {"date": "2024-01-19", "customer": "Delta Systems", "product": "Widget A", "amount": "$999.99", "region": "West", "sales_rep": "Lisa Brown"},
    {"date": "2024/01/20", "customer": "EPSILON TECH", "product": "widget_b", "amount": "3000.00", "region": "north", "sales_rep": "Tom Wilson"},
    {"date": "2024-01-21", "customer": "", "product": "Widget D", "amount": "$0.00", "region": "Central", "sales_rep": ""},
]

# Convert to DataFrame for display (pandas is imported as `pd` in the setup cell)
df_raw = pd.DataFrame(messy_data)
print("Raw Messy Sales Data:")
print(df_raw.to_string(index=False))

# The issue descriptions are plain strings; only the bullet line interpolates.
dataset_issues = [
    "Inconsistent date formats",
    "Mixed case customer names",
    "Inconsistent product naming",
    "Various amount formats",
    "Missing/empty values",
    "Invalid data entries",
]
print("\nDataset Issues:")
for issue in dataset_issues:
    print(f"- {issue}")

## Step 1: AI Data Analysis

First, let's have AI analyze the data structure and identify issues:

In [None]:
# AI analyzes the raw data: send the dataset to the model, print the raw
# answer, then extract and parse the JSON object it contains.
analysis_prompt = f"""
Analyze this sales dataset and identify data quality issues, patterns, and cleaning requirements.

Raw Data:
{json.dumps(messy_data, indent=2)}

Provide analysis in JSON format:
{{
  "data_summary": {{
    "total_records": "number",
    "columns": ["list of columns"],
    "date_range": "string"
  }},
  "quality_issues": [
    {{
      "column": "string",
      "issue": "string",
      "severity": "High|Medium|Low",
      "affected_records": "number"
    }}
  ],
  "cleaning_recommendations": [
    {{
      "action": "string",
      "column": "string",
      "method": "string"
    }}
  ],
  "business_insights": ["list of observations"]
}}
"""

print("=== AI DATA ANALYSIS ===")
# Test GPT-5-mini
print("=== TESTING GPT-5-mini ===")
start_time = time.time()

analysis_response = client.query(
    message=analysis_prompt,
    system_prompt="You are concise.",
    temperature=0.1,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)
elapsed_seconds = time.time() - start_time

# A missing or null "message" becomes an empty string instead of raising
# AttributeError on `.strip()`.
analysis_result = (analysis_response.get("message") or "").strip()
print(analysis_result)
print(f"\nResponse time: {elapsed_seconds:.2f}s")

# Parse the analysis. `analysis_data` is always defined (empty on failure).
analysis_data = {}
# Greedy match from the first "{" to the last "}" so nested objects are kept whole.
json_match = re.search(r'\{.*\}', analysis_result, re.DOTALL)
if json_match is None:
    print("\n✗ No JSON object found in the AI response")
else:
    try:
        analysis_data = json.loads(json_match.group())
    except json.JSONDecodeError as exc:
        # The matched span can contain prose or truncated JSON; report it
        # instead of aborting the cell with a traceback.
        print(f"\n✗ Could not parse the AI response as JSON: {exc}")
    else:
        print("\n✓ AI successfully analyzed the dataset")
        print(f"✓ Identified {len(analysis_data.get('quality_issues') or [])} quality issues")
        print(f"✓ Generated {len(analysis_data.get('cleaning_recommendations') or [])} cleaning recommendations")

## Step 2: AI Data Cleaning

Now let's have AI generate a cleaning strategy and clean the data:

In [None]:
# AI cleans the data: the model returns the cleaned records plus a summary of
# the transformations it applied, all as one JSON object.
# The opening instruction asks for cleaned data rather than "Python code",
# because this cell only parses a JSON object out of the answer.
cleaning_prompt = f"""
Clean and standardize the messy sales data below.

Raw Data:
{json.dumps(messy_data, indent=2)}

Requirements:
1. Standardize date formats to YYYY-MM-DD
2. Normalize customer names (title case, consistent)
3. Standardize product names
4. Convert amounts to float values
5. Standardize region names
6. Handle missing/invalid data

Provide the cleaned data in JSON format with the same structure.
Also include a summary of transformations applied.

Format:
{{
  "cleaned_data": [cleaned records],
  "transformations": [
    {{
      "field": "string",
      "action": "string",
      "records_affected": "number"
    }}
  ],
  "data_quality_score": "number 1-10"
}}
"""

print("=== AI DATA CLEANING ===")
# Test GPT-5-mini
print("=== TESTING GPT-5-mini ===")
start_time = time.time()

cleaning_response = client.query(
    message=cleaning_prompt,
    system_prompt="You are concise.",
    temperature=0.1,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)
elapsed_seconds = time.time() - start_time

# A missing or null "message" becomes an empty string instead of raising
# AttributeError on `.strip()`.
cleaning_result = (cleaning_response.get("message") or "").strip()
# Show at most the first 1000 characters of the raw answer.
if len(cleaning_result) > 1000:
    print(cleaning_result[:1000] + "...")
else:
    print(cleaning_result)
print(f"\nResponse time: {elapsed_seconds:.2f}s")

# Parse cleaned data. The defaults keep `cleaned_data` defined for the
# validation and insights cells even when the answer cannot be parsed.
cleaned_data = []
transformations = []
quality_score = 0
cleaned_response = None

# Greedy match from the first "{" to the last "}" so nested objects are kept whole.
json_match = re.search(r'\{.*\}', cleaning_result, re.DOTALL)
if json_match is None:
    print("\n✗ No JSON object found in the AI response")
else:
    try:
        cleaned_response = json.loads(json_match.group())
    except json.JSONDecodeError as exc:
        print(f"\n✗ Could not parse the AI response as JSON: {exc}")

if cleaned_response is not None:
    # `or` also covers keys that are present but null.
    cleaned_data = cleaned_response.get('cleaned_data') or []
    transformations = cleaned_response.get('transformations') or []
    quality_score = cleaned_response.get('data_quality_score', 0)

    print("\n✓ Data cleaning completed")
    print(f"✓ Quality score improved to: {quality_score}/10")
    print(f"✓ Applied {len(transformations)} transformations")

    # Display cleaned data
    df_clean = pd.DataFrame(cleaned_data)
    print("\nCleaned Data:")
    print(df_clean.to_string(index=False))
else:
    print("✗ No cleaned data available; re-run this cell before continuing")

## Step 3: AI Data Validation

Let's have AI validate the cleaned data and detect any remaining issues:

In [None]:
# AI validates the cleaned data against the business rules listed in the
# prompt and reports a summary, per-record issues and aggregate metrics.
validation_prompt = f"""
Validate this cleaned sales dataset for business rules and data quality.

Cleaned Data:
{json.dumps(cleaned_data, indent=2)}

Validation Rules:
- Dates should be valid and recent
- Customer names should not be empty
- Product names should follow standard format
- Amounts should be positive numbers
- Regions should be valid business regions
- Sales reps should be assigned

Provide validation results:
{{
  "validation_summary": {{
    "total_records": "number",
    "valid_records": "number",
    "invalid_records": "number",
    "overall_quality": "Excellent|Good|Fair|Poor"
  }},
  "validation_issues": [
    {{
      "record_index": "number",
      "field": "string",
      "issue": "string",
      "severity": "Critical|Warning|Info"
    }}
  ],
  "business_metrics": {{
    "total_sales": "number",
    "average_deal_size": "number",
    "top_region": "string",
    "date_range": "string"
  }}
}}
"""


def _to_number(value, default=0.0):
    """Coerce a metric returned by the model to ``float``.

    The JSON answer may carry a metric as a number, as a string such as
    ``"$6,500.49"``, or as null. Formatting a string or ``None`` with
    ``:,.2f`` raises, so everything is converted here first.

    Args:
        value: Raw metric value taken from the parsed JSON.
        default: Value returned when ``value`` cannot be read as a number.

    Returns:
        The metric as a float, or ``default``.
    """
    if isinstance(value, (int, float)):
        return float(value)
    try:
        return float(str(value).replace("$", "").replace(",", "").strip())
    except ValueError:
        return default


print("=== AI DATA VALIDATION ===")
# Test GPT-5-mini
print("=== TESTING GPT-5-mini ===")
start_time = time.time()

validation_response = client.query(
    message=validation_prompt,
    system_prompt="You are concise.",
    temperature=0.1,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)
elapsed_seconds = time.time() - start_time

# A missing or null "message" becomes an empty string instead of raising
# AttributeError on `.strip()`.
validation_result = (validation_response.get("message") or "").strip()
print(validation_result)
print(f"\nResponse time: {elapsed_seconds:.2f}s")

# Parse validation results. `validation_data` is always defined (empty on failure).
validation_data = {}
# Greedy match from the first "{" to the last "}" so nested objects are kept whole.
json_match = re.search(r'\{.*\}', validation_result, re.DOTALL)
if json_match is None:
    print("\n✗ No JSON object found in the AI response")
else:
    try:
        validation_data = json.loads(json_match.group())
    except json.JSONDecodeError as exc:
        print(f"\n✗ Could not parse the AI response as JSON: {exc}")

if validation_data:
    # `or` also covers keys that are present but null.
    summary = validation_data.get('validation_summary') or {}
    issues = validation_data.get('validation_issues') or []
    metrics = validation_data.get('business_metrics') or {}

    print("\n✓ Validation completed")
    print(f"✓ Data quality: {summary.get('overall_quality', 'Unknown')}")
    print(f"✓ Valid records: {summary.get('valid_records', 0)}/{summary.get('total_records', 0)}")
    print(f"✓ Issues found: {len(issues)}")

    if metrics:
        print("\nBusiness Metrics:")
        print(f"- Total Sales: ${_to_number(metrics.get('total_sales', 0)):,.2f}")
        print(f"- Average Deal: ${_to_number(metrics.get('average_deal_size', 0)):,.2f}")
        print(f"- Top Region: {metrics.get('top_region', 'Unknown')}")

## Step 4: AI Business Intelligence

Finally, let's have AI generate business insights and recommendations:

In [None]:
# AI generates business insights from the cleaned records and lists the
# recommendations the model marked as high priority.
insights_prompt = f"""
Analyze this cleaned sales data and generate business insights and recommendations.

Sales Data:
{json.dumps(cleaned_data, indent=2)}

Generate comprehensive business intelligence:
{{
  "executive_summary": "string",
  "key_insights": [
    {{
      "insight": "string",
      "impact": "High|Medium|Low",
      "data_supporting": "string"
    }}
  ],
  "performance_analysis": {{
    "top_performers": ["list"],
    "growth_opportunities": ["list"],
    "risk_areas": ["list"]
  }},
  "recommendations": [
    {{
      "action": "string",
      "priority": "High|Medium|Low",
      "expected_impact": "string",
      "timeline": "string"
    }}
  ],
  "next_steps": ["list of immediate actions"]
}}
"""

print("=== AI BUSINESS INTELLIGENCE ===")
# Test GPT-5-mini
print("=== TESTING GPT-5-mini ===")
start_time = time.time()

insights_response = client.query(
    message=insights_prompt,
    system_prompt="You are concise.",
    temperature=0.2,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)
elapsed_seconds = time.time() - start_time

# A missing or null "message" becomes an empty string instead of raising
# AttributeError on `.strip()`.
insights_result = (insights_response.get("message") or "").strip()
print(insights_result)
print(f"\nResponse time: {elapsed_seconds:.2f}s")

# Parse insights. `insights_data` is always defined (empty on failure).
insights_data = {}
# Greedy match from the first "{" to the last "}" so nested objects are kept whole.
json_match = re.search(r'\{.*\}', insights_result, re.DOTALL)
if json_match is None:
    print("\n✗ No JSON object found in the AI response")
else:
    try:
        insights_data = json.loads(json_match.group())
    except json.JSONDecodeError as exc:
        print(f"\n✗ Could not parse the AI response as JSON: {exc}")

if insights_data:
    # `or []` also covers keys that are present but null.
    key_insights = insights_data.get('key_insights') or []
    recommendations = insights_data.get('recommendations') or []
    next_steps = insights_data.get('next_steps') or []

    print("\n✓ Business intelligence generated")
    print(f"✓ Key insights: {len(key_insights)}")
    print(f"✓ Recommendations: {len(recommendations)}")
    print(f"✓ Next steps: {len(next_steps)}")

    # Show high-priority recommendations. Entries that are not objects are
    # skipped, and the priority label is compared case-insensitively.
    high_priority = [
        rec
        for rec in recommendations
        if isinstance(rec, dict)
        and str(rec.get('priority', '')).strip().lower() == 'high'
    ]
    if high_priority:
        print("\nHigh-Priority Actions:")
        for rec in high_priority:
            print(f"- {rec.get('action', 'Unknown action')}")
            print(f"  Impact: {rec.get('expected_impact', 'Not specified')}")

## Pipeline Summary: The AI Transformation

### What We Accomplished:

**1. Intelligent Data Analysis**
- AI automatically identified data quality issues
- Recognized patterns and inconsistencies
- Generated cleaning recommendations

**2. Automated Data Cleaning**
- Standardized date formats across multiple variations
- Normalized customer and product names
- Converted currency strings to numeric values
- Handled missing and invalid data intelligently

**3. Comprehensive Validation**
- Applied business rules automatically
- Calculated data quality scores
- Generated business metrics

**4. Business Intelligence Generation**
- Extracted actionable insights from clean data
- Identified performance patterns and opportunities
- Generated prioritized recommendations

### Business Value:
- **Time Savings**: Hours of manual work → Minutes of AI processing
- **Consistency**: Human variance → Standardized quality
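- **Scalability**: The same prompts can be reused on larger datasets, provided the records are sent in batches that fit the model's context window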
- **Intelligence**: Not just cleaning, but understanding and insights

### Traditional vs AI Approach:
- **Traditional**: Manual rules, static processes, limited insights
- **AI-Powered**: Adaptive intelligence, contextual understanding, business insights

This demonstrates how AI transforms data pipelines from simple ETL to intelligent, self-adapting business intelligence systems.