In [None]:
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Any

class SchemaExtractor:
    """Build a lightweight schema description of a CSV file.

    The schema is a plain dict stored on ``self.schema`` with these keys,
    each filled in by one method:

    * ``"columns"``         -- per-column dtype string and non-null count
    * ``"date_columns"``    -- columns that look like dates/times
    * ``"numeric_columns"`` -- columns with a numeric dtype
    * ``"sample_rows"``     -- the first rows as a list of record dicts

    Call :meth:`generate_schema` to load the file and run every step.
    The individual steps read ``self.df`` and therefore require
    :meth:`load_csv` to have been called first.
    """

    # Column names containing any of these words are treated as date-like
    # regardless of their content. Compiled once instead of once per column.
    _DATE_NAME_PATTERN = re.compile(r'date|time|year|month', re.IGNORECASE)

    def __init__(self, file_path: str):
        """Remember the CSV path; nothing is read until ``load_csv``."""
        self.file_path = file_path
        self.df = None  # pandas DataFrame once load_csv() has run
        self.schema = {}

    def load_csv(self):
        """Read the CSV at ``self.file_path`` into ``self.df``.

        Raises:
            ValueError: if pandas cannot read the file. The original
                exception is attached as ``__cause__``.
        """
        try:
            self.df = pd.read_csv(self.file_path)
        except Exception as e:
            # Chain the cause so the real reason (missing file, parse
            # error, encoding, ...) stays visible in the traceback.
            raise ValueError(f"Error loading CSV: {str(e)}") from e

    def infer_column_types(self):
        """Record each column's dtype and non-null count under ``"columns"``."""
        column_info = {}
        for col in self.df.columns:
            series = self.df[col]
            column_info[col] = {
                "dtype": str(series.dtype),
                # Cast to a built-in int so the schema holds no numpy scalars.
                "non_nulls": int(series.notnull().sum()),
            }
        self.schema["columns"] = column_info

    @staticmethod
    def _parses_as_dates(series, threshold: float) -> bool:
        """Return True if more than ``threshold`` of ``series`` parses as dates."""
        # pd.to_datetime reads plain numbers as offsets from the Unix epoch,
        # so every int/float column would "parse" successfully. Numeric (and
        # boolean) columns are therefore never detected by content.
        if len(series) == 0 or pd.api.types.is_numeric_dtype(series):
            return False
        try:
            parsed = pd.to_datetime(series, errors='coerce')
        except (TypeError, ValueError, OverflowError):
            # Detection is best-effort: an unparseable column is simply
            # not a date column.
            return False
        return bool(parsed.notna().sum() / len(parsed) > threshold)

    def detect_date_columns(self, threshold: float = 0.8):
        """Store the names of date-like columns under ``"date_columns"``.

        A column qualifies when either

        * it is non-numeric and more than ``threshold`` of its rows
          (nulls included in the denominator) parse as datetimes, or
        * its name contains ``date``, ``time``, ``year`` or ``month``
          (case-insensitive).

        Args:
            threshold: minimum fraction of rows that must parse as a
                datetime for content-based detection.
        """
        date_cols = []
        for col in self.df.columns:
            looks_like_date = (
                self._parses_as_dates(self.df[col], threshold)
                or self._DATE_NAME_PATTERN.search(str(col)) is not None
            )
            if looks_like_date and col not in date_cols:
                date_cols.append(col)
        self.schema["date_columns"] = date_cols

    def detect_numeric_columns(self):
        """Store the names of numeric-dtype columns under ``"numeric_columns"``."""
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
        self.schema["numeric_columns"] = numeric_cols

    def extract_sample_rows(self, n=5):
        """Store the first ``n`` rows as record dicts under ``"sample_rows"``."""
        self.schema["sample_rows"] = self.df.head(n).to_dict(orient="records")

    def generate_schema(self) -> Dict[str, Any]:
        """Load the CSV, run every extraction step and return the schema.

        Raises:
            ValueError: if the CSV cannot be loaded.
        """
        self.load_csv()
        self.infer_column_types()
        self.detect_date_columns()
        self.detect_numeric_columns()
        self.extract_sample_rows()
        return self.schema

class SchemaFormatter:
    """Turn a schema dictionary into a readable, sectioned text summary."""

    @staticmethod
    def format_schema(schema: Dict[str, Any]) -> str:
        """Render ``schema`` as newline-joined text.

        The output always has a column listing and a sample-rows section.
        The numeric and date/time sections appear only when their lists
        are non-empty. Missing keys are treated as empty.

        Args:
            schema: dict with optional keys ``"columns"`` (name ->
                ``{"dtype", "non_nulls"}``), ``"numeric_columns"``,
                ``"date_columns"`` and ``"sample_rows"``.

        Returns:
            The formatted summary as a single string.
        """
        out = ["📊 The dataset contains the following columns:"]
        out.extend(
            f"- {name} (dtype: {info['dtype']}, non-null: {info['non_nulls']})"
            for name, info in schema.get("columns", {}).items()
        )

        # Optional sections, in output order: heading plus the names under it.
        optional_sections = (
            ("\n🔢 Numeric columns:", schema.get("numeric_columns", [])),
            ("\n📅 Detected date/time columns:", schema.get("date_columns", [])),
        )
        for heading, names in optional_sections:
            if not names:
                continue
            out.append(heading)
            out.extend(f"- {name}" for name in names)

        out.append("\n📝 Sample data rows:")
        out.extend(f"- {record}" for record in schema.get("sample_rows", []))

        return "\n".join(out)

class PromptBuilder:
    """Assemble the text prompt sent to the language model."""

    @staticmethod
    def build_prompt(schema_text: str, user_query: str) -> str:
        """Combine fixed instructions, the schema text and the user's question.

        Args:
            schema_text: formatted schema description of the dataset.
            user_query: the question to answer about the dataset.

        Returns:
            One string: the instructions, a ``Dataset Schema:`` section, a
            ``User Question:`` section and a trailing ``Your Analysis:`` cue.
        """
        # Adjacent string literals are concatenated with nothing in between,
        # so every fragment except the last ends with an explicit space.
        system_instruction = (
            "You are an advanced data analyst AI agent. You are analyzing CSV datasets that may have different schemas. "
            "You have been provided with the complete schema and some sample data rows from the file. "
            "Use ONLY the columns and fields present in the provided schema when answering. "
            "If the user asks about a field that does not exist, clearly state that the field is not present. "
            "Provide detailed, specific, and analytical responses based on the available data."
        )

        full_prompt = (
            f"{system_instruction}\n\n"
            f"Dataset Schema:\n{schema_text}\n\n"
            f"User Question: {user_query}\n\n"
            "Your Analysis:"
        )
        return full_prompt


In [None]:
# Location of the CSV to analyse.
# NOTE(review): this is an absolute path on one user's machine; the cell
# will fail elsewhere unless the path is changed.
csvFilePath = "/Users/keshavsaraogi/Desktop/indorama/eureka-data/clean-csv/cleaned_sales_packaging.csv"
# Creating the extractor only stores the path; the file is read by
# generate_schema() below.
extractor = SchemaExtractor(csvFilePath)
# Loads the CSV and returns the schema dict (columns, date columns,
# numeric columns, sample rows).
schema = extractor.generate_schema()

In [None]:
# format_schema is a staticmethod, so the instance is not strictly needed;
# it is kept here as written.
formatter = SchemaFormatter()
# Human-readable rendering of the schema dict; reused later as part of the
# LLM prompt.
schema_text = formatter.format_schema(schema)

In [None]:
# Show the formatted schema so it can be checked by eye before it is sent
# to the model.
print(schema_text)

In [None]:
# --- 1. Load Environment ---
import os
from dotenv import load_dotenv

# Presumably reads a local .env file into the process environment (see the
# error message below) -- confirm where the .env file is expected to live.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast: a missing or empty key stops the notebook here instead of at
# the API call.
if not OPENAI_API_KEY:
    raise ValueError("❌ OPENAI_API_KEY is not loaded from .env")
print("✅ API key successfully loaded")

In [None]:
# Question to ask about the dataset, combined with the schema text into a
# single prompt string.
user_question = "Which customer and company has been the most profitable in the last 6 months? What is the trend and explain the calculations?"
prompt = PromptBuilder.build_prompt(schema_text, user_question)

# 4️⃣ Send to LLM
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)
# The whole prompt (instructions, schema and user question) is sent as one
# "system" message; no separate "user" message is included.
# NOTE(review): the model only sees the schema and sample rows contained in
# the prompt, not the full CSV -- confirm that is sufficient for questions
# that need aggregate calculations.
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": prompt}
  ],
  temperature=0.4
)

# Print the text of the first returned choice with surrounding whitespace
# removed.
# NOTE(review): .strip() assumes message.content is a string -- confirm it
# can never be None for this request.
print(response.choices[0].message.content.strip())