In [2]:
import pandas as pd
import numpy as np
import os
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from langchain.agents import initialize_agent, Tool
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

In [3]:
def clean_column_name(column):
    """Helper function to clean column names"""
    return str(column).strip().lower().replace("'", "").replace('"', "")

In [4]:
class UnivariateAnalyzer:
    def __init__(self, data):
        """Initialize with pandas DataFrame"""
        self.data = data
        self.numeric_cols = data.select_dtypes(include=[np.number]).columns
        self.categorical_cols = data.select_dtypes(exclude=[np.number]).columns
        
    def compute_summary_statistics(self, column):
        """Compute comprehensive summary statistics"""
        try:
            column = clean_column_name(column)
            data = self.data[column].dropna()
            
            stats_dict = {
                'mean': float(np.mean(data)),
                'median': float(np.median(data)),
                'std': float(np.std(data)),
                'var': float(np.var(data)),
                'skew': float(stats.skew(data)),
                'kurtosis': float(stats.kurtosis(data)),
                'iqr': float(stats.iqr(data)),
                'range': float(np.ptp(data)),
                'min': float(data.min()),
                'max': float(data.max()),
                'q1': float(np.percentile(data, 25)),
                'q3': float(np.percentile(data, 75)),
                'count': int(len(data)),
                'missing': int(self.data[column].isna().sum())
            }
            
            formatted_stats = {
                k: f"{v:.2f}" if isinstance(v, float) else v 
                for k, v in stats_dict.items()
            }
            
            # Add distribution type
            _, p_value = stats.normaltest(data)
            formatted_stats['distribution'] = 'Normal' if p_value > 0.05 else 'Non-normal'
            
            return formatted_stats
            
        except Exception as e:
            return f"Error computing statistics for column {column}: {str(e)}"
    
    def analyze_by_group(self, numeric_col, categorical_col):
        """Analyze numeric variables split by categorical variables"""
        try:
            numeric_col = clean_column_name(numeric_col)
            categorical_col = clean_column_name(categorical_col)
            
            if numeric_col not in self.data.columns:
                return f"Error: Column '{numeric_col}' not found in dataset"
            if categorical_col not in self.data.columns:
                return f"Error: Column '{categorical_col}' not found in dataset"
            
            grouped_stats = {}
            group_summaries = {}
            
            for category in self.data[categorical_col].unique():
                subset = self.data[self.data[categorical_col] == category][numeric_col]
                stats_dict = {
                    'count': len(subset),
                    'mean': float(np.mean(subset)),
                    'median': float(np.median(subset)),
                    'std': float(np.std(subset)),
                    'min': float(subset.min()),
                    'max': float(subset.max()),
                    'q1': float(np.percentile(subset, 25)),
                    'q3': float(np.percentile(subset, 75)),
                    'iqr': float(np.percentile(subset, 75) - np.percentile(subset, 25))
                }
                
                grouped_stats[str(category)] = {
                    k: f"{v:.2f}" if isinstance(v, float) else v 
                    for k, v in stats_dict.items()
                }
                
                group_summaries[str(category)] = {
                    'size': len(subset),
                    'percentage': f"{(len(subset) / len(self.data)) * 100:.1f}%"
                }
            
            groups = [self.data[self.data[categorical_col] == cat][numeric_col].dropna() 
                     for cat in self.data[categorical_col].unique()]
            
            if len(groups) == 2:
                stat, p_value = stats.ttest_ind(*groups)
                test_name = "t-test"
            else:
                stat, p_value = stats.f_oneway(*groups)
                test_name = "ANOVA"
            
            ss_between = sum(len(g) * ((np.mean(g) - np.mean(self.data[numeric_col])) ** 2) for g in groups)
            ss_total = sum((self.data[numeric_col] - np.mean(self.data[numeric_col])) ** 2)
            eta_squared = ss_between / ss_total if ss_total != 0 else 0
            
            return {
                'group_statistics': grouped_stats,
                'group_summaries': group_summaries,
                'between_group_comparison': {
                    'test_type': test_name,
                    'statistic': f"{stat:.3f}",
                    'p_value': f"{p_value:.3f}",
                    'significant_difference': 'Yes' if p_value < 0.05 else 'No',
                    'effect_size': f"{eta_squared:.3f}"
                }
            }
            
        except Exception as e:
            return f"Error in group analysis: {str(e)}"


In [5]:
class LLMAnalyzer:
    def __init__(self, api_key):
        """Initialize with OpenAI API key"""
        self.llm = OpenAI(openai_api_key=api_key, temperature=0)
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.vector_store = None
        
    def setup_rag(self, pdf_path):
        """Setup RAG system with PDF document"""
        try:
            if os.path.exists(pdf_path):
                loader = PyPDFLoader(pdf_path)
                documents = loader.load()
                text_splitter = CharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=200
                )
                texts = text_splitter.split_documents(documents)
                self.vector_store = FAISS.from_documents(texts, self.embeddings)
                return "RAG system initialized successfully"
            else:
                return f"Error: PDF file '{pdf_path}' not found"
        except Exception as e:
            return f"Error setting up RAG system: {str(e)}"
        
    def query_rag(self, query):
        """Query the RAG system with detailed context"""
        try:
            if self.vector_store is None:
                return "RAG system not initialized. Please load a PDF first."
                
            relevant_docs = self.vector_store.similarity_search(query, k=3)
            context = "\n".join([doc.page_content for doc in relevant_docs])
            
            prompt = f"""
            Context: {context}
            
            Query: {query}
            
            Based on the context above and the insurance dataset, provide a detailed analysis:
            """
            
            response = self.llm(prompt)
            return response
            
        except Exception as e:
            return f"Error querying RAG system: {str(e)}"
        
    def create_tools(self, univariate_analyzer):
        """Create tools for the agent with improved descriptions"""
        tools = [
            Tool(
                name="Univariate Analysis",
                func=lambda x: univariate_analyzer.compute_summary_statistics(x),
                description="Get statistical analysis of a single column. Input: column name (e.g., bmi, age, charges)"
            ),
            Tool(
                name="Group Analysis",
                func=lambda x: univariate_analyzer.analyze_by_group(*[col.strip() for col in x.split(',')]),
                description="Compare statistics across groups. Input format: 'numeric_column,categorical_column' (e.g., charges,smoker)"
            ),
            Tool(
                name="RAG Query",
                func=lambda x: self.query_rag(x),
                description="Query the PDF documentation for domain knowledge and insights. Input: natural language question"
            )
        ]
        return tools
        
    def setup_agent(self, tools):
        """Initialize LangChain agent with verbose output"""
        return initialize_agent(
            tools,
            self.llm,
            agent="zero-shot-react-description",
            verbose=True,
            max_iterations=5
        )

In [6]:
# Additional helper function for running agent queries
def run_agent_queries(univariate_analyzer, llm_analyzer, agent, data):
    """Run all queries including statistical analysis and RAG queries"""
    # Statistical Analysis Queries
    queries = [
        {
            "query": "Analyze the statistics of the bmi column",
            "expected_tool": "Univariate Analysis",
            "backup_func": lambda: univariate_analyzer.compute_summary_statistics("bmi")
        },
        {
            "query": "Compare bmi statistics between different sex groups",
            "expected_tool": "Group Analysis",
            "backup_func": lambda: univariate_analyzer.analyze_by_group("bmi", "sex")
        },
        {
            "query": "What's the distribution of charges for smokers vs non-smokers?",
            "expected_tool": "Group Analysis",
            "backup_func": lambda: univariate_analyzer.analyze_by_group("charges", "smoker")
        },
        {
            "query": "Show me the age distribution statistics",
            "expected_tool": "Univariate Analysis",
            "backup_func": lambda: univariate_analyzer.compute_summary_statistics("age")
        },
    ]
    
    print("\nRunning Statistical Analysis Queries:")
    print("=" * 80)
    
    for i, query_info in enumerate(queries, 1):
        print(f"\nQuery {i}: {query_info['query']}")
        print("-" * 80)
        
        try:
            result = agent.run(query_info['query'])
            print("\nAnalysis Result (via Agent):")
            print_formatted_results(result)
            
        except Exception as e:
            print(f"\nFalling back to direct analysis...")
            try:
                result = query_info['backup_func']()
                print("\nAnalysis Result (via Backup):")
                print_formatted_results(result)
            except Exception as backup_error:
                print(f"Analysis failed: {str(backup_error)}")
            
        print("\n" + "=" * 80)

    # RAG Queries
    rag_queries = [
        "What are the key factors affecting insurance charges?",
        "How does lifestyle impact insurance premiums?",
        "What are the best practices for risk assessment in insurance?"
    ]
    
    print("\nRunning RAG Queries:")
    print("=" * 80)
    
    for i, query in enumerate(rag_queries, 1):
        print(f"\nRAG Query {i}: {query}")
        print("-" * 80)
        result = llm_analyzer.query_rag(query)
        print("\nRAG Analysis Result:")
        print(result)
        print("=" * 80)

In [7]:
def generate_integrated_report(results, filename="integrated_analysis_report.txt"):
    """Generate comprehensive report with statistical and LLM analysis"""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            # Header
            f.write(f"Integrated Analysis Report\n")
            f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("=" * 80 + "\n\n")
            
            # Statistical Analysis Section
            f.write("1. STATISTICAL ANALYSIS\n")
            f.write("-" * 80 + "\n\n")
            
            # Write each analysis result
            for analysis_name, result in results['statistical_results'].items():
                f.write(f"\n{analysis_name}:\n")
                f.write("-" * 40 + "\n")
                
                if isinstance(result, dict):
                    for key, value in result.items():
                        if isinstance(value, dict):
                            f.write(f"\n{key}:\n")
                            for subkey, subvalue in value.items():
                                f.write(f"  {subkey}: {subvalue}\n")
                        else:
                            f.write(f"{key}: {value}\n")
                else:
                    f.write(str(result) + "\n")
            
            # RAG Analysis Section
            f.write("\n2. RAG ANALYSIS\n")
            f.write("-" * 80 + "\n\n")
            
            for query, response in results['rag_results'].items():
                f.write(f"Query: {query}\n")
                f.write(f"Response: {response}\n\n")
            
            # Conclusions Section
            f.write("\n3. CONCLUSIONS\n")
            f.write("-" * 80 + "\n\n")
            
            # Generate and write conclusions
            conclusions = generate_conclusions(results)
            f.write(conclusions)
            
            f.write("\n\n" + "=" * 80 + "\n")
        
        return "Integrated report generated successfully"
        
    except Exception as e:
        return f"Error generating report: {str(e)}"

In [8]:
def generate_conclusions(results):
    """Generate conclusions based on the analysis results"""
    conclusions = []
    
    # Analyze BMI distribution
    if 'BMI Analysis' in results['statistical_results']:
        bmi_stats = results['statistical_results']['BMI Analysis']
        conclusions.append(f"BMI Distribution: The average BMI is {bmi_stats.get('mean', 'N/A')} "
                         f"with a standard deviation of {bmi_stats.get('std', 'N/A')}.")
    
    # Analyze group differences
    if 'Sex Group Comparison' in results['statistical_results']:
        group_comparison = results['statistical_results']['Sex Group Comparison']
        if 'between_group_comparison' in group_comparison:
            comparison = group_comparison['between_group_comparison']
            conclusions.append(f"Group Differences: {comparison.get('test_type', 'Statistical')} analysis shows "
                             f"{'significant' if comparison.get('significant_difference') == 'Yes' else 'no significant'} "
                             f"differences between groups (p-value: {comparison.get('p_value', 'N/A')}).")
    
    # Add overall conclusion
    conclusions.append("\nOverall Assessment:")
    conclusions.append("The analysis reveals patterns in insurance data that can be valuable for risk assessment "
                      "and policy pricing. Further investigation of specific factors may be warranted based on "
                      "the observed relationships.")
    
    return "\n".join(conclusions)

In [9]:
def run_analysis_with_report(univariate_analyzer, llm_analyzer, agent):
    """Run analysis and generate report"""
    results = {
        'statistical_results': {},
        'rag_results': {}
    }
    
    # Run statistical analyses
    results['statistical_results']['BMI Analysis'] = univariate_analyzer.compute_summary_statistics('bmi')
    results['statistical_results']['Sex Group Comparison'] = univariate_analyzer.analyze_by_group('bmi', 'sex')
    results['statistical_results']['Smoker Charges Analysis'] = univariate_analyzer.analyze_by_group('charges', 'smoker')
    results['statistical_results']['Age Distribution'] = univariate_analyzer.compute_summary_statistics('age')
    
    # Run RAG queries
    rag_queries = [
        "What are the key factors affecting insurance charges?",
        "How does lifestyle impact insurance premiums?",
        "What are the best practices for risk assessment in insurance?"
    ]
    
    for query in rag_queries:
        results['rag_results'][query] = llm_analyzer.query_rag(query)
    
    # Generate report
    report_status = generate_integrated_report(results)
    print(report_status)
    
    return results

In [10]:
def main():
    """Main function to run the entire analysis system"""
    try:
        # Load data
        data = pd.read_csv('insurance.csv')
        print("Data loaded successfully")
        
        # Initialize analyzers
        univariate_analyzer = UnivariateAnalyzer(data)
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key not found in environment variables")
        llm_analyzer = LLMAnalyzer(api_key)
        print("Analyzers initialized")
        
        # Setup RAG system
        rag_status = llm_analyzer.setup_rag('analysisRag.pdf')
        print(f"RAG setup status: {rag_status}")
        
        # Create and setup agent
        tools = llm_analyzer.create_tools(univariate_analyzer)
        agent = llm_analyzer.setup_agent(tools)
        print("Agent setup completed")
        
        # Run analysis and generate report
        results = run_analysis_with_report(univariate_analyzer, llm_analyzer, agent)
        print("Analysis completed and report generated")
        # Run the queries
        run_agent_queries(univariate_analyzer, llm_analyzer, agent, data)
        
    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        print("\nPlease ensure:")
        print("1. 'insurance.csv' file is in the working directory")
        print("2. 'analysis_guidelines.pdf' file is in the working directory")
        print("3. OPENAI_API_KEY environment variable is set")

if __name__ == "__main__":
    main()

Data loaded successfully


  self.llm = OpenAI(openai_api_key=api_key, temperature=0)
  self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)


Analyzers initialized
RAG setup status: RAG system initialized successfully
Agent setup completed


  return initialize_agent(
  response = self.llm(prompt)


Integrated report generated successfully
Analysis completed and report generated

Running Statistical Analysis Queries:

Query 1: Analyze the statistics of the bmi column
--------------------------------------------------------------------------------


[1m> Entering new AgentExecutor chain...[0m


  result = agent.run(query_info['query'])


[32;1m[1;3m I should use univariate analysis to get the statistics of a single column
Action: Univariate Analysis
Action Input: bmi[0m
Observation: [36;1m[1;3m{'mean': '30.66', 'median': '30.40', 'std': '6.10', 'var': '37.16', 'skew': '0.28', 'kurtosis': '-0.06', 'iqr': '8.40', 'range': '37.17', 'min': '15.96', 'max': '53.13', 'q1': '26.30', 'q3': '34.69', 'count': 1338, 'missing': 0, 'distribution': 'Non-normal'}[0m
Thought:[32;1m[1;3m I now know the statistics of the bmi column
Final Answer: The mean bmi is 30.66, the median bmi is 30.40, and the standard deviation is 6.10.[0m

[1m> Finished chain.[0m

Analysis Result (via Agent):

Falling back to direct analysis...

Analysis Result (via Backup):
Analysis failed: name 'print_formatted_results' is not defined


Query 2: Compare bmi statistics between different sex groups
--------------------------------------------------------------------------------


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I should use