In [2]:
import pandas as pd
import numpy as np
import os
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from langchain.agents import initialize_agent, Tool
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Suppress every Python warning raised after this point (no category or
# module filter is given, so the 'ignore' action applies to all of them).
warnings.filterwarnings('ignore')

In [3]:
def clean_column_names(*columns):
    """Normalize one or more column names given as free-form text.

    Each value is converted to ``str``, lower-cased, has every single and
    double quote character removed, and is stripped of surrounding
    whitespace.

    Args:
        *columns: Column names (any objects; ``str()`` is applied to each).

    Returns:
        list[str]: The normalized names, in the order they were passed.
    """
    cleaned = []
    for col in columns:
        name = str(col).strip().lower().replace("'", "").replace('"', "")
        # Strip again: whitespace that sat between a quote and the name
        # (e.g. "' age '") only becomes leading/trailing once the quotes
        # are removed.
        cleaned.append(name.strip())
    return cleaned

In [4]:
class BivariateAnalyzer:
    """Pairwise statistical summaries over the columns of a pandas DataFrame.

    Every public method returns a dict of results on success and a string
    beginning with "Error" on failure; exceptions are caught and reported
    in that string rather than propagated.
    """

    def __init__(self, data):
        """Store the frame and classify its columns by dtype.

        Args:
            data: pandas DataFrame to analyze.
        """
        self.data = data
        self.numeric_cols = set(data.select_dtypes(include=[np.number]).columns)
        self.categorical_cols = set(data.select_dtypes(exclude=[np.number]).columns)

    def _paired_observations(self, var1, var2):
        """Return both columns as float arrays, keeping only complete rows.

        A row is kept only when *both* columns are non-missing, so the two
        arrays always have equal length and stay aligned row by row.
        """
        first = self.data[var1]
        second = self.data[var2]
        complete = first.notna() & second.notna()
        return (
            first[complete].to_numpy(dtype=float),
            second[complete].to_numpy(dtype=float),
        )

    @staticmethod
    def _direction(r):
        """Describe the sign of a correlation coefficient."""
        if np.isnan(r):
            return 'undefined'
        if r > 0:
            return 'positive'
        if r < 0:
            return 'negative'
        return 'none'

    def compute_correlation_metrics(self, var1, var2):
        """Compute Pearson and Spearman correlation between two numeric columns.

        Args:
            var1: Name of the first column (normalized via clean_column_names).
            var2: Name of the second column (normalized via clean_column_names).

        Returns:
            dict with 'pearson', 'spearman' and 'relationship' sections, or an
            error string if the columns are not numeric or the computation fails.
        """
        try:
            var1, var2 = clean_column_names(var1, var2)

            # Validate columns are numerical
            if not all(col in self.numeric_cols for col in [var1, var2]):
                return f"Error: Both variables must be numeric. Current types: {var1}: {self.data[var1].dtype}, {var2}: {self.data[var2].dtype}"

            # Drop missing values pairwise; dropping them per column would
            # leave series of different lengths or shift rows out of alignment.
            data1, data2 = self._paired_observations(var1, var2)

            pearson_r, pearson_p = stats.pearsonr(data1, data2)
            spearman_r, spearman_p = stats.spearmanr(data1, data2)

            return {
                'pearson': {
                    'correlation': f"{pearson_r:.3f}",
                    'p_value': f"{pearson_p:.3f}",
                    'significance': 'Significant' if pearson_p < 0.05 else 'Not significant'
                },
                'spearman': {
                    'correlation': f"{spearman_r:.3f}",
                    'p_value': f"{spearman_p:.3f}",
                    'significance': 'Significant' if spearman_p < 0.05 else 'Not significant'
                },
                'relationship': {
                    'strength': self._interpret_correlation(pearson_r),
                    'direction': self._direction(pearson_r),
                    'magnitude': abs(pearson_r)
                }
            }
        except Exception as e:
            return f"Error in correlation analysis: {str(e)}"

    def _interpret_correlation(self, r):
        """Label |r| as 'weak' (<0.3), 'moderate' (<0.7) or 'strong'.

        NaN is reported as 'undefined'; without this check it would fail
        every ``<`` comparison and fall through to 'strong'.
        """
        abs_r = abs(r)
        if np.isnan(abs_r):
            return 'undefined'
        if abs_r < 0.3:
            return 'weak'
        elif abs_r < 0.7:
            return 'moderate'
        else:
            return 'strong'

    def compute_categorical_association(self, var1, var2):
        """Chi-square test and Cramér's V for two categorical columns.

        Args:
            var1: Name of the row variable of the contingency table.
            var2: Name of the column variable of the contingency table.

        Returns:
            dict with 'chi_square_test', 'association_strength' and
            'category_proportions' (row-normalized contingency table), or an
            error string if the computation fails.
        """
        try:
            var1, var2 = clean_column_names(var1, var2)

            # Create contingency table
            contingency = pd.crosstab(self.data[var1], self.data[var2])
            chi2, p_value, dof, _ = stats.chi2_contingency(contingency)

            # Calculate Cramer's V
            n = contingency.sum().sum()
            min_dim = min(contingency.shape) - 1
            if n > 0 and min_dim > 0:
                cramer_v = np.sqrt(chi2 / (n * min_dim))
            else:
                # A variable with a single observed category carries no
                # association; avoid 0/0 -> NaN being labelled 'strong'.
                cramer_v = 0.0

            # Calculate proportions (each row of the table sums to 1)
            proportions = contingency.div(contingency.sum(axis=1), axis=0)

            return {
                'chi_square_test': {
                    'statistic': f"{chi2:.3f}",
                    'p_value': f"{p_value:.3f}",
                    'degrees_of_freedom': dof,
                    'significance': 'Significant' if p_value < 0.05 else 'Not significant'
                },
                'association_strength': {
                    'cramers_v': f"{cramer_v:.3f}",
                    'interpretation': self._interpret_cramers_v(cramer_v)
                },
                'category_proportions': proportions.to_dict()
            }
        except Exception as e:
            return f"Error in categorical association analysis: {str(e)}"

    def _interpret_cramers_v(self, v):
        """Label Cramér's V: 'negligible' (<0.1), 'weak' (<0.3), 'moderate' (<0.5), else 'strong'.

        NaN is reported as 'undefined'.
        """
        if np.isnan(v):
            return 'undefined'
        if v < 0.1:
            return 'negligible'
        elif v < 0.3:
            return 'weak'
        elif v < 0.5:
            return 'moderate'
        else:
            return 'strong'

    def analyze_by_group(self, var1, var2, grouping_var):
        """Repeat the var1/var2 analysis separately for each value of grouping_var.

        Correlation is used when both variables are numeric in the full
        frame; otherwise the categorical association is computed.

        Args:
            var1: Name of the first analysis column.
            var2: Name of the second analysis column.
            grouping_var: Name of the column whose distinct values define the groups.

        Returns:
            dict keyed by ``str(group)`` with 'sample_size', 'percentage' (of
            all rows in the full frame) and 'analysis', or an error string.
        """
        try:
            var1, var2, grouping_var = clean_column_names(var1, var2, grouping_var)

            # The analysis type depends only on the full frame's dtypes,
            # so decide once instead of on every iteration.
            both_numeric = all(var in self.numeric_cols for var in [var1, var2])
            total_rows = len(self.data)

            group_stats = {}
            # Missing group labels are skipped: `column == NaN` never matches,
            # so such a "group" would always be an empty subset.
            for group in self.data[grouping_var].dropna().unique():
                group_data = self.data[self.data[grouping_var] == group]

                # Create analyzer for group subset
                group_analyzer = BivariateAnalyzer(group_data)

                if both_numeric:
                    analysis = group_analyzer.compute_correlation_metrics(var1, var2)
                else:
                    analysis = group_analyzer.compute_categorical_association(var1, var2)

                group_stats[str(group)] = {
                    'sample_size': len(group_data),
                    'percentage': f"{(len(group_data) / total_rows) * 100:.1f}%",
                    'analysis': analysis
                }

            return group_stats
        except Exception as e:
            return f"Error in group analysis: {str(e)}"

    def compute_regression_metrics(self, var1, var2):
        """Fit a simple linear regression of var2 (response) on var1 (predictor).

        Args:
            var1: Name of the numeric predictor column.
            var2: Name of the numeric response column.

        Returns:
            dict with 'model_parameters', 'statistical_significance' and
            'model_performance' sections, or an error string if the columns
            are not numeric or the fit fails.
        """
        try:
            var1, var2 = clean_column_names(var1, var2)

            # Validate columns are numerical
            if not all(col in self.numeric_cols for col in [var1, var2]):
                return f"Error: Both variables must be numeric. Current types: {var1}: {self.data[var1].dtype}, {var2}: {self.data[var2].dtype}"

            # Rows with a missing value in either column are excluded;
            # otherwise every fitted metric would come back as NaN.
            x, y = self._paired_observations(var1, var2)

            # Compute regression metrics
            slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

            # Calculate additional metrics from the in-sample residuals
            residuals = y - (slope * x + intercept)
            mse = np.mean(residuals ** 2)
            rmse = np.sqrt(mse)

            return {
                'model_parameters': {
                    'slope': f"{slope:.3f}",
                    'intercept': f"{intercept:.3f}",
                    'r_squared': f"{r_value ** 2:.3f}"
                },
                'statistical_significance': {
                    'p_value': f"{p_value:.3f}",
                    'std_error': f"{std_err:.3f}",
                    'significance': 'Significant' if p_value < 0.05 else 'Not significant'
                },
                'model_performance': {
                    'mse': f"{mse:.3f}",
                    'rmse': f"{rmse:.3f}",
                    'prediction_quality': self._interpret_r_squared(r_value ** 2)
                }
            }
        except Exception as e:
            return f"Error in regression analysis: {str(e)}"

    def _interpret_r_squared(self, r_squared):
        """Label R²: 'poor' (<0.3), 'moderate' (<0.6), else 'good'; NaN is 'undefined'."""
        if np.isnan(r_squared):
            return 'undefined'
        if r_squared < 0.3:
            return 'poor'
        elif r_squared < 0.6:
            return 'moderate'
        else:
            return 'good'

In [5]:

class BivariateRAG:
    """LangChain wiring: an LLM, a FAISS document index, and analysis tools."""

    def __init__(self, api_key):
        """Create the LLM and embedding clients; no document index is loaded yet.

        Args:
            api_key: OpenAI API key passed to both the LLM and the embeddings client.
        """
        self.llm = OpenAI(openai_api_key=api_key, temperature=0)
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.vector_store = None

    def setup_rag(self, pdf_path):
        """Load a PDF, split it into chunks and index the chunks in FAISS.

        Args:
            pdf_path: Path of the PDF document to index.

        Returns:
            str: A success message, or an error message if the file is
            missing or indexing fails (no exception is raised).
        """
        try:
            if not os.path.exists(pdf_path):
                return f"Error: PDF file '{pdf_path}' not found"

            documents = PyPDFLoader(pdf_path).load()
            text_splitter = CharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200
            )
            texts = text_splitter.split_documents(documents)
            self.vector_store = FAISS.from_documents(texts, self.embeddings)
            return "RAG system initialized successfully"
        except Exception as e:
            return f"Error setting up RAG system: {str(e)}"

    @staticmethod
    def _column_tool(analysis_func, expected_args):
        """Wrap an analyzer method so it accepts one comma-separated string.

        The wrapper returns an error message instead of raising when the
        input does not contain exactly ``expected_args`` names, so the
        agent receives an observation it can react to.
        """
        def run(tool_input):
            # Blank fields (e.g. from a trailing comma) are ignored.
            parts = [part.strip() for part in str(tool_input).split(',') if part.strip()]
            if len(parts) != expected_args:
                return (
                    f"Error: expected {expected_args} comma-separated column name(s) "
                    f"but received {len(parts)} in input {tool_input!r}"
                )
            return analysis_func(*parts)
        return run

    def create_tools(self, bivariate_analyzer):
        """Build the list of agent tools backed by the analyzer and the RAG index.

        Args:
            bivariate_analyzer: Object providing compute_correlation_metrics,
                compute_categorical_association, analyze_by_group and
                compute_regression_metrics.

        Returns:
            list: Five Tool objects (four analyses plus the RAG query).
        """
        tools = [
            Tool(
                name="Correlation Analysis",
                func=self._column_tool(bivariate_analyzer.compute_correlation_metrics, 2),
                description="Analyze correlation between two numeric variables. Input: 'var1,var2' (e.g., 'age,charges')"
            ),
            Tool(
                name="Categorical Association",
                func=self._column_tool(bivariate_analyzer.compute_categorical_association, 2),
                description="Analyze association between categorical variables. Input: 'var1,var2' (e.g., 'sex,smoker')"
            ),
            Tool(
                name="Group Analysis",
                func=self._column_tool(bivariate_analyzer.analyze_by_group, 3),
                description="Analyze relationships within groups. Input: 'var1,var2,group' (e.g., 'age,charges,smoker')"
            ),
            Tool(
                name="Regression Analysis",
                func=self._column_tool(bivariate_analyzer.compute_regression_metrics, 2),
                description="Perform regression analysis between numeric variables. Input: 'var1,var2' (e.g., 'bmi,charges')"
            ),
            Tool(
                name="RAG Query",
                func=lambda x: self.query_rag(x),
                description="Query documentation for insights about variable relationships"
            )
        ]
        return tools

    def setup_agent(self, tools):
        """Create a zero-shot ReAct agent over the given tools.

        The agent is verbose and limited to five iterations per run.
        """
        return initialize_agent(
            tools,
            self.llm,
            agent="zero-shot-react-description",
            verbose=True,
            max_iterations=5
        )

    def query_rag(self, query):
        """Answer a question using the three most similar indexed chunks as context.

        Args:
            query: Natural-language question.

        Returns:
            The LLM's response, or a message string if the index has not been
            built or the query fails (no exception is raised).
        """
        try:
            if self.vector_store is None:
                return "RAG system not initialized. Please load a PDF first."

            relevant_docs = self.vector_store.similarity_search(query, k=3)
            context = "\n".join([doc.page_content for doc in relevant_docs])

            prompt = f"""
            Context: {context}
            
            Query: {query}
            
            Based on the context above and the insurance dataset, provide a detailed analysis of the relationships between variables:
            """

            return self.llm(prompt)
        except Exception as e:
            return f"Error querying RAG system: {str(e)}"

def _title_label(key):
    """Turn a dict key of any type into a display label ('p_value' -> 'P Value')."""
    return str(key).replace('_', ' ').title()


def _print_mapping(mapping, indent):
    """Print a dict recursively, indenting each nesting level by two more spaces."""
    pad = " " * indent
    for key, value in mapping.items():
        if isinstance(value, dict):
            print(f"{pad}{_title_label(key)}:")
            _print_mapping(value, indent + 2)
        else:
            print(f"{pad}{_title_label(key)}: {value}")


def print_formatted_results(result):
    """Print an analysis result in an indented, human-readable layout.

    A dict is printed section by section: each top-level key becomes a
    heading preceded by a blank line, and nested dicts are expanded to any
    depth. Keys may be of any type (they are converted with ``str``).
    Anything that is not a dict is printed unchanged.

    Args:
        result: A (possibly nested) dict of results, or any other printable value.
    """
    if not isinstance(result, dict):
        print(result)
        return

    for key, value in result.items():
        print(f"\n{_title_label(key)}:")
        if isinstance(value, dict):
            _print_mapping(value, 2)
        else:
            print(f"  {value}")

def run_bivariate_queries(bivariate_analyzer, rag_system, agent, data):
    """Run the five standard queries through the agent, with a direct fallback.

    Each query is first sent to ``agent.run``. If that raises, the reason is
    printed and the matching analyzer / RAG method is called directly
    instead. Results are printed; nothing is returned.

    Args:
        bivariate_analyzer: Analyzer providing the four statistical methods.
        rag_system: Object providing ``query_rag``.
        agent: Object providing ``run(query)``.
        data: Not used by this function; kept so existing callers keep working.
    """
    queries = [
        {
            "query": "Analyze the relationship between age and charges",
            "expected_tool": "Correlation Analysis",
            "backup_func": lambda: bivariate_analyzer.compute_correlation_metrics("age", "charges")
        },
        {
            "query": "How are smoking status and gender related?",
            "expected_tool": "Categorical Association",
            "backup_func": lambda: bivariate_analyzer.compute_categorical_association("smoker", "sex")
        },
        {
            "query": "How does the relationship between BMI and charges vary by smoking status?",
            "expected_tool": "Group Analysis",
            "backup_func": lambda: bivariate_analyzer.analyze_by_group("bmi", "charges", "smoker")
        },
        {
            "query": "Provide a regression analysis of BMI and charges",
            "expected_tool": "Regression Analysis",
            "backup_func": lambda: bivariate_analyzer.compute_regression_metrics("bmi", "charges")
        },
        {
            "query": "What factors influence the relationship between age and insurance charges?",
            "expected_tool": "RAG Query",
            "backup_func": lambda: rag_system.query_rag("How do age and charges relate in insurance pricing?")
        }
    ]

    for i, query_info in enumerate(queries, 1):
        print(f"\nQuery {i}: {query_info['query']}")
        print("-" * 80)

        try:
            result = agent.run(query_info['query'])
            print("\nAnalysis Result (via Agent):")
            print_formatted_results(result)

        except Exception as agent_error:
            # Report why the agent path failed instead of hiding it, so
            # problems such as a bad API key or a parsing error stay visible.
            print(f"\nAgent failed: {str(agent_error)}")
            print("Falling back to direct analysis...")
            try:
                result = query_info['backup_func']()
                print("\nAnalysis Result (via Backup):")
                print_formatted_results(result)
            except Exception as backup_error:
                print(f"Analysis failed: {str(backup_error)}")

        print("\n" + "=" * 80)

In [6]:
def main():
    """Run the whole pipeline: load data, build the agent, execute all queries.

    Any exception is caught and reported together with a checklist of the
    files and environment variable the run depends on; nothing is returned.
    """
    # Single source of truth for the input files, so the troubleshooting
    # hints below always name the files that are actually loaded.
    data_path = 'insurance.csv'
    pdf_path = 'analysisRag.pdf'

    try:
        # Load data
        data = pd.read_csv(data_path)
        print("Data loaded successfully")

        # Initialize analyzers
        bivariate_analyzer = BivariateAnalyzer(data)
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key not found in environment variables")
        rag_system = BivariateRAG(api_key)
        print("Analyzers initialized")

        # Setup RAG system (failure is reported as a status string, not raised)
        rag_status = rag_system.setup_rag(pdf_path)
        print(f"RAG setup status: {rag_status}")

        # Create and setup agent
        tools = rag_system.create_tools(bivariate_analyzer)
        agent = rag_system.setup_agent(tools)
        print("Agent setup completed")

        # Run the bivariate queries
        print("\nRunning standard bivariate analyses:")
        run_bivariate_queries(bivariate_analyzer, rag_system, agent, data)

        # Additional RAG queries for deeper insights
        print("\nRunning additional RAG queries for deeper insights:")
        additional_queries = [
            "What combinations of variables are most predictive of insurance charges?",
            "How do demographic factors interact with health indicators to affect pricing?",
            "What are the key patterns in the relationships between age, BMI, and charges?",
            "How does smoking status modify other variable relationships?"
        ]

        for i, query in enumerate(additional_queries, 1):
            print(f"\nDeep Insight Query {i}: {query}")
            print("-" * 80)
            try:
                result = rag_system.query_rag(query)
                print("\nInsight Result:")
                print(result)
                print("=" * 80)
            except Exception as e:
                print(f"Error getting insight: {str(e)}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        print("\nPlease ensure:")
        print(f"1. '{data_path}' file is in the working directory")
        print(f"2. '{pdf_path}' file is in the working directory")
        print("3. OPENAI_API_KEY environment variable is set")

# Entry point: call main() only when this code runs as the top-level program
# (i.e. the module's __name__ is "__main__").
if __name__ == "__main__":
    main()

Data loaded successfully


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)


Analyzers initialized
RAG setup status: RAG system initialized successfully
Agent setup completed

Running standard bivariate analyses:

Query 1: Analyze the relationship between age and charges
--------------------------------------------------------------------------------


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I should use correlation analysis to see if there is a linear relationship between these two variables.
Action: Correlation Analysis
Action Input: 'age,charges'[0m
Observation: [36;1m[1;3m{'pearson': {'correlation': '0.299', 'p_value': '0.000', 'significance': 'Significant'}, 'spearman': {'correlation': '0.534', 'p_value': '0.000', 'significance': 'Significant'}, 'relationship': {'strength': 'weak', 'direction': 'positive', 'magnitude': 0.2990081933306476}}[0m
Thought:[32;1m[1;3m The correlation analysis shows a weak positive relationship between age and charges.
Action: Regression Analysis
Action Input: 'age,charges'[0m
Observation: [36;1m[1;3m