In [4]:
# Excel Analytics Chatbot using Open-Source LLM
# Run this in Google Colab for best results

# Install required packages
!pip install transformers torch gradio pandas openpyxl xlrd sentence-transformers faiss-cpu

import pandas as pd
import numpy as np
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import json
import io
import base64
from typing import List, Dict, Any

class ExcelAnalyticsChatbot:
    def __init__(self):
        """Set up the text-generation model, the sentence encoder and empty data state.

        If loading or configuring the causal language model raises, the error
        is printed and both ``self.tokenizer`` and ``self.model`` are set to
        ``None`` (a "rule-based" fallback notice is printed as well).
        """
        print("Loading language model...")

        # Checkpoint to load. Alternatives noted for this project:
        #   "microsoft/DialoGPT-small"      - smaller conversational model
        #   "Salesforce/codegen-350M-mono"  - code/analytical tasks
        model_name = "gpt2"

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(model_name)

            # The tokenizer needs a padding token before it can be called with
            # padding enabled.
            if self.tokenizer.pad_token is None:
                if model_name != "gpt2":
                    # Register a dedicated [PAD] token and grow the embedding
                    # matrix to cover the enlarged vocabulary.
                    self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                    self.model.resize_token_embeddings(len(self.tokenizer))
                else:
                    # For gpt2 the end-of-sequence token doubles as padding.
                    self.tokenizer.pad_token = self.tokenizer.eos_token

            # Switch the model to evaluation mode.
            self.model.eval()

            print(f"✅ Model {model_name} loaded successfully!")

        except Exception as load_error:
            print(f"Error loading model: {load_error}")
            print("Falling back to rule-based analysis...")
            self.tokenizer = self.model = None

        # Sentence encoder used to embed dataset descriptions and queries.
        print("Loading sentence transformer...")
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Nothing is loaded yet: dataset, its summary and the search index
        # are all filled in once a file has been loaded and analyzed.
        self.df = self.data_summary = self.column_info = None
        self.embeddings = self.faiss_index = None

        print("Chatbot initialized successfully!")

    def load_excel_file(self, file_path):
        """Load and analyze Excel/CSV file"""
        try:
            # Read file based on extension
            file_ext = file_path.lower().split('.')[-1]

            if file_ext in ['xlsx', 'xls']:
                self.df = pd.read_excel(file_path)
                print(f"✅ Excel file loaded: {self.df.shape}")

            elif file_ext == 'csv':
                # Enhanced CSV loading with multiple attempts
                print("🔄 Attempting to load CSV file...")

                # Try different configurations
                csv_configs = [
                    {'encoding': 'utf-8', 'sep': ','},
                    {'encoding': 'utf-8', 'sep': ';'},
                    {'encoding': 'latin-1', 'sep': ','},
                    {'encoding': 'cp1252', 'sep': ','},
                    {'encoding': 'utf-8', 'sep': '\t'},
                    {'encoding': 'utf-8', 'sep': '|'},
                ]

                loaded = False
                for i, config in enumerate(csv_configs):
                    try:
                        print(f"Trying config {i+1}: {config}")
                        self.df = pd.read_csv(file_path, **config)

                        # Validate the loaded data
                        if (self.df.shape[1] > 1 and
                            self.df.shape[0] > 0 and
                            not self.df.columns.str.contains(';').any()):  # Check if separator was wrong

                            print(f"✅ CSV loaded successfully with config {i+1}: {self.df.shape}")
                            loaded = True
                            break
                    except Exception as e:
                        print(f"Config {i+1} failed: {str(e)[:50]}...")
                        continue

                if not loaded:
                    # Final attempt with pandas auto-detection
                    print("🔄 Trying pandas auto-detection...")
                    self.df = pd.read_csv(file_path)

            else:
                return "❌ Error: Please upload an Excel (.xlsx, .xls) or CSV file."

            # Validate that we have meaningful data
            if self.df.empty:
                return "❌ Error: The uploaded file appears to be empty."

            if self.df.shape[1] == 1:
                return "⚠️ Warning: Only one column detected. Please check if the CSV separator is correct."

            # Clean and prepare data
            print("🔄 Cleaning data...")

            # Clean column names
            original_columns = list(self.df.columns)
            self.df.columns = self.df.columns.astype(str).str.strip().str.replace('\n', ' ').str.replace('\r', ' ')

            # Remove completely empty rows and columns
            self.df = self.df.dropna(how='all').dropna(axis=1, how='all')

            print(f"📊 Final dataset shape: {self.df.shape}")
            print(f"📋 Columns: {list(self.df.columns)}")

            # Generate data summary
            self.analyze_data()

            return f"""✅ File loaded successfully!

**Dataset Information:**
- **File Type:** {file_ext.upper()}
- **Shape:** {self.df.shape[0]} rows × {self.df.shape[1]} columns
- **Columns:** {', '.join(list(self.df.columns)[:5])}{'...' if len(self.df.columns) > 5 else ''}

**Sample Data Preview:**
{self.df.head(2).to_string()}

You can now ask questions about your data! 🚀"""

        except Exception as e:
            error_msg = str(e)
            return f"""❌ Error loading file: {error_msg}

**Troubleshooting Tips:**
- Ensure the file is not corrupted or password-protected
- For CSV files, try saving with UTF-8 encoding
- Check if the file has proper column headers
- Make sure the file contains actual data (not just headers)
- Try opening the file in Excel/LibreOffice first to verify it's readable

**Supported formats:** .xlsx, .xls, .csv"""

    def analyze_data(self):
        """Analyze the loaded data and create searchable embeddings"""
        if self.df is None:
            return

        # Basic data analysis
        self.data_summary = {
            'shape': self.df.shape,
            'columns': list(self.df.columns),
            'dtypes': self.df.dtypes.to_dict(),
            'null_counts': self.df.isnull().sum().to_dict(),
            'numeric_summary': self.df.describe().to_dict() if len(self.df.select_dtypes(include=[np.number]).columns) > 0 else {},
            'categorical_summary': {}
        }

        # Analyze categorical columns
        categorical_cols = self.df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            unique_values = self.df[col].value_counts().head(10)
            self.data_summary['categorical_summary'][col] = unique_values.to_dict()

        # Create text representations for semantic search
        text_representations = []

        # Add column information
        for col in self.df.columns:
            col_info = f"Column: {col}, Type: {self.df[col].dtype}"
            if col in self.data_summary['numeric_summary']:
                stats = self.data_summary['numeric_summary'][col]
                col_info += f", Mean: {stats.get('mean', 'N/A')}, Max: {stats.get('max', 'N/A')}, Min: {stats.get('min', 'N/A')}"
            text_representations.append(col_info)

        # Add sample data representations
        for idx, row in self.df.head(10).iterrows():
            row_text = f"Row {idx}: " + ", ".join([f"{col}={val}" for col, val in row.items()])
            text_representations.append(row_text)

        # Create embeddings
        self.embeddings = self.sentence_model.encode(text_representations)

        # Create FAISS index for fast similarity search
        dimension = self.embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(dimension)
        self.faiss_index.add(self.embeddings.astype('float32'))

        self.text_representations = text_representations

    def get_relevant_context(self, query: str, top_k: int = 5) -> str:
        """Get relevant context from the data based on the query"""
        if self.faiss_index is None:
            return ""

        # Encode query
        query_embedding = self.sentence_model.encode([query])

        # Search for similar content
        scores, indices = self.faiss_index.search(query_embedding.astype('float32'), top_k)

        # Get relevant text
        relevant_texts = [self.text_representations[idx] for idx in indices[0]]

        return "\n".join(relevant_texts)

    def generate_insights(self, query: str) -> str:
        """Generate analytical insights based on the query"""
        if self.df is None:
            return "Please upload an Excel file first."

        # Get relevant context
        context = self.get_relevant_context(query)

        # If LLM is not available, use rule-based analysis
        if self.model is None or self.tokenizer is None:
            return self.generate_data_driven_response(query)

        # Create a comprehensive prompt
        prompt = f"""Data Analysis Query:

Dataset Information:
- Rows: {self.data_summary['shape'][0]}, Columns: {self.data_summary['shape'][1]}
- Column Names: {', '.join(self.data_summary['columns'][:5])}{'...' if len(self.data_summary['columns']) > 5 else ''}

Context: {context[:200]}...

Question: {query}

Analysis:"""

        try:
            # Generate response using the LLM with proper attention mask
            encoded = self.tokenizer(
                prompt,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=400,  # Reduced for faster processing
                return_attention_mask=True
            )

            input_ids = encoded['input_ids']
            attention_mask = encoded['attention_mask']

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=input_ids.shape[1] + 100,  # Shorter response
                    num_return_sequences=1,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=2,  # Avoid repetition
                    early_stopping=True
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract only the generated part
            if "Analysis:" in response:
                response = response.split("Analysis:")[-1].strip()
            else:
                response = response[len(prompt):].strip()

            # If the response is too short or doesn't make sense, use data-driven approach
            if len(response) < 20 or response.count('.') < 1:
                return self.generate_data_driven_response(query)

            # Combine LLM response with data-driven insights
            data_insights = self.generate_data_driven_response(query)
            return f"🤖 **AI Analysis:**\n{response}\n\n📊 **Data-Driven Insights:**\n{data_insights}"

        except Exception as e:
            print(f"LLM generation error: {e}")
            return self.generate_data_driven_response(query)

    def generate_data_driven_response(self, query: str) -> str:
        """Generate response using direct data analysis"""
        query_lower = query.lower()

        # Handle different types of queries
        if any(word in query_lower for word in ['summary', 'overview', 'describe']):
            return self.get_data_summary()

        elif any(word in query_lower for word in ['correlation', 'correlate']):
            return self.get_correlation_analysis()

        elif any(word in query_lower for word in ['missing', 'null', 'empty']):
            return self.get_missing_data_analysis()

        elif any(word in query_lower for word in ['distribution', 'histogram']):
            return self.get_distribution_analysis()

        elif 'trend' in query_lower:
            return self.get_trend_analysis()

        else:
            return self.get_general_insights(query)

    def get_data_summary(self) -> str:
        """Get comprehensive data summary"""
        summary = f"📊 **Data Summary**\n\n"
        summary += f"**Dataset Shape:** {self.df.shape[0]} rows × {self.df.shape[1]} columns\n\n"

        summary += "**Column Information:**\n"
        for col, dtype in self.data_summary['dtypes'].items():
            null_count = self.data_summary['null_counts'][col]
            summary += f"- {col}: {dtype} ({null_count} missing values)\n"

        if self.data_summary['numeric_summary']:
            summary += "\n**Numeric Columns Statistics:**\n"
            for col, stats in self.data_summary['numeric_summary'].items():
                summary += f"- {col}: Mean={stats.get('mean', 0):.2f}, Std={stats.get('std', 0):.2f}\n"

        return summary

    def get_correlation_analysis(self) -> str:
        """Analyze correlations between numeric columns"""
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) < 2:
            return "Not enough numeric columns for correlation analysis."

        corr_matrix = self.df[numeric_cols].corr()

        # Find strongest correlations
        correlations = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if not np.isnan(corr_val):
                    correlations.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_val))

        correlations.sort(key=lambda x: abs(x[2]), reverse=True)

        result = "🔗 **Correlation Analysis**\n\n"
        result += "**Strongest Correlations:**\n"

        for col1, col2, corr in correlations[:5]:
            strength = "Strong" if abs(corr) > 0.7 else "Moderate" if abs(corr) > 0.3 else "Weak"
            direction = "positive" if corr > 0 else "negative"
            result += f"- {col1} ↔ {col2}: {corr:.3f} ({strength} {direction})\n"

        return result

    def get_missing_data_analysis(self) -> str:
        """Analyze missing data patterns"""
        missing_data = self.df.isnull().sum()
        missing_data = missing_data[missing_data > 0].sort_values(ascending=False)

        if missing_data.empty:
            return "✅ No missing data found in the dataset!"

        result = "🔍 **Missing Data Analysis**\n\n"
        total_rows = len(self.df)

        for col, count in missing_data.items():
            percentage = (count / total_rows) * 100
            result += f"- {col}: {count} missing ({percentage:.1f}%)\n"

        return result

    def get_distribution_analysis(self) -> str:
        """Analyze data distributions"""
        result = "📈 **Distribution Analysis**\n\n"

        numeric_cols = self.df.select_dtypes(include=[np.number]).columns

        for col in numeric_cols[:3]:  # Analyze first 3 numeric columns
            series = self.df[col].dropna()
            result += f"**{col}:**\n"
            result += f"- Range: {series.min():.2f} to {series.max():.2f}\n"
            result += f"- Median: {series.median():.2f}\n"
            result += f"- Skewness: {series.skew():.2f}\n\n"

        return result

    def get_trend_analysis(self) -> str:
        """Basic trend analysis"""
        result = "📊 **Trend Analysis**\n\n"

        # Look for date columns
        date_cols = self.df.select_dtypes(include=['datetime64']).columns

        if len(date_cols) == 0:
            # Try to find columns that might be dates
            potential_date_cols = [col for col in self.df.columns if 'date' in col.lower() or 'time' in col.lower()]
            if potential_date_cols:
                result += f"Potential date columns found: {', '.join(potential_date_cols)}\n"
                result += "Consider converting these to datetime format for trend analysis.\n"
            else:
                result += "No date/time columns found for trend analysis.\n"
        else:
            result += f"Date columns available: {', '.join(date_cols)}\n"
            result += "Trend analysis can be performed on time-series data.\n"

        return result

    def get_general_insights(self, query: str) -> str:
        """Generate general insights based on the query"""
        result = "💡 **General Insights**\n\n"

        # Try to find relevant columns based on query keywords
        query_words = query.lower().split()
        relevant_cols = []

        for word in query_words:
            for col in self.df.columns:
                if word in col.lower():
                    relevant_cols.append(col)

        if relevant_cols:
            result += f"Found relevant columns: {', '.join(set(relevant_cols))}\n\n"

            for col in set(relevant_cols):
                if self.df[col].dtype in ['int64', 'float64']:
                    result += f"**{col}:** Mean = {self.df[col].mean():.2f}, Std = {self.df[col].std():.2f}\n"
                else:
                    top_values = self.df[col].value_counts().head(3)
                    result += f"**{col}:** Top values = {dict(top_values)}\n"
        else:
            result += "I'd be happy to help analyze your data! Try asking about:\n"
            result += "- Data summary or overview\n"
            result += "- Correlations between columns\n"
            result += "- Missing data analysis\n"
            result += "- Distribution of specific columns\n"

        return result

# Module-level chatbot instance used by the Gradio callbacks defined below.
# Constructing it runs ExcelAnalyticsChatbot.__init__, which loads the
# language model and the sentence encoder, so this executes as soon as the
# cell/module is run.
chatbot = ExcelAnalyticsChatbot()

def process_file_and_query(file, query):
    """Load the uploaded file (if any) and answer the question (if any).

    Args:
        file: The upload from the file component - either a path string or an
            object exposing the path as ``.name`` - or ``None``.
        query: The question text; ``None`` and whitespace-only values are
            treated as "no question".

    Returns:
        The text to display: the load report, the answer, both, or a hint
        asking for input. When the file could not be loaded only the load
        report is returned, so a question is never answered against data
        that failed to load.
    """
    question = (query or "").strip()

    if file is None:
        if question:
            return chatbot.generate_insights(question)
        return "Please upload an Excel/CSV file and/or ask a question about your data."

    try:
        # Accept both plain path strings and file-like wrappers.
        file_path = file if isinstance(file, str) else file.name
        file_ext = file_path.lower().split('.')[-1]

        # Debug info
        print(f"Processing file: {file_path}, Extension: {file_ext}")

        load_result = chatbot.load_excel_file(file_path)

        # load_excel_file reports success with a leading check mark; any
        # other report is an error or warning and is returned as-is.
        if not question or not load_result.startswith("✅"):
            return load_result

        response = chatbot.generate_insights(question)
        return f"{load_result}\n\n---\n\n**Your Question:** {question}\n\n**Answer:**\n{response}"

    except Exception as e:
        return f"Error processing file: {str(e)}\n\nPlease make sure you've uploaded a valid Excel (.xlsx, .xls) or CSV file."

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI and wire its buttons to the chatbot.

    The layout contains a file upload, a sample-data button, a question box,
    an analyze button, a results box and a set of example questions.

    Returns:
        The configured ``gr.Blocks`` interface (not yet launched).
    """
    with gr.Blocks(title="Excel Analytics Chatbot", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 📊 Excel Analytics Chatbot

        Upload your Excel/CSV file and ask questions about your data! This chatbot uses open-source LLMs to provide analytical insights.

        **Supported File Types:** .xlsx, .xls, .csv

        **Example Questions:**
        - "Give me a summary of this data"
        - "What are the correlations between columns?"
        - "Which columns have missing data?"
        - "Show me the distribution of [column name]"
        - "What trends can you identify?"

        **CSV Troubleshooting:** If your CSV doesn't load properly, make sure it's saved with UTF-8 encoding and uses comma separators.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                file_upload = gr.File(
                    label="📁 Upload Excel/CSV File",
                    file_types=[".xlsx", ".xls", ".csv"]
                )

                # Add a test CSV button
                test_csv_btn = gr.Button("🧪 Test with Sample CSV", variant="secondary", size="sm")

            with gr.Column(scale=2):
                query_input = gr.Textbox(
                    label="❓ Ask a question about your data",
                    placeholder="e.g., 'What are the main patterns in this data?'",
                    lines=3
                )

        submit_btn = gr.Button("🔍 Analyze Data", variant="primary", size="lg")

        output = gr.Textbox(
            label="📋 Analysis Results",
            lines=15,
            max_lines=30
        )

        def create_test_csv():
            """Write a small sample dataset to a temporary CSV and return its path."""
            import tempfile

            sample_data = {
                'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Tablet'],
                'Price': [999.99, 25.50, 75.00, 299.99, 449.99],
                'Sales': [150, 500, 300, 120, 200],
                'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Electronics'],
                'Rating': [4.5, 4.2, 4.0, 4.8, 4.3]
            }

            df = pd.DataFrame(sample_data)

            # Write through the handle we already hold instead of reopening
            # the path while it is still open; delete=False keeps the file
            # on disk after the ``with`` block so the loader can read it.
            with tempfile.NamedTemporaryFile(
                mode='w', suffix='.csv', delete=False, newline='', encoding='utf-8'
            ) as temp_file:
                df.to_csv(temp_file, index=False)

            return temp_file.name

        def test_with_sample_csv():
            """Load the sample CSV into the chatbot and report the outcome."""
            import os

            csv_file = None
            try:
                csv_file = create_test_csv()
                result = chatbot.load_excel_file(csv_file)

                # Only announce success when the loader reported success
                # (its success report starts with a check mark).
                if not result.startswith("✅"):
                    return result

                return result + "\n\n🎯 **Sample loaded successfully!** You can now test queries like 'What is the average price?' or 'Show correlation between price and sales'."
            except Exception as e:
                return f"Error creating sample CSV: {str(e)}"
            finally:
                # Remove the temporary file on every path, including failures.
                if csv_file is not None and os.path.exists(csv_file):
                    os.unlink(csv_file)

        # Examples
        gr.Examples(
            examples=[
                [None, "Give me a summary of the data"],
                [None, "What correlations exist between numeric columns?"],
                [None, "Which columns have missing values?"],
                [None, "Analyze the distribution of the data"],
                [None, "What insights can you provide about this dataset?"],
                [None, "Show me statistics for all numeric columns"],
                [None, "What are the unique values in categorical columns?"]
            ],
            inputs=[file_upload, query_input]
        )

        submit_btn.click(
            fn=process_file_and_query,
            inputs=[file_upload, query_input],
            outputs=output
        )

        test_csv_btn.click(
            fn=test_with_sample_csv,
            outputs=output
        )

    return interface

# Launch the interface when this file is run as a script/notebook cell.
# share=True asks Gradio for a public share URL, and debug=True keeps the
# call blocking so errors and logs stay visible (both effects are reported
# in the captured run output below this cell).
if __name__ == "__main__":
    interface = create_interface()
    interface.launch(share=True, debug=True)

Loading language model...
✅ Model gpt2 loaded successfully!
Loading sentence transformer...
Chatbot initialized successfully!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://e001e15c71e35a803d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processing file: /tmp/gradio/f1ac34adc9959754c44f14f901416071eef3497736a176c94d0d91441ea885ca/sales_data.csv, Extension: csv
🔄 Attempting to load CSV file...
Trying config 1: {'encoding': 'utf-8', 'sep': ','}
✅ CSV loaded successfully with config 1: (4, 13)
🔄 Cleaning data...
📊 Final dataset shape: (4, 13)
📋 Columns: ['1', '2024-01-01', 'Alice Shah', 'alice@example.com', '9876543210', 'Laptop', 'Electronics', '1.1', '60000.0', '60000.0.1', 'Credit Card', 'Mumbai, MH', 'Delivered']


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processing file: /tmp/gradio/e1dfc3e1f9f864339749d73d86f43c45bde46dbe35f2c47385415bc9bcab4f0d/sales_data_sample copy.csv, Extension: csv
🔄 Attempting to load CSV file...
Trying config 1: {'encoding': 'utf-8', 'sep': ','}
Config 1 failed: 'utf-8' codec can't decode byte 0x84 in position 8...
Trying config 2: {'encoding': 'utf-8', 'sep': ';'}
Config 2 failed: 'utf-8' codec can't decode byte 0x84 in position 1...
Trying config 3: {'encoding': 'latin-1', 'sep': ','}
✅ CSV loaded successfully with config 3: (2823, 25)
🔄 Cleaning data...
📊 Final dataset shape: (2823, 25)
📋 Columns: ['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER', 'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE', 'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME', 'DEALSIZE']


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processing file: /tmp/gradio/e1dfc3e1f9f864339749d73d86f43c45bde46dbe35f2c47385415bc9bcab4f0d/sales_data_sample copy.csv, Extension: csv
🔄 Attempting to load CSV file...
Trying config 1: {'encoding': 'utf-8', 'sep': ','}
Config 1 failed: 'utf-8' codec can't decode byte 0x84 in position 8...
Trying config 2: {'encoding': 'utf-8', 'sep': ';'}
Config 2 failed: 'utf-8' codec can't decode byte 0x84 in position 1...
Trying config 3: {'encoding': 'latin-1', 'sep': ','}
✅ CSV loaded successfully with config 3: (2823, 25)
🔄 Cleaning data...
📊 Final dataset shape: (2823, 25)
📋 Columns: ['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER', 'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE', 'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME', 'DEALSIZE']


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processing file: /tmp/gradio/e1dfc3e1f9f864339749d73d86f43c45bde46dbe35f2c47385415bc9bcab4f0d/sales_data_sample copy.csv, Extension: csv
🔄 Attempting to load CSV file...
Trying config 1: {'encoding': 'utf-8', 'sep': ','}
Config 1 failed: 'utf-8' codec can't decode byte 0x84 in position 8...
Trying config 2: {'encoding': 'utf-8', 'sep': ';'}
Config 2 failed: 'utf-8' codec can't decode byte 0x84 in position 1...
Trying config 3: {'encoding': 'latin-1', 'sep': ','}
✅ CSV loaded successfully with config 3: (2823, 25)
🔄 Cleaning data...
📊 Final dataset shape: (2823, 25)
📋 Columns: ['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER', 'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE', 'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME', 'DEALSIZE']


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processing file: /tmp/gradio/e1dfc3e1f9f864339749d73d86f43c45bde46dbe35f2c47385415bc9bcab4f0d/sales_data_sample copy.csv, Extension: csv
🔄 Attempting to load CSV file...
Trying config 1: {'encoding': 'utf-8', 'sep': ','}
Config 1 failed: 'utf-8' codec can't decode byte 0x84 in position 8...
Trying config 2: {'encoding': 'utf-8', 'sep': ';'}
Config 2 failed: 'utf-8' codec can't decode byte 0x84 in position 1...
Trying config 3: {'encoding': 'latin-1', 'sep': ','}
✅ CSV loaded successfully with config 3: (2823, 25)
🔄 Cleaning data...
📊 Final dataset shape: (2823, 25)
📋 Columns: ['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER', 'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE', 'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME', 'DEALSIZE']
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:786