<a href="https://colab.research.google.com/github/Heresjohnnyi/Edius-Pro/blob/main/chronosage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install streamlit pandas plotly scikit-learn numpy pyngrok fuzzywuzzy

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.11-py3-none-any.whl (25 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [5]:
%%writefile chrono_sage.py
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import re
import io
import time
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings('ignore')

# Streamlit configuration
# Custom CSS injected into the page; kept in a constant so the calls below stay short.
_PAGE_CSS = """
    <style>
        .main { background-color: #f5f7fa; }
        .block-container { padding: 2rem; }
        .stTextInput { background-color: #ffffff; border-radius: 5px; padding: 0.5em; }
        .stButton { background-color: #4CAF50; color: white; }
        .stSelectbox { background-color: #ffffff; }
    </style>
"""

# set_page_config is still the first Streamlit command executed by the script.
st.set_page_config(page_title="Chrono Sage", layout="wide")
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

st.title("🧠 Chrono Sage - Time Series and Causal Inference engine")
st.markdown("**Explore your data with a chat-like interface, dynamic controls, and interactive plots! Upload a CSV and ask away.**")

# Initialize session state: create each key once, leaving existing values untouched on reruns
for _state_key, _initial_value in (("chat_history", []), ("df_clean", None)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# Cache data loading and cleaning
@st.cache_data
def load_and_clean_data(file):
    """Read a CSV and fill in its missing values.

    Columns whose dtype is float64 or int64 are median-imputed with
    ``SimpleImputer``; in every other column missing entries are replaced by
    the string 'Unknown'.

    Args:
        file: Anything ``pd.read_csv`` accepts (the app passes the uploaded file).

    Returns:
        ``(df_clean, elapsed_seconds)`` on success, or ``(None, 0)`` when the
        CSV cannot be read (the error is reported with ``st.error``). Because
        the function is wrapped in ``st.cache_data``, a cache hit returns the
        elapsed time recorded when the data was first loaded.
    """
    start_time = time.time()
    try:
        df = pd.read_csv(file)
    except Exception as e:
        st.error(f"Error reading CSV: {e}")
        return None, 0
    df_clean = df.copy()
    for col in df_clean.columns:
        if df_clean[col].dtype in ['float64', 'int64']:
            # A column with no observed value has no median. SimpleImputer would
            # discard such a feature and return zero values, making the column
            # assignment below fail with a length mismatch, so leave it as is.
            if not df_clean[col].notna().any():
                continue
            imputer = SimpleImputer(strategy='median')
            df_clean[col] = imputer.fit_transform(df_clean[[col]].values).ravel()
        else:
            df_clean[col] = df_clean[col].fillna('Unknown')
    elapsed_time = time.time() - start_time
    return df_clean, elapsed_time

# Helper functions
def fuzzy_match_column(column, columns, threshold=80):
    """Resolve a user-typed name to one of the available column names.

    Resolution order:
      1. A case-insensitive exact match always wins, so "discounted_price" is
         never hijacked by an earlier column such as "price" that merely
         scores 100 on a partial match.
      2. Otherwise the column with the highest ``fuzz.partial_ratio`` score at
         or above ``threshold`` is chosen; ties go to the name whose length is
         closest to the query, then to the earliest column.

    Args:
        column: Name typed by the user. ``None`` or an empty string yields ``None``.
        columns: Iterable of candidate column names (non-strings are compared
            via ``str()``; the original object is what gets returned).
        threshold: Minimum partial-ratio score (0-100) for a fuzzy match.

    Returns:
        The matching entry of ``columns``, or ``None`` when nothing qualifies.
    """
    if not column:
        return None
    needle = str(column).lower()
    candidates = [(col, str(col).lower()) for col in columns]

    # Pass 1: exact (case-insensitive) match.
    for col, name in candidates:
        if name == needle:
            return col

    # Pass 2: best fuzzy match instead of merely the first one over the threshold.
    best_col, best_key = None, None
    for col, name in candidates:
        score = fuzz.partial_ratio(needle, name)
        if score < threshold:
            continue
        key = (score, -abs(len(name) - len(needle)))
        if best_key is None or key > best_key:
            best_col, best_key = col, key
    return best_col

def extract_columns_from_prompt(prompt, columns):
    """Return the columns mentioned in ``prompt``, in order of first mention.

    The prompt is lower-cased and split on whitespace; each word is resolved
    with ``fuzzy_match_column``. Words that resolve to nothing are skipped and
    each column appears at most once in the result.
    """
    resolved = (fuzzy_match_column(token, columns) for token in prompt.lower().split())
    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(name for name in resolved if name))

def ineligible_error(function):
    """Build the standard message for a dataset that cannot be used for ``function``."""
    message = f"The dataset you provided is ineligible or not fit for performing {function}"
    return message

# Analysis functions
def handle_summary(df):
    """Render summary statistics for every column of ``df``.

    Args:
        df: DataFrame to describe.

    Returns:
        ``(True, elapsed_seconds)`` when the table was rendered, or
        ``(error_message, elapsed_seconds)`` when ``df`` is empty.
    """
    start_time = time.time()
    if df.empty:
        # Show the reason on the page: returning the message alone left the
        # user with no explanation of why nothing was rendered.
        message = ineligible_error("summary")
        st.warning(message)
        return message, time.time() - start_time
    st.subheader("📊 Summary Statistics")
    st.dataframe(df.describe(include='all'))
    elapsed_time = time.time() - start_time
    return True, elapsed_time

def handle_missing_values(df, column=None):
    """Render the count of missing values for one column or for all columns.

    Args:
        df: DataFrame to inspect.
        column: Column to report on; ``None`` reports every column.

    Returns:
        ``(True, elapsed_seconds)`` when the report was rendered, or
        ``(error_message, elapsed_seconds)`` when ``df`` is empty or ``column``
        does not exist.
    """
    start_time = time.time()
    # The report used to be rendered only when the frame was empty (inverted
    # condition); an empty frame is now the failure case, as in handle_summary.
    if df.empty:
        message = ineligible_error("missing value report")
        st.warning(message)
        return message, time.time() - start_time
    if column and column not in df.columns:
        st.warning(f"Column '{column}' not found. Available: {list(df.columns)}")
        return ineligible_error("missing value report"), time.time() - start_time

    st.subheader("🩶 Missing Value Report")
    if column:
        st.write(f"Missing values in {column}: {df[column].isnull().sum()}")
    else:
        st.write(df.isnull().sum())
    elapsed_time = time.time() - start_time
    return True, elapsed_time

def handle_simulation(df, col=None, action="increase", value=10.0, change_type="percent", years=1, filters=None):
    """Run a what-if simulation on one numeric column and render the result.

    The change is applied once per year, so percentage changes compound. The
    input frame is never modified; the simulation works on a filtered copy.

    Args:
        df: Source DataFrame.
        col: Column to modify; must exist and be float64 or int64.
        action: "increase" or "decrease". The past-tense forms "increased" and
            "decreased" are accepted too, in any letter case.
        value: Size of the change (percentage points or absolute units).
        change_type: "percent" for a relative change; anything else is treated
            as a fixed amount.
        years: Number of times the change is applied.
        filters: Optional ``{column: value}`` equality filters applied first.
            Values for numeric columns are converted with ``float``.

    Returns:
        ``(True, elapsed_seconds)`` on success, otherwise
        ``(error_message, elapsed_seconds)``.
    """
    start_time = time.time()
    numeric_dtypes = ['float64', 'int64']

    if not col or col not in df.columns:
        st.error(f"Column '{col}' not found. Available: {list(df.columns)}")
        return ineligible_error("simulation"), time.time() - start_time
    if df[col].dtype not in numeric_dtypes:
        st.warning(f"Column '{col}' must be numeric.")
        return ineligible_error("simulation"), time.time() - start_time

    # Normalise the action. Previously only the exact string "increase" increased
    # values, so "increased" (what the chat parser extracts) silently decreased
    # them and the status line read "increasedd".
    direction = str(action).strip().lower()
    if direction.startswith("increase"):
        action = "increase"
    elif direction.startswith("decrease"):
        action = "decrease"
    else:
        st.warning(f"Unknown action '{action}'. Use 'increase' or 'decrease'.")
        return ineligible_error("simulation"), time.time() - start_time

    df_sim = df
    if filters:
        for filter_col, filter_val in filters.items():
            if filter_col not in df.columns:
                st.warning(f"Filter column '{filter_col}' not found.")
                continue
            try:
                if df[filter_col].dtype in numeric_dtypes:
                    filter_val = float(filter_val)
            except (TypeError, ValueError):
                st.warning(f"Invalid filter value '{filter_val}' for '{filter_col}'.")
                continue
            df_sim = df_sim[df_sim[filter_col] == filter_val]
        if df_sim.empty:
            st.warning("No data matches filter conditions.")
            return ineligible_error("simulation"), time.time() - start_time

    # Take an independent copy so the column assignment below neither touches the
    # caller's frame nor writes into a filtered view.
    df_sim = df_sim.copy()

    increasing = action == "increase"
    for _ in range(years):
        if change_type == "percent":
            factor = (1 + value / 100) if increasing else (1 - value / 100)
            df_sim[col] = df_sim[col] * factor
        else:
            df_sim[col] = df_sim[col] + value if increasing else df_sim[col] - value

    st.write(f"Simulated '{col}' {action}d by {value} {change_type} {'annually for ' + str(years) + ' years' if years > 1 else ''}")
    st.dataframe(df_sim[[col]].head(10))
    fig = px.histogram(df_sim, x=col, title=f"{col} after {action} by {value} {change_type}", hover_data=df_sim.columns)
    st.plotly_chart(fig, use_container_width=True)

    elapsed_time = time.time() - start_time
    return True, elapsed_time

def handle_causal_analysis(df, cause=None, effect=None, confounder=None):
    """Estimate and render the relationship between ``cause`` and ``effect``.

    A correlation heatmap of the float64/int64 columns is always drawn first
    (when there are at least two such columns). Then:

    * With a ``confounder`` that exists in ``df``: rows are split into
      treated/control at the median of ``cause``, each treated row is matched
      to its nearest control row on the standardised confounder, and the
      difference in mean ``effect`` is reported as the ATE. The page labels
      this "Propensity Score Matching".
    * Otherwise: a simple linear regression ``effect ~ cause`` is fitted.

    ``df`` is not modified.

    Args:
        df: Source DataFrame.
        cause: Numeric (float64/int64) column treated as the cause.
        effect: Numeric (float64/int64) column treated as the outcome.
        confounder: Optional column to match on; non-numeric columns are
            one-hot encoded with the first level dropped.

    Returns:
        ``(True, elapsed_seconds)`` on success, otherwise
        ``(error_message, elapsed_seconds)``.
    """
    start_time = time.time()
    numeric_dtypes = ['float64', 'int64']
    st.warning("Correlation does not imply causation. Results assume no unobserved confounders.")
    numeric_cols = df.select_dtypes(include=numeric_dtypes).columns
    if len(numeric_cols) > 1:
        fig = px.imshow(df[numeric_cols].corr(), text_auto=True, title="Correlation Matrix", color_continuous_scale='Viridis')
        st.plotly_chart(fig, use_container_width=True)

    if not cause or not effect or cause not in df.columns or effect not in df.columns:
        st.warning(f"Invalid columns: '{cause}' or '{effect}'. Available: {list(df.columns)}")
        return ineligible_error("causal analysis"), time.time() - start_time
    if df[cause].dtype not in numeric_dtypes or df[effect].dtype not in numeric_dtypes:
        st.warning(f"'{cause}' and '{effect}' must be numeric.")
        return ineligible_error("causal analysis"), time.time() - start_time

    if confounder and confounder in df.columns:
        st.write(f"Propensity Score Matching: Effect of {cause} on {effect} controlling for {confounder}")

        # Keep the treatment flag in a local positional mask. Writing a
        # 'treatment' column into df altered the caller's frame and would
        # overwrite a real column of that name.
        is_treated = (df[cause] > df[cause].median()).to_numpy()
        treated = df[is_treated]
        control = df[~is_treated]

        if len(treated) < 2 or len(control) < 2:
            st.warning(f"Insufficient data: treated={len(treated)}, control={len(control)}")
            return ineligible_error("propensity score matching"), time.time() - start_time

        if df[confounder].dtype in numeric_dtypes:
            X_conf = df[[confounder]].values
        else:
            X_conf = pd.get_dummies(df[confounder], drop_first=True).values
        if X_conf.shape[1] == 0:
            # A categorical confounder with a single level encodes to zero
            # features, which the scaler cannot fit.
            st.warning(f"Confounder '{confounder}' has only one distinct value and cannot be used for matching.")
            return ineligible_error("propensity score matching"), time.time() - start_time
        X_conf_scaled = StandardScaler().fit_transform(X_conf)

        with st.spinner("Running PSM..."):
            nn = NearestNeighbors(n_neighbors=1, n_jobs=-1)
            # Index the scaled array by position (boolean mask), not by index
            # labels, so frames without a 0..n-1 index are handled correctly.
            nn.fit(X_conf_scaled[~is_treated])
            distances, indices = nn.kneighbors(X_conf_scaled[is_treated])
            matched_control = control.iloc[indices.flatten()]

        ate = treated[effect].mean() - matched_control[effect].mean()
        st.write(f"ATE of {cause} on {effect}: {ate:.4f}")
        fig = px.box(pd.concat([treated[[effect]].assign(Group='Treated'), matched_control[[effect]].assign(Group='Control')]),
                     x='Group', y=effect, title=f"{effect}: Treated vs Matched Control", hover_data=[effect])
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.write(f"Linear Regression: {effect} ~ {cause}")
        X = df[[cause]].values
        y = df[effect].values
        model = LinearRegression()
        model.fit(X, y)
        st.write(f"Coefficient: {model.coef_[0]:.4f}, Intercept: {model.intercept_:.4f}")
        # NOTE: trendline="ols" needs the statsmodels package at runtime.
        fig = px.scatter(df, x=cause, y=effect, trendline="ols", title=f"Regression: {effect} vs {cause}", hover_data=df.columns)
        st.plotly_chart(fig, use_container_width=True)

    elapsed_time = time.time() - start_time
    return True, elapsed_time

def handle_plot(df, x_col=None, y_col=None):
    """Render ``y_col`` against ``x_col`` as an interactive chart.

    A scatter plot is used when both columns are float64/int64, a box plot
    otherwise.

    Returns:
        ``(True, elapsed_seconds)`` on success, or
        ``(error_message, elapsed_seconds)`` when either column is missing.
    """
    started_at = time.time()
    columns_ok = x_col and y_col and x_col in df.columns and y_col in df.columns
    if not columns_ok:
        st.warning(f"Columns '{x_col}' or '{y_col}' not found. Available: {list(df.columns)}")
        return ineligible_error("plotting"), time.time() - started_at

    st.subheader(f"📈 {y_col} vs {x_col}")
    numeric_dtypes = ['float64', 'int64']
    both_numeric = df[x_col].dtype in numeric_dtypes and df[y_col].dtype in numeric_dtypes
    # Same arguments either way; only the chart type depends on the dtypes.
    build_figure = px.scatter if both_numeric else px.box
    figure = build_figure(df, x=x_col, y=y_col, title=f"{y_col} vs {x_col}", hover_data=df.columns)
    st.plotly_chart(figure, use_container_width=True)
    return True, time.time() - started_at

# Sidebar for analysis selection
# The labels below are compared verbatim by the dispatch code further down.
_ANALYSIS_TYPES = ["Summary", "Missing Values", "What-If Simulation", "Causal Analysis", "Plot"]

st.sidebar.header("🛠️ Analysis Options")
analysis_type = st.sidebar.selectbox(
    "Choose Analysis Type",
    _ANALYSIS_TYPES,
    help="Select the type of analysis to perform on your data.",
)

# File uploader
uploaded_file = st.file_uploader(
    "📁 Upload CSV",
    type="csv",
    help="Upload a CSV file to analyze. Try 'chrono_sage_friendly.csv'!",
)

# Main page. With an uploaded CSV: load and clean it, then offer (1) the
# analysis picked in the sidebar with dedicated input widgets and (2) a
# free-text query box routed to the same handlers by keyword.
#
# Every handler returns (status, elapsed_seconds) where status is True on
# success and an error *string* on failure. That string is truthy, so success
# must be tested with `status is True`, never with plain truthiness.
if uploaded_file:
    with st.spinner("Loading data..."):
        progress_bar = st.progress(0)
        st.session_state.df_clean, load_time = load_and_clean_data(uploaded_file)
        progress_bar.progress(100)
    if st.session_state.df_clean is None:
        st.stop()

    df_clean = st.session_state.df_clean
    st.write(f"Data loading and cleaning took {load_time:.2f} seconds")
    with st.expander("🧾 Preview Data", expanded=False):
        st.dataframe(df_clean.head(10))

    buffer = io.StringIO()
    df_clean.to_csv(buffer, index=False)
    st.download_button("Download Cleaned Data", buffer.getvalue(), "cleaned_data.csv", help="Download the cleaned dataset.")

    columns = df_clean.columns.tolist()
    numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()

    # Dynamic inputs based on analysis type
    if analysis_type == "Summary":
        if st.button("Run Summary", help="Generate summary statistics for all columns."):
            with st.spinner("Generating summary..."):
                success, elapsed_time = handle_summary(df_clean)
                st.write(f"Analysis took {elapsed_time:.2f} seconds")
                # Record the result only when the analysis actually ran.
                if success is True:
                    st.session_state.chat_history.append(("Summary", "Summary statistics generated."))

    elif analysis_type == "Missing Values":
        col = st.selectbox("Select Column (Optional)", ["All Columns"] + columns, help="Choose a specific column or all for missing value report.")
        if st.button("Check Missing Values", help="Show missing value counts."):
            with st.spinner("Checking missing values..."):
                success, elapsed_time = handle_missing_values(df_clean, col if col != "All Columns" else None)
                st.write(f"Analysis took {elapsed_time:.2f} seconds")
                if success is True:
                    st.session_state.chat_history.append(("Missing Values", f"Checked missing values for {col}."))

    elif analysis_type == "What-If Simulation":
        st.subheader("What-If Simulation")
        col = st.selectbox("Select Column to Modify", numeric_cols, help="Choose a numeric column to simulate changes (e.g., discounted_price).")
        action = st.radio("Action", ["Increase", "Decrease"], help="Increase or decrease the column values.")
        change_type = st.radio("Change Type", ["Percent", "Fixed"], help="Apply change as percentage or fixed amount.")
        value = st.slider("Change Value", 0.0, 100.0, 10.0, help="Amount to change (e.g., 10 for 10% or 10 units).")
        years = st.slider("Years (for Annual Change)", 1, 5, 1, help="Number of years for annual compounding.")
        filter_cols = st.multiselect("Filter Columns", columns, help="Select columns to filter data (e.g., category, region).")
        filters = {}
        for filter_col in filter_cols:
            unique_vals = df_clean[filter_col].unique().tolist()
            filter_val = st.selectbox(f"Value for {filter_col}", unique_vals, key=filter_col)
            filters[filter_col] = filter_val

        if st.button("Run Simulation", help="Run the what-if simulation with selected parameters."):
            with st.spinner("Running simulation..."):
                progress_bar = st.progress(0)
                success, elapsed_time = handle_simulation(df_clean, col, action.lower(), value, change_type.lower(), years, filters)
                progress_bar.progress(100)
                st.write(f"Analysis took {elapsed_time:.2f} seconds")
                if success is True:
                    st.session_state.chat_history.append(("What-If", f"Simulated {action.lower()} {col} by {value} {change_type.lower()}."))

    elif analysis_type == "Causal Analysis":
        st.subheader("Causal Analysis")
        cause = st.selectbox("Cause Column", numeric_cols, help="Select the cause variable (e.g., discounted_price).")
        effect = st.selectbox("Effect Column", numeric_cols, help="Select the effect variable (e.g., rating).")
        confounder = st.selectbox("Confounder Column (Optional)", ["None"] + columns, help="Choose a confounder (e.g., category).")
        if st.button("Run Causal Analysis", help="Estimate causal effect with PSM or regression."):
            with st.spinner("Running causal analysis..."):
                progress_bar = st.progress(0)
                success, elapsed_time = handle_causal_analysis(df_clean, cause, effect, None if confounder == "None" else confounder)
                progress_bar.progress(100)
                st.write(f"Analysis took {elapsed_time:.2f} seconds")
                if success is True:
                    st.session_state.chat_history.append(("Causal", f"Analyzed effect of {cause} on {effect}."))

    elif analysis_type == "Plot":
        st.subheader("Plot Data")
        x_col = st.selectbox("X-Axis Column", columns, help="Choose column for X-axis (e.g., discounted_price).")
        y_col = st.selectbox("Y-Axis Column", columns, help="Choose column for Y-axis (e.g., rating).")
        if st.button("Generate Plot", help="Create an interactive scatter or box plot."):
            with st.spinner("Generating plot..."):
                success, elapsed_time = handle_plot(df_clean, x_col, y_col)
                st.write(f"Analysis took {elapsed_time:.2f} seconds")
                if success is True:
                    st.session_state.chat_history.append(("Plot", f"Plotted {y_col} vs {x_col}."))

    # Chat-like interface
    st.subheader("💬 Ask a Question")
    prompt_suggestions = [
        "Summarize data",
        "Show missing values",
        "What if discounted_price increased by 10 percent where category = Electronics",
        "Causal effect of price on sales with region as confounder",
        "Plot discounted_price vs rating"
    ]
    prompt = st.selectbox("Try a Prompt", ["Type your own..."] + prompt_suggestions, help="Select or type a query.")
    if prompt == "Type your own...":
        prompt = st.text_input("Enter your question", placeholder="e.g., What if sales increased by 10% where region = West", help="Ask anything about your data!")

    if st.button("Submit Query", help="Run the custom query."):
        if prompt and prompt != "Type your own...":
            st.session_state.chat_history.append(("User", prompt))
            prompt_lower = prompt.lower()
            with st.spinner("Processing query..."):
                progress_bar = st.progress(0)
                success = False
                elapsed_time = 0

                # 'summar' covers "summary", "summarize" and "summarise"; the
                # suggested prompt "Summarize data" matched neither old keyword.
                if any(keyword in prompt_lower for keyword in ['summar', 'describe']):
                    success, elapsed_time = handle_summary(df_clean)
                elif any(keyword in prompt_lower for keyword in ['missing', 'null']):
                    success, elapsed_time = handle_missing_values(df_clean)
                elif 'what if' in prompt_lower:
                    # Parse prompt manually for demo; use sidebar for structured input.
                    # '%' is accepted next to 'percent' because the placeholder and
                    # the error message below both advertise the "10%" form.
                    pattern = r'(?P<column>\w+)\s*(?P<action>increased|decreased)\s*by\s*(?P<value>\d+\.?\d*)\s*(?P<type>percent|%|fixed)(?:\s*annually\s*for\s*(?P<years>\d+)\s*years)?(?:\s*(?P<filter>where\s+[\w\s=]+))?'
                    match = re.search(pattern, prompt_lower)
                    if match:
                        try:
                            groups = match.groupdict()
                            col = fuzzy_match_column(groups['column'], columns)
                            # The handler expects "increase"/"decrease", not the past tense.
                            action = 'increase' if groups['action'] == 'increased' else 'decrease'
                            value = float(groups['value'])
                            change_type = 'percent' if groups['type'] == '%' else groups['type']
                            years = int(groups['years']) if groups['years'] else 1
                            filters = {}
                            if groups['filter']:
                                conditions = re.findall(r'(\w+)\s*=\s*([^ ]+)', groups['filter'])
                                for filter_col, filter_val in conditions:
                                    filter_col = fuzzy_match_column(filter_col, columns)
                                    if filter_col:
                                        # The prompt was lower-cased for parsing, so a text value
                                        # such as "Electronics" would never equal the data;
                                        # restore the spelling actually present in the column.
                                        if df_clean[filter_col].dtype not in ['float64', 'int64']:
                                            for actual_val in df_clean[filter_col].unique():
                                                if str(actual_val).lower() == filter_val:
                                                    filter_val = actual_val
                                                    break
                                        filters[filter_col] = filter_val
                            success, elapsed_time = handle_simulation(df_clean, col, action, value, change_type, years, filters)
                        except ValueError:
                            st.error("Invalid prompt format. Try: 'What if sales increased by 10% where region = West'")
                    else:
                        st.error(ineligible_error("what-if simulation"))
                elif any(keyword in prompt_lower for keyword in ['causal', 'correlation', 'effect of']):
                    match = re.search(r'effect of\s+(\w+)\s+on\s+(\w+)(\s+with\s+(\w+)\s+as\s+confounder)?', prompt_lower)
                    if match:
                        cause = fuzzy_match_column(match.group(1), columns)
                        effect = fuzzy_match_column(match.group(2), columns)
                        confounder = fuzzy_match_column(match.group(4), columns) if match.group(4) else None
                        success, elapsed_time = handle_causal_analysis(df_clean, cause, effect, confounder)
                    else:
                        st.error(ineligible_error("causal analysis"))
                elif any(keyword in prompt_lower for keyword in ['plot', 'chart', 'graph']):
                    selected = extract_columns_from_prompt(prompt, columns)
                    if len(selected) >= 2:
                        success, elapsed_time = handle_plot(df_clean, selected[0], selected[1])
                    else:
                        st.error("Need two columns for plotting. Try: 'plot discounted_price vs rating'")
                else:
                    st.error("Unknown request. Try suggested prompts or use the sidebar.")

                progress_bar.progress(100)
                st.write(f"Analysis took {elapsed_time:.2f} seconds")
                # Handlers signal failure with a (truthy) message string, so test identity.
                if success is True:
                    st.session_state.chat_history.append(("Chrono Sage", "Analysis completed."))

    # Display chat history
    with st.expander("💬 Chat History", expanded=True):
        for sender, message in st.session_state.chat_history[-5:]:  # Show last 5 messages
            st.write(f"**{sender}:** {message}")

else:
    st.info("Upload a CSV to start exploring! Try 'chrono_sage_friendly.csv' for a demo.")

Overwriting chrono_sage.py


In [6]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never store the ngrok authtoken in the notebook. It is read from the
# NGROK_AUTHTOKEN environment variable, falling back to an interactive prompt
# whose input is not echoed or saved. Any token that was ever committed to
# version control should be revoked in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Start the Streamlit app in the background, then expose its port (8501).
get_ipython().system_raw("streamlit run chrono_sage.py &")
public_url = ngrok.connect(8501)
print(f"Streamlit app is running at: {public_url}")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit app is running at: NgrokTunnel: "https://8243-34-126-161-203.ngrok-free.app" -> "http://localhost:8501"
