In [1]:
from langchain.tools import tool
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import polars as pl
from datetime import datetime

# Import the full email DataFrame
from lib.load_data import df

# Import helper functions
from lib.utils import normalize_list, match_value_in_columns

Command-line environment detected. Using local data file.
Loading email metadata from: C:\Users\venka\OneDrive\Desktop\auto_researcher\7.Real_estate_assistant\real_estate_assistant\lib\data\full_mails.jsonl
Successfully loaded 11688 records for metadata.
Connecting to ChromaDB Vector Store...
Successfully connected to ChromaDB collection.


In [2]:
def classify_sentiment(score):
    """Classifies a VADER compound score into a category."""
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"

In [3]:
@tool
def sentiment_analysis_tool(
    sender: str = None,
    recipient: str = None,
    start_date: str = None,
    end_date: str = None,
    timeline_granularity: str = None
) -> str:
    """
    Analyzes the sentiment of emails based on filters like sender, recipient, or date range.
    It can provide an overall summary or a timeline of sentiment.
    Use this for high-level questions like "what is the overall sentiment in customer emails?" or "show me the sentiment timeline for emails from Raja."
    """
    print(f"sentiment_analysis_tool called with: sender={sender}, start_date={start_date}, end_date={end_date}, timeline={timeline_granularity}")
    
    if df.is_empty():
        return "Error: Email data is not loaded."

    temp_df = df.clone()

    # --- 1. Apply Filters Sequentially ---
    if sender:
        sender_lower = sender.lower()
        temp_df = temp_df.with_columns(
            pl.col("from").map_elements(normalize_list, return_dtype=str).alias("from_normalized")
        ).filter(
            pl.col("from_normalized").map_elements(lambda x: match_value_in_columns(sender_lower, x), return_dtype=bool)
        )

    if start_date or end_date or timeline_granularity:
        temp_df = temp_df.with_columns(
            pl.col("date").str.to_datetime("%Y-%m-%dT%H:%M:%SZ", strict=False).alias("date_dt")
        )
        if start_date:
            temp_df = temp_df.filter(pl.col("date_dt") >= datetime.strptime(start_date, "%Y-%m-%d"))
        if end_date:
            temp_df = temp_df.filter(pl.col("date_dt") <= datetime.strptime(end_date, "%Y-%m-%d"))

    filtered_df = temp_df
    print(f"Filtered DataFrame has {filtered_df.height} rows after applying filters.")

    if filtered_df.is_empty():
        return "No emails found for the specified criteria."

    # --- 2. Map & Analyze ---
    # THE DEFINITIVE FIX: Access the struct field by its position (index 0) to be robust.
    safe_text_extraction_expr = (
        pl.when(pl.col("body").is_not_null())
        .then(pl.col("body").struct.field(0)) # Access by index for robustness
        .otherwise(pl.lit(""))
    )
    print(f"Using safe text extraction expression: {safe_text_extraction_expr}")

    select_exprs = [
        safe_text_extraction_expr.map_elements(
            lambda text: analyzer.polarity_scores(str(text or ""))['compound'],
            return_dtype=pl.Float64
        ).alias("sentiment_score")
    ]
    print(f"Select expressions for sentiment analysis: {select_exprs}")
    if 'date_dt' in filtered_df.columns:
        select_exprs.append(pl.col("date_dt").alias("date"))

    sentiments = filtered_df.select(select_exprs).drop_nulls(subset=["sentiment_score"])
    print(f"Sentiment analysis results: {sentiments}")

    if sentiments.is_empty():
        return "Found emails, but could not extract text bodies to analyze sentiment."

    # --- 3. Reduce & Synthesize ---
    granularity = None
    if timeline_granularity:
        if "month" in timeline_granularity.lower():
            granularity = "month"
        elif "year" in timeline_granularity.lower():
            granularity = "year"

    if granularity in ["month", "year"]:
        if "date" not in sentiments.columns:
            return "Cannot create a timeline without date information."
            
        period = "1mo" if granularity == "month" else "1y"
        
        timeline_summary = sentiments.drop_nulls(subset=["date"]).sort("date").group_by_dynamic("date", every=period).agg(
            pl.mean("sentiment_score").alias("average_sentiment"),
            pl.count().alias("email_count")
        )
        
        if timeline_summary.is_empty():
            return "Found emails with valid dates, but could not generate a timeline summary."

        summary_lines = [f"Sentiment Timeline Analysis (granularity: {granularity}):"]
        for row in timeline_summary.to_dicts():
            period_str = row['date'].strftime('%Y-%m' if granularity == 'month' else '%Y')
            avg_sentiment = row['average_sentiment']
            sentiment_class = classify_sentiment(avg_sentiment)
            summary_lines.append(
                f"- Period: {period_str}, Email Count: {row['email_count']}, "
                f"Average Sentiment: {avg_sentiment:.2f} ({sentiment_class})"
            )
        return "\n".join(summary_lines)

    else:
        overall_summary = sentiments.select(
            pl.mean("sentiment_score").alias("average_sentiment"),
            pl.col("sentiment_score").map_elements(lambda s: classify_sentiment(s), return_dtype=str).value_counts().alias("sentiment_counts"),
            pl.count().alias("total_emails")
        ).to_dicts()[0]

        avg_score = overall_summary['average_sentiment']
        total_emails = overall_summary['total_emails']
        counts = {d['sentiment_score']: d['count'] for d in overall_summary['sentiment_counts']}
        
        summary = (
            f"Overall Sentiment Analysis Summary:\n"
            f"- Total Emails Analyzed: {total_emails}\n"
            f"- Average Sentiment Score: {avg_score:.2f} ({classify_sentiment(avg_score)})\n"
            f"- Positive Emails: {counts.get('Positive', 0)}\n"
            f"- Negative Emails: {counts.get('Negative', 0)}\n"
            f"- Neutral Emails: {counts.get('Neutral', 0)}"
        )
        return summary

In [6]:
@tool
def sentiment_analysis_tool(
    sender: str = None,
    recipient: str = None,
    start_date: str = None,
    end_date: str = None,
    timeline_granularity: str = None
) -> str:
    """
    Analyzes the sentiment of emails based on filters like sender, recipient, or date range.
    It can provide an overall summary or a timeline of sentiment.
    Use this for high-level questions like "what is the overall sentiment in customer emails?" or "show me the sentiment timeline for emails from Raja."
    """
    print(f"sentiment_analysis_tool called with: sender={sender}, start_date={start_date}, end_date={end_date}, timeline={timeline_granularity}")
    
    if df.is_empty():
        return "Error: Email data is not loaded."
    analyzer = SentimentIntensityAnalyzer()

    temp_df = df.clone()

    # --- 1. Apply Filters Sequentially ---
    if sender:
        sender_lower = sender.lower()
        temp_df = temp_df.with_columns(
            pl.col("from").map_elements(normalize_list, return_dtype=str).alias("from_normalized")
        ).filter(
            pl.col("from_normalized").map_elements(lambda x: match_value_in_columns(sender_lower, x), return_dtype=bool)
        )

    if start_date or end_date or timeline_granularity:
        temp_df = temp_df.with_columns(
            pl.col("date").str.to_datetime("%Y-%m-%dT%H:%M:%SZ", strict=False).alias("date_dt")
        )
        if start_date:
            temp_df = temp_df.filter(pl.col("date_dt") >= datetime.strptime(start_date, "%Y-%m-%d"))
        if end_date:
            temp_df = temp_df.filter(pl.col("date_dt") <= datetime.strptime(end_date, "%Y-%m-%d"))

    filtered_df = temp_df
    print(f"Filtered DataFrame has {filtered_df.height} rows after applying filters.")

    if filtered_df.is_empty():
        return "No emails found for the specified criteria."

    # --- 2. Map & Analyze ---
    # THE DEFINITIVE FIX: Use the field's string name 'text' as required by the function.
    safe_text_extraction_expr = (
        pl.when(pl.col("body").is_not_null())
        .then(pl.col("body").struct.field("text")) # Access by the name 'text'
        .otherwise(pl.lit(""))
    )

    select_exprs = [
        safe_text_extraction_expr.map_elements(
            lambda text: analyzer.polarity_scores(str(text or ""))['compound'],
            return_dtype=pl.Float64
        ).alias("sentiment_score")
    ]
    if 'date_dt' in filtered_df.columns:
        select_exprs.append(pl.col("date_dt").alias("date"))

    sentiments = filtered_df.select(select_exprs).drop_nulls(subset=["sentiment_score"])

    if sentiments.is_empty():
        return "Found emails, but could not extract text bodies to analyze sentiment."

    # --- 3. Reduce & Synthesize ---
    granularity = None
    if timeline_granularity:
        if "month" in timeline_granularity.lower():
            granularity = "month"
        elif "year" in timeline_granularity.lower():
            granularity = "year"

    if granularity in ["month", "year"]:
        if "date" not in sentiments.columns:
            return "Cannot create a timeline without date information."
            
        period = "1mo" if granularity == "month" else "1y"
        
        timeline_summary = sentiments.drop_nulls(subset=["date"]).sort("date").group_by_dynamic("date", every=period).agg(
            pl.mean("sentiment_score").alias("average_sentiment"),
            pl.count().alias("email_count")
        )
        
        if timeline_summary.is_empty():
            return "Found emails with valid dates, but could not generate a timeline summary."

        summary_lines = [f"Sentiment Timeline Analysis (granularity: {granularity}):"]
        for row in timeline_summary.to_dicts():
            period_str = row['date'].strftime('%Y-%m' if granularity == 'month' else '%Y')
            avg_sentiment = row['average_sentiment']
            sentiment_class = classify_sentiment(avg_sentiment)
            summary_lines.append(
                f"- Period: {period_str}, Email Count: {row['email_count']}, "
                f"Average Sentiment: {avg_sentiment:.2f} ({sentiment_class})"
            )
        return "\n".join(summary_lines)

    else:
        overall_summary = sentiments.select(
            pl.mean("sentiment_score").alias("average_sentiment"),
            pl.col("sentiment_score").map_elements(lambda s: classify_sentiment(s), return_dtype=str).value_counts().alias("sentiment_counts"),
            pl.count().alias("total_emails")
        ).to_dicts()[0]

        avg_score = overall_summary['average_sentiment']
        total_emails = overall_summary['total_emails']
        counts = {d['sentiment_score']: d['count'] for d in overall_summary['sentiment_counts']}
        
        summary = (
            f"Overall Sentiment Analysis Summary:\n"
            f"- Total Emails Analyzed: {total_emails}\n"
            f"- Average Sentiment Score: {avg_score:.2f} ({classify_sentiment(avg_score)})\n"
            f"- Positive Emails: {counts.get('Positive', 0)}\n"
            f"- Negative Emails: {counts.get('Negative', 0)}\n"
            f"- Neutral Emails: {counts.get('Neutral', 0)}"
        )
        return summary

In [7]:
tool_input = {
    "sender": "Raja",
    "start_date": "2025-04-08",
    "end_date": "2025-09-08",
    "timeline_granularity": "month"  # Corrected argument name
}

# Use the .invoke() method to run the tool
print(sentiment_analysis_tool.invoke(tool_input))

sentiment_analysis_tool called with: sender=Raja, start_date=2025-04-08, end_date=2025-09-08, timeline=month
Filtered DataFrame has 2 rows after applying filters.


NameError: name 'analyzer' is not defined