In [6]:
import pandas as pd
import os
# import boto3
from dotenv import load_dotenv
import io
import gradio as gr
from langchain_openai import ChatOpenAI
# from langchain_community.chat_models import BedrockChat, OllamaChat
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from pprint import pprint
from contextlib import redirect_stdout



In [10]:
load_dotenv(override=True)

False

In [13]:
dotenv_path = 'C:\data-scientist-ai-agent\.env'

  dotenv_path = 'C:\data-scientist-ai-agent\.env'


In [14]:
is_loaded = load_dotenv(dotenv_path=dotenv_path, override=True)
print(f"Was the .env file loaded? -> {is_loaded}")

Was the .env file loaded? -> False


In [4]:
print(os.getenv("OPENAI_API_KEY"))
print(os.getcwd())

None
c:\data-scientist-ai-agent


In [None]:
aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
aws_session_token=os.getenv("AWS_SESSION_TOKEN")    

In [None]:
# llm = ChatBedrock(region_name="us-east-1", model_id="anthropic.claude-3-5-sonnet-20240620-v1:0", model_kwargs={"temperature": 0.0})
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)

In [None]:
df = pd.read_csv('train.csv')
df

In [None]:
from langchain.agents import tool

Get Data Tool

In [None]:
@tool
def get_dataframe_info() -> str:
    """
    Returns a full summary of the DataFrame including column names, non-null counts, and data types.
    Use this first to get a general understanding of the dataset's structure.
    """
    global df
    buffer = io.StringIO()
    df.info(buf=buffer)
    return buffer.getvalue()

@tool
def get_dataframe_head(n: int = 5) -> str:
    """
    Returns the first n rows of the DataFrame as a string.
    Useful for inspecting the actual data values. 'n' defaults to 5.
    """
    global df
    return df.head(n).to_string()

@tool
def get_descriptive_stats() -> str:
    """
    Returns descriptive statistics for the numerical columns in the DataFrame, 
    including count, mean, std, min, max, and percentiles.
    """
    global df
    return df.describe().to_string()

@tool
def get_value_counts(column: str) -> str:
    """
    Returns the count of unique values for a specified column.
    Best used for categorical columns like 'Sex', 'Pclass', or 'Embarked'.
    """
    global df
    if column not in df.columns:
        return f"Error: Column '{column}' not found in the DataFrame."
    return df[column].value_counts().to_string()


Edit Data Tool

In [None]:
@tool
def drop_columns(columns: list[str]) -> str:
    """
    Drops one or more specified columns from the DataFrame.
    The input should be a list of column name strings.
    """
    global df
    try:
        df.drop(columns=columns, axis=1, inplace=True)
        return f"Successfully dropped columns: {columns}. Use get_dataframe_info to see the new structure."
    except KeyError as e:
        return f"Error: One or more columns not found: {e}"
    except Exception as e:
        return f"An error occurred: {e}"

@tool
def fill_missing_age_with_median() -> str:
    """
    Calculates the median of the 'Age' column and fills any missing values in that column with the median.
    This should only be used for the 'Age' column.
    """
    global df
    if 'Age' not in df.columns:
        return "Error: 'Age' column not found."
    
    median_age = df['Age'].median()
    df['Age'].fillna(median_age, inplace=True)
    return f"Successfully filled missing 'Age' values with the median value of {median_age}."

@tool
def create_familysize_feature() -> str:
    """
    Creates a new column 'FamilySize' by adding the 'SibSp' and 'Parch' columns together.
    """
    global df
    if 'SibSp' in df.columns and 'Parch' in df.columns:
        df['FamilySize'] = df['SibSp'] + df['Parch']
        return "Successfully created the 'FamilySize' feature."
    else:
        return "Error: 'SibSp' or 'Parch' columns not found."

In [None]:
tools = [
    get_dataframe_info,
    get_dataframe_head,
    get_descriptive_stats,
    get_value_counts,
    drop_columns,
    fill_missing_age_with_median,
    create_familysize_feature,
]

In [None]:
system_prompt = """
You are an expert Data Scientist assistant. Your goal is to help the user with Exploratory Data Analysis (EDA) and Feature Engineering.

You have access to a set of tools to inspect and manipulate a pandas DataFrame.

**Thinking Process:**
1.  **Understand the Goal:** Analyze the user's request.
2.  **Inspect:** ALWAYS use your inspection tools (`get_dataframe_info`, `get_dataframe_head`, etc.) to understand the current state of the DataFrame before taking any action. Do not assume the state.
3.  **Plan:** Decide which tool is appropriate for the user's request.
4.  **Act:** Execute the chosen tool.
5.  **Observe & Respond:** Analyze the output from the tool and provide a clear, helpful summary to the user.
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

In [None]:
prompt

In [None]:
agent =  create_openai_tools_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [None]:
def chat(message, history):
  chat_history = []
  for human, assistant in history:
    chat_history.append(HumanMessage(content=human))
    chat_history.append(AIMessage(content=assistant))

  response = agent_executor.invoke({
    "input":message,
    "chat_history":chat_history
  })
  return response["output"]

In [None]:
gr.ChatInterface(chat, type="messages").launch()

In [None]:
def initialize_bedrock_client(
    aws_access_key_id, aws_secret_access_key, aws_session_token
):
    return boto3.client(
        "bedrock-runtime",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        region_name=REGION_NAME,
        # config=custom_config,
    )
 
REGION_NAME = "us-east-1"
MODEL_ID = "anthropic.claude-3-5-sonnet-20240620-v1:0"
 
# Setting AWS key
# AWS_ACCESS_KEY_ID = ""
# AWS_SECRET_ACCESS_KEY = ""
# AWS_SESSION_TOKEN = ""
 
# custom_config = Config(
#     read_timeout=400,
#     connect_timeout=400,
#     retries={"max_attempts": 3},
# )
bedrock_runtime = initialize_bedrock_client(
        aws_access_key_id, aws_secret_access_key, aws_session_token
    )
llm = BedrockChat(model_id=MODEL_ID, client=bedrock_runtime)
 
system_template = """You are a female who is specialized in financial analysis and credit analysis for auto loans. Your task is to analyze financial data and provide insights."""
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
human_template_company_fin = """Analyze the following data {company_profile}."""
 
human_message_prompt_company_fin = HumanMessagePromptTemplate.from_template(
        human_template_company_fin
    )
chat_prompt_comp_info = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt_company_fin]
    )
messages_comp_info = chat_prompt_comp_info.format_prompt(
        company_profile="The company is a leading auto loan provider with a diverse portfolio of customers. The company has been in operation for over 10 years and has a strong market presence. The company offers competitive interest rates and flexible repayment options to its customers. The company has a robust risk management framework in place to mitigate potential losses."
    ).to_messages()
 
response = llm.invoke(
        messages_comp_info, temperature=0.0, max_tokens=4096, top_p=0.9999
    )
response.content