### Install Modules

In [None]:
# !pip install langchain
# !pip install langchain-experimental
# !pip install langchain_google_genai

### Import Modules

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_google_genai import ChatGoogleGenerativeAI

### Utility Functions

In [41]:
def parse_response(response):
    """Extract the Python-REPL steps from an agent response.

    Args:
        response: Mapping with an 'intermediate_steps' entry. Each step is
            indexed positionally: item 0 exposes `.tool`, `.tool_input` and
            `.log`; item 1 is the value recorded as that step's output.

    Returns:
        A list of dicts with the keys 'code', 'output' and 'reasoning', one
        per step whose tool is 'python_repl_ast', in the original order.
    """
    parsed = []
    for entry in response['intermediate_steps']:
        # Only items 0 and 1 are read below; warn when a step carries more.
        if len(entry) > 2:
            print('MULTI-STEP IDENTIFIED... PLEASE UPDATE THIS FUNCTION...')

        action = entry[0]
        if action.tool != 'python_repl_ast':
            continue

        parsed.append({
            'code': action.tool_input,
            'output': entry[1],
            'reasoning': action.log,
        })
    return parsed

### Initialize Pandas Dataframe Agent

In [None]:
# Initialize the Gemini model.
# The API key is read from the environment instead of being hardcoded, so the
# secret is never stored in the saved notebook or committed to version control.
GENAI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GENAI_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this cell."
    )

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    api_key=GENAI_API_KEY
)

# Load the student dataset from its hosted CSV.
df = pd.read_csv(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ZNoKMJ9rssJn-QbJ49kOzA/student-mat.csv"
)

# Create the agent
agent = create_pandas_dataframe_agent(
    llm,
    df,
    # Pass `verbose` exactly once: repeating a keyword argument is a
    # SyntaxError. Switch to True to stream the agent's reasoning while it runs.
    verbose=False,
    return_intermediate_steps=True,  # Enables returning model-generated code for chart generation
    # Explicit opt-in: the agent executes model-generated Python
    # (the python_repl_ast tool), so only run it in a trusted environment.
    allow_dangerous_code=True,
    max_iterations=5,
)

### Run Agent

In [40]:
# Ask the agent to clean the dataset. The response's 'intermediate_steps'
# entry is what parse_response() reads below.
response = agent.invoke('Clean the dataset and print its head.')

# Keep only the python_repl_ast steps as {'code', 'output', 'reasoning'} dicts.
steps = parse_response(response)

# Render the parsed steps as this cell's output.
display(steps)

[{'code': '```python\nimport pandas as pd\ndf.isnull().sum()\n```',
  'output': school        0
  sex           0
  age           0
  address       0
  famsize       0
  Pstatus       0
  Medu          0
  Fedu          0
  Mjob          0
  Fjob          0
  reason        0
  guardian      0
  traveltime    0
  studytime     0
  failures      0
  schoolsup     0
  famsup        0
  paid          0
  activities    0
  nursery       0
  higher        0
  internet      0
  romantic      0
  famrel        0
  freetime      0
  goout         0
  Dalc          0
  Walc          0
  health        0
  absences      0
  G1            0
  G2            0
  G3            0
  dtype: int64,
  'reasoning': "Thought: Cleaning the dataset typically involves handling missing values and potentially converting categorical variables. Let's first check for missing values.\n\nAction: python_repl_ast\nAction Input: \n```python\nimport pandas as pd\ndf.isnull().sum()\n```"},
 {'code': '```python\ndf.dtypes\n

---

### Ask the model about the dataframe

In [9]:
# !pip install pandas google-generativeai

import pandas as pd
import google.generativeai as genai

# Load dataset
df = pd.read_csv(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ZNoKMJ9rssJn-QbJ49kOzA/student-mat.csv"
)

# Configure Gemini API
# The key comes from the environment instead of being hardcoded, so the secret
# is never stored in the saved notebook or committed to version control.
GENAI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GENAI_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this cell."
    )
genai.configure(api_key=GENAI_API_KEY)

# Initialize the Gemini model
model = genai.GenerativeModel("gemini-1.5-pro")

def generate_response(prompt):
    """Send `prompt` to the notebook-level `model` and return the reply's text."""
    return model.generate_content(prompt).text

# Ask the model a question about a dataframe by embedding a preview in the prompt
def query_dataframe(prompt, df, n_rows=5):
    """Answer a question about `df` by sending a row preview plus the question.

    Only the first `n_rows` rows (rendered with `to_string()`) are included in
    the prompt; the model never sees the rest of the frame.

    Args:
        prompt: The question to ask about the data.
        df: The pandas DataFrame to preview.
        n_rows: How many leading rows to include in the preview (default 5,
            the same rows `df.head()` returns).

    Returns:
        Whatever `generate_response` returns for the assembled prompt.
    """
    preview = df.head(n_rows).to_string()
    # Build the prompt without leading indentation: indenting the template
    # would shift only the first line of the multi-line table and misalign the
    # column headers against the data rows.
    full_prompt = (
        "You are working with the following dataset:\n"
        f"{preview}\n"
        "\n"
        "Based on this dataset, answer the following question:\n"
        f"{prompt}\n"
    )
    return generate_response(full_prompt)

# Example usage: ask one question about the dataframe and print the value
# returned by query_dataframe().
question = "What is the average grade of students?"
response = query_dataframe(question, df)
print(response)

The provided dataset includes columns for grades G1, G2, and G3, presumably representing grades at different points in the year.  To calculate the "average grade," we need to decide which grade(s) to use.  Here's how to calculate a few different averages:

1. **Average of G3 (likely the final grade):**  Sum all the values in the G3 column and divide by the number of students. This is likely the most meaningful overall average grade.

2. **Average of G1, G2, and G3 for each student, then averaged across students:** Calculate the average of G1, G2, and G3 for each row (student). Then, average those individual student averages. This gives an average performance across the entire year.

3. **Average of G1, G2, and G3 across all students:**  Sum the values in G1, G2, and G3 *separately*. Then divide each sum by the total number of students. This gives the average for each grading period.

Here's Python code to perform these calculations using pandas:

```python
import pandas as pd

data = {

In [None]:
# Load dataset
df = pd.read_csv(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ZNoKMJ9rssJn-QbJ49kOzA/student-mat.csv"
)

# Configure Gemini API
# The key comes from the environment instead of being hardcoded, so the secret
# is never stored in the saved notebook or committed to version control.
GENAI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GENAI_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this cell."
    )
genai.configure(api_key=GENAI_API_KEY)

# Initialize the Gemini model
model = genai.GenerativeModel("gemini-pro")

def generate_response(prompt):
    """Send `prompt` to the notebook-level `model` and return the reply's text."""
    return model.generate_content(prompt).text

# Ask the model a question about a dataframe by embedding a preview in the prompt
def query_dataframe(prompt, df, n_rows=5):
    """Answer a question about `df` by sending a row preview plus the question.

    Only the first `n_rows` rows (rendered with `to_string()`) are included in
    the prompt; the model never sees the rest of the frame.

    Args:
        prompt: The question to ask about the data.
        df: The pandas DataFrame to preview.
        n_rows: How many leading rows to include in the preview (default 5,
            the same rows `df.head()` returns).

    Returns:
        Whatever `generate_response` returns for the assembled prompt.
    """
    preview = df.head(n_rows).to_string()
    # Build the prompt without leading indentation: indenting the template
    # would shift only the first line of the multi-line table and misalign the
    # column headers against the data rows.
    full_prompt = (
        "You are working with the following dataset:\n"
        f"{preview}\n"
        "\n"
        "Based on this dataset, answer the following question:\n"
        f"{prompt}\n"
    )
    return generate_response(full_prompt)

# Example usage: ask one question about the dataframe and print the value
# returned by query_dataframe().
question = "What is the average grade of students?"
response = query_dataframe(question, df)
print(response)