In [1]:
# Important libraries
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.agents import AgentType
from langchain_openai import ChatOpenAI

import pandas as pd

In [2]:
# Load the movie metadata CSV into a DataFrame
df_movie = pd.read_csv("movies_meta.csv")
# Bare expression on the last line so the notebook renders the frame as cell output
df_movie

Unnamed: 0,original_title,release_date,collection,genres
0,Toy Story,1995-10-30,Toy Story Collection,"['Animation', 'Comedy', 'Family']"
1,Jumanji,1995-12-15,,"['Adventure', 'Fantasy', 'Family']"
2,Grumpier Old Men,1995-12-22,Grumpy Old Men Collection,"['Romance', 'Comedy']"
3,Waiting to Exhale,1995-12-22,,"['Comedy', 'Drama', 'Romance']"
4,Father of the Bride Part II,1995-02-10,Father of the Bride Collection,['Comedy']
...,...,...,...,...
45461,رگ خواب,,,"['Drama', 'Family']"
45462,Siglo ng Pagluluwal,2011-11-17,,['Drama']
45463,Betrayal,2003-08-01,,"['Action', 'Drama', 'Thriller']"
45464,Satana likuyushchiy,1917-10-21,,[]


In [3]:
# Parse release_date into datetimes. errors="coerce" converts values that cannot
# be parsed (including missing ones) to NaT instead of raising.
# Bracket assignment is used instead of attribute assignment so the target is
# unambiguously a DataFrame column.
df_movie["release_date"] = pd.to_datetime(df_movie["release_date"], errors="coerce")
df_movie

Unnamed: 0,original_title,release_date,collection,genres
0,Toy Story,1995-10-30,Toy Story Collection,"['Animation', 'Comedy', 'Family']"
1,Jumanji,1995-12-15,,"['Adventure', 'Fantasy', 'Family']"
2,Grumpier Old Men,1995-12-22,Grumpy Old Men Collection,"['Romance', 'Comedy']"
3,Waiting to Exhale,1995-12-22,,"['Comedy', 'Drama', 'Romance']"
4,Father of the Bride Part II,1995-02-10,Father of the Bride Collection,['Comedy']
...,...,...,...,...
45461,رگ خواب,NaT,,"['Drama', 'Family']"
45462,Siglo ng Pagluluwal,2011-11-17,,['Drama']
45463,Betrayal,2003-08-01,,"['Action', 'Drama', 'Thriller']"
45464,Satana likuyushchiy,1917-10-21,,[]


In [4]:
# Chat model used by the dataframe agent created in a later cell.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=1000)

In [5]:
# Build a LangChain pandas agent over the movie dataframe, driven by `llm`.
# NOTE(review): allow_dangerous_code=True opts in to the agent executing
# model-generated Python in this kernel -- only run in a trusted environment.
agent = create_pandas_dataframe_agent(
    llm,
    df_movie,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    allow_dangerous_code=True,
    verbose=True,
)
print("Data Analysis Agent is ready. You can now ask questions about the data.")

Data Analysis Agent is ready. You can now ask questions about the data.


In [6]:
def ask_agent(question):
    """Ask the dataframe agent a question and print the question and its answer.

    Args:
        question: The natural-language question to send to the module-level
            `agent`.

    Returns:
        None. The question, the agent's answer and a "---" separator line are
        printed to stdout.
    """
    # `invoke` replaces the deprecated `run` call. Only "input" is supplied:
    # the old hand-written ReAct-style text scratchpad ("Action: ...") does not
    # match the OPENAI_FUNCTIONS agent type that `agent` is configured with.
    result = agent.invoke({"input": question})
    # NOTE(review): the executor is expected to return its final answer under
    # the "output" key -- confirm against the installed LangChain version.
    response = result["output"]
    print(f"Question: {question}")
    print(f"Answer: {response}")
    print("---")

In [7]:
# Ask the agent for the dataframe's column names, formatted as a table.
ask_agent("what are the column names of in this dataset, show in tabular form")

  response = agent.run({




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\n\n# Assuming df is already defined as the dataframe\ncolumn_names = pd.DataFrame(df.columns, columns=['Column Names'])\ncolumn_names"}`


[0m[36;1m[1;3m     Column Names
0  original_title
1    release_date
2      collection
3          genres[0m[32;1m[1;3mHere are the column names of the dataset in tabular form:

|    | Column Names     |
|---:|:-----------------|
|  0 | original_title   |
|  1 | release_date     |
|  2 | collection       |
|  3 | genres           |[0m

[1m> Finished chain.[0m
Question: what are the column names of in this dataset, show in tabular form
Answer: Here are the column names of the dataset in tabular form:

|    | Column Names     |
|---:|:-----------------|
|  0 | original_title   |
|  1 | release_date     |
|  2 | collection       |
|  3 | genres           |
---


In [8]:
# Ask the agent for the range of release years covered by the dataset.
ask_agent("What is the timeperiod of movies, give in year")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df['release_date'] = pd.to_datetime(df['release_date'])\nstart_year = df['release_date'].dt.year.min()\nend_year = df['release_date'].dt.year.max()\n(start_year, end_year)"}`


[0m[36;1m[1;3m(np.float64(1874.0), np.float64(2020.0))[0m[32;1m[1;3mThe time period of the movies in the dataframe spans from the year 1874 to 2020.[0m

[1m> Finished chain.[0m
Question: What is the timeperiod of movies, give in year
Answer: The time period of the movies in the dataframe spans from the year 1874 to 2020.
---


In [9]:
# Ask the agent to count movies per release year and draw a plotly bar chart.
# NOTE(review): in the recorded run the agent's generated code assigns df['year'];
# presumably `df` is the same object as df_movie, so this adds a column in place -- confirm.
ask_agent("get the number of movies per each year, plot a bar graph using plotly, don't give me code")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\nimport plotly.express as px\n\n# Convert release_date to datetime\n\ndf['release_date'] = pd.to_datetime(df['release_date'])\n\n# Extract year from release_date\n\ndf['year'] = df['release_date'].dt.year\n\n# Count the number of movies per year\nmovies_per_year = df['year'].value_counts().reset_index()\nmovies_per_year.columns = ['year', 'number_of_movies']\n\n# Create a bar graph using plotly\nfig = px.bar(movies_per_year, x='year', y='number_of_movies', title='Number of Movies per Year')\nfig.show()"}`


[0m

[36;1m[1;3m[0m[32;1m[1;3mI have generated a bar graph showing the number of movies released each year based on the data in your dataframe. You can view the graph to analyze the trends in movie releases over the years.[0m

[1m> Finished chain.[0m
Question: get the number of movies per each year, plot a bar graph using plotly, don't give me code
Answer: I have generated a bar graph showing the number of movies released each year based on the data in your dataframe. You can view the graph to analyze the trends in movie releases over the years.
---
