#  Data Analysis With LLMs

This notebook was originally based on DataCamp's code-along "Introduction to Large Language Models with GPT & LangChain".

In [None]:
# LangChain simplifies model calls, makes it easy to swap LLMs, and more.
# Uncomment the next line to install it:
#%pip install langchain


In [None]:
# Pin the typing_extensions package to version 4.8.0 (uncomment the next line to run it)
#%pip install typing_extensions==4.8.0

In [1]:
# Import the os package
import os

# Import the openai package
import openai

# Set openai.api_key from the OPENAI_KEY environment variable (the same
# variable the later cells read). A missing variable raises KeyError here.
openai.api_key = os.environ["OPENAI_KEY"]

# The key is deliberately not echoed: cell output is saved with the notebook,
# so displaying the value would expose the secret to anyone who opens the file.

[output removed: it contained the OpenAI API key in plain text — revoke and rotate that key]

In [2]:
# Import the langchain package as lc
import langchain as lc

# From the langchain.chat_models module, import ChatOpenAI
from langchain.chat_models import ChatOpenAI

# From the langchain.schema module, import AIMessage, HumanMessage, SystemMessage
from langchain.schema import AIMessage, HumanMessage, SystemMessage

In [3]:
# Import pandas using the alias pd
import pandas as pd

# Import plotly.express using the alias px
import plotly.express as px

# From the IPython.display package, import display and Markdown
from IPython.display import display, Markdown

In [4]:
# Load electric_cars.csv into the electric_cars DataFrame.
electric_cars = pd.read_csv("electric_cars.csv")

# Summary statistics for the numeric columns
print("Description of numeric columns\n")
numeric_summary = electric_cars.describe()
display(numeric_summary)

# Summary statistics for the text (object) columns
print("Description of text columns\n")
text_summary = electric_cars.describe(include="O")
display(text_summary)

# Finish with the DataFrame itself so the notebook renders it as the cell output
print("The electric cars dataset\n")
electric_cars

Description of numeric columns



Unnamed: 0,model_year,n_cars
count,26813.0,26813.0
mean,2019.375527,5.612166
std,3.286257,26.997325
min,1997.0,1.0
25%,2017.0,1.0
50%,2020.0,2.0
75%,2022.0,4.0
max,2024.0,1514.0


Description of text columns



Unnamed: 0,city,county,make,model,electric_vehicle_type
count,26813,26813,26813,26813,26813
unique,683,183,37,127,2
top,Bothell,King,TESLA,LEAF,Battery Electric Vehicle (BEV)
freq,479,7066,5071,1889,15885


The electric cars dataset



Unnamed: 0,city,county,model_year,make,model,electric_vehicle_type,n_cars
0,Seattle,King,2023,TESLA,MODEL Y,Battery Electric Vehicle (BEV),1514
1,Seattle,King,2018,TESLA,MODEL 3,Battery Electric Vehicle (BEV),1153
2,Seattle,King,2021,TESLA,MODEL Y,Battery Electric Vehicle (BEV),1147
3,Seattle,King,2022,TESLA,MODEL Y,Battery Electric Vehicle (BEV),1122
4,Bellevue,King,2023,TESLA,MODEL Y,Battery Electric Vehicle (BEV),931
...,...,...,...,...,...,...,...
26808,Lakewood,Pierce,2022,BMW,IX,Battery Electric Vehicle (BEV),1
26809,Lakewood,Pierce,2022,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),1
26810,Lakewood,Pierce,2022,FORD,TRANSIT,Battery Electric Vehicle (BEV),1
26811,Lakewood,Pierce,2022,HYUNDAI,KONA ELECTRIC,Battery Electric Vehicle (BEV),1


### Asking GPT a question

In [5]:
from openai import OpenAI

# Client for the OpenAI API, authenticated with the OPENAI_KEY environment variable.
client = OpenAI(api_key=os.environ['OPENAI_KEY'])

# System message: sets the assistant's persona and answer style.
system_msg_test = """You are a helpful assistant who understands data science.
 You write in a clear language that a ten year old can understand.
 You keep your answers brief."""

# User message: the actual question.
user_msg_test = "Tell me some uses of GPT for data analysis."

# The conversation so far, as a list of role/content dicts (system first, then user).
msgs_test = [
    dict(role="system", content=system_msg_test),
    dict(role="user", content=user_msg_test),
]

# Send the conversation to the chat completions endpoint.
# The response is kept under two names, `rsps_test` and `completion`.
rsps_test = client.chat.completions.create(
    messages=msgs_test,
    model="gpt-3.5-turbo",
)
completion = rsps_test

In [6]:
# Show the full response object, serialized to a JSON string
print("The whole response\n")
response_json = rsps_test.model_dump_json()
display(response_json)

print("\n\n----\n\n")

# Show only the message text of the first choice
print("Just the response's content\n")
first_choice = rsps_test.choices[0]
print(first_choice.message.content)

The whole response



'{"id":"chatcmpl-9Mv36PSXanry4ySXId68IHRssUSHv","choices":[{"finish_reason":"stop","index":0,"logprobs":null,"message":{"content":"GPT can help with data analysis by generating text explanations for data visualizations, summarizing large datasets, and even assisting with natural language queries for analyzing data.","role":"assistant","function_call":null,"tool_calls":null}}],"created":1715249540,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":null,"usage":{"completion_tokens":32,"prompt_tokens":52,"total_tokens":84}}'



----


Just the response's content

GPT can help with data analysis by generating text explanations for data visualizations, summarizing large datasets, and even assisting with natural language queries for analyzing data.


### Asking a question about the dataset

In [7]:
# A description of the dataset, written as Markdown text. It is embedded in the
# prompt sent to the model so that it knows the DataFrame's name and columns.
# NOTE(review): the text says the cars were registered "in 2020", but the
# describe() output earlier in this notebook shows model_year values up to
# 2024 — confirm the registration period; the model's answers repeat "2020".
dataset_description = """
You have a dataset about electric cars registered in Washington state, USA in 2020. It is available as a pandas DataFrame named `electric_cars`.

Each row in the dataset represents the count of the number of cars registered within a city, for a particular model.

The dataset contains the following columns.

- `city` (character): The city in which the registered owner resides.
- `county` (character): The county in which the registered owner resides.
- `model_year` (integer): The [model year](https://en.wikipedia.org/wiki/Model_year#United_States_and_Canada) of the car.
- `make` (character): The manufacturer of the car.
- `model` (character): The model of the car.
- `electric_vehicle_type` (character): Either "Plug-in Hybrid Electric Vehicle (PHEV)" or "Battery Electric Vehicle (BEV)".
- `n_cars` (integer): The count of the number of vehicles registered.
"""

# The request that accompanies the dataset description.
suggest_questions = "Suggest some data analysis questions that could be answered with this dataset."

# Build the conversation: a system message setting the persona, then a human
# message holding the dataset description and the request, separated by a blank line.
msgs_suggest_questions = [
    SystemMessage(content="You are a data analysis expert."),
    HumanMessage(content="\n\n".join([dataset_description, suggest_questions])),
]

In [8]:
# Create the LangChain chat model, authenticated with the OPENAI_KEY environment variable.
chat = ChatOpenAI(openai_api_key=os.environ['OPENAI_KEY'])

# Send the messages with invoke(). Calling the model object directly
# (chat(...)) is deprecated in LangChain and emits a deprecation warning.
rsps_suggest_questions = chat.invoke(msgs_suggest_questions)

# Print the response
print("The whole response\n")
print(rsps_suggest_questions)

print("\n----\n")

# Print just the response's content
print("Just the response's content\n")
print(rsps_suggest_questions.content)

print("\n----\n")

# Print the type of the response
print("The type of the response\n")
print(type(rsps_suggest_questions))

  warn_deprecated(
  warn_deprecated(


The whole response

content='Here are some data analysis questions that could be answered using the provided dataset of electric cars registered in Washington state in 2020:\n\n1. What is the distribution of electric vehicle types (PHEV vs BEV) in the dataset?\n2. Which city had the highest number of electric cars registered in 2020?\n3. How many electric cars of each make were registered in Washington state in 2020?\n4. What is the average count of electric cars registered per city?\n5. Are there any specific models that are more popular among electric car owners in Washington state?\n6. How does the distribution of electric cars vary by county in Washington state?\n7. Can we identify any trends in the registration of electric cars based on the model year?\n8. What is the total number of electric cars registered in Washington state in 2020?\n9. Is there a relationship between the model year of the car and the count of registered electric cars?\n10. How does the count of electric cars 

### Hold a conversation with GPT

In [9]:
# Extend the conversation: the earlier messages, the model's previous reply,
# and a new human request. Assign to msgs_python_top_models.
msgs_python_top_models = msgs_suggest_questions + [
    rsps_suggest_questions,
    HumanMessage(content="Write some Python code to find the top 5 most popular make/model combinations of electric car in Washington.")
]

# Send the conversation with invoke(); calling chat(...) directly is
# deprecated in LangChain. Assign to rsps_python_top_models.
rsps_python_top_models = chat.invoke(msgs_python_top_models)

# Render the reply's content as Markdown so the code block is formatted
display(Markdown(rsps_python_top_models.content))

You can find the top 5 most popular make/model combinations of electric cars in Washington by grouping the dataset by 'make' and 'model', summing the counts of registered cars, and then sorting the results in descending order. Here's the Python code to achieve this:

```python
# Group the dataset by 'make' and 'model' and sum the counts of registered cars
popular_make_model = electric_cars.groupby(['make', 'model'])['n_cars'].sum().reset_index()

# Sort the results in descending order based on the total count of registered cars
popular_make_model = popular_make_model.sort_values(by='n_cars', ascending=False)

# Get the top 5 most popular make/model combinations
top_5_popular_make_model = popular_make_model.head(5)

print(top_5_popular_make_model)
```

This code snippet will provide you with the top 5 most popular make/model combinations of electric cars in Washington state based on the total count of registered cars for each combination.

### Execute the code provided by GPT

In [10]:
# Total registrations per make/model pair, ordered from most to least registered
popular_make_model = (
    electric_cars
    .groupby(['make', 'model'])['n_cars']
    .sum()
    .reset_index()
    .sort_values(by='n_cars', ascending=False)
)

# Keep the first five rows, i.e. the five most registered combinations
top_5_popular_make_model = popular_make_model.head(5)

print(top_5_popular_make_model)

          make    model  n_cars
108      TESLA  MODEL Y   28502
105      TESLA  MODEL 3   27708
91      NISSAN     LEAF   13186
106      TESLA  MODEL S    7611
30   CHEVROLET  BOLT EV    5733


In [11]:
# A second chat model with the sampling temperature set to zero. Assign to chat0.
chat0 = ChatOpenAI(
    openai_api_key=os.environ['OPENAI_KEY'],
    temperature=0,
)

In [12]:
# Extend the conversation with the previous reply and a request for
# Plotly Express bar-plot code.
msgs_python_plot = msgs_python_top_models + [
    rsps_python_top_models,
    HumanMessage(content="Write some Python code to draw a bar plot of the total count of electric cars by model year, with bars colored by electric vehicle type. Use the Plotly Express package.")
]

# Send the conversation to the temperature-zero model with invoke();
# calling chat0(...) directly is deprecated in LangChain.
rsps_python_plot = chat0.invoke(msgs_python_plot)

# Render the reply's content as Markdown
display(Markdown(rsps_python_plot.content))

To draw a bar plot of the total count of electric cars by model year, with bars colored by electric vehicle type using Plotly Express, you can use the following Python code:

```python
import plotly.express as px

# Group the dataset by 'model_year' and 'electric_vehicle_type' and sum the counts of registered cars
count_by_year_type = electric_cars.groupby(['model_year', 'electric_vehicle_type'])['n_cars'].sum().reset_index()

# Draw a bar plot using Plotly Express
fig = px.bar(count_by_year_type, x='model_year', y='n_cars', color='electric_vehicle_type',
             labels={'model_year': 'Model Year', 'n_cars': 'Total Count of Electric Cars'},
             title='Total Count of Electric Cars by Model Year (Colored by Electric Vehicle Type)')

fig.show()
```

This code snippet will create a bar plot using Plotly Express that displays the total count of electric cars by model year, with bars colored by electric vehicle type (PHEV or BEV). You can run this code in a Python environment with the Plotly package installed to visualize the data.

In [13]:
# Send the same message list to the temperature-zero model a second time, so
# the two replies can be compared. invoke() replaces the deprecated chat0(...) call.
rsps_python_plot1 = chat0.invoke(msgs_python_plot)

# Render the second reply's content as Markdown
display(Markdown(rsps_python_plot1.content))

To draw a bar plot of the total count of electric cars by model year, with bars colored by electric vehicle type using Plotly Express, you can use the following Python code:

```python
import plotly.express as px

# Group the dataset by 'model_year' and 'electric_vehicle_type' and sum the counts of registered cars
count_by_year_type = electric_cars.groupby(['model_year', 'electric_vehicle_type'])['n_cars'].sum().reset_index()

# Draw a bar plot using Plotly Express
fig = px.bar(count_by_year_type, x='model_year', y='n_cars', color='electric_vehicle_type',
             labels={'model_year': 'Model Year', 'n_cars': 'Total Count of Electric Cars'},
             title='Total Count of Electric Cars by Model Year (Colored by Electric Vehicle Type)')

fig.show()
```

This code snippet will create a bar plot using Plotly Express where the bars represent the total count of electric cars by model year, and the bars are colored based on the electric vehicle type (PHEV or BEV).

In [14]:
# Run plotting code adapted from GPT's suggestion: the aggregation is the same,
# but the title is shorter and the bars are drawn side by side (barmode='group').
import plotly.express as px

# Total cars registered per (model year, electric vehicle type) pair
year_type_columns = ['model_year', 'electric_vehicle_type']
total_count_by_year_type = (
    electric_cars
    .groupby(year_type_columns)['n_cars']
    .sum()
    .reset_index()
)

# Bar chart: one bar per model year and vehicle type, colored by vehicle type
axis_labels = {'model_year': 'Model Year', 'n_cars': 'Total Count of Electric Cars'}
fig = px.bar(
    total_count_by_year_type,
    x='model_year',
    y='n_cars',
    color='electric_vehicle_type',
    labels=axis_labels,
    title='Total Count of Electric Cars by Model Year',
    barmode='group',
)

fig.show()