In [1]:
!pip install openai
!pip install chart_studio

Collecting openai
  Downloading openai-1.3.8-py3-none-any.whl.metadata (17 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.8.0-py3-none-any.whl (20 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.2-py3-none-any.whl.metadata (6.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading openai-1.3.8-py3-none-any.whl (221 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.5/221.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m2.1 M

In [2]:
import numpy as np
import pandas as pd
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import psycopg2
from sqlalchemy import create_engine
import os
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import chart_studio.plotly as py
#import plotly.offline as pyo 
#pyo.init_notebook_mode() ## ensures that the plotly graphics convert to HTML
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')
from dash import dash_table
import dash_bootstrap_components as dbc
import pymongo
from bson.json_util import loads, dumps
mongo_username = os.getenv('MONGO_INITDB_ROOT_USERNAME')
mongo_password = os.getenv('MONGO_INITDB_ROOT_PASSWORD')
mongo_init_db = os.getenv('MONGO_INITDB_DATABASE')
openaikey = os.getenv('openai')
import wordcloud
import matplotlib.pyplot as plt
from wordcloud import wordcloud
import string
from openai import OpenAI
import requests
from PIL import Image
from io import BytesIO
import base64
import json
#import geopandas as gpd
#import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable


In [3]:
engine = create_engine('postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}'.format(
    user = 'postgres',
    password = POSTGRES_PASSWORD,
    host = 'postgres',
    port = 5432,
    db = 'bird'
))

# return birdlist and drop duplicate

In [4]:
myquery = f'''
    WITH af_1_selected AS (
        SELECT
            event_year,
            species_id,
            iso_subdivision
        FROM
            bird
    ),
    
    count_per_year AS (
        SELECT
            event_year,
            species_id,
            iso_subdivision,
            COUNT(*) AS item_count
        FROM
            af_1_selected
        GROUP BY
            event_year,
            species_id,
            iso_subdivision
    )
    
    SELECT
        cpy.event_year,
        cpy.species_id,
        cpy.iso_subdivision,
        s.species_name,
        cpy.item_count
    FROM
        count_per_year cpy
    JOIN
        species s ON cpy.species_id = s.species_id;
    
'''
countbyyear= pd.read_sql_query(myquery, con = engine)

In [5]:
bird_list = [{'label': y, 'value': y} for x, y in zip(countbyyear['species_id'], countbyyear['species_name'])]
state_list = [{'label': x, 'value': y} for x, y in zip(countbyyear['iso_subdivision'], countbyyear['iso_subdivision'])]
def drop_duplicate_dicts(list_of_dicts):
    unique_dicts = set()
    result = []

    for d in list_of_dicts:
        # Convert the dictionary to a JSON string
        json_str = json.dumps(d, sort_keys=True)
        
        # Check if this JSON representation has already been added
        if json_str not in unique_dicts:
            unique_dicts.add(json_str)
            result.append(d)

    return result

bird_list = drop_duplicate_dicts(bird_list)
state_list = drop_duplicate_dicts(state_list)

In [6]:
iso_subdivisions = countbyyear['iso_subdivision'].unique()

In [7]:
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

In [8]:
mymarkdown = '''
This is my final project on bird banding dataset, I hope you can find the birds you are interested in and know about where to see them. Here are my data sources.

* [North American Bird Banding Program](https://www.sciencebase.gov/catalog/)
'''

In [10]:
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
# specification of what goes on
app.layout = html.Div(
    [
        # Stuff on Top
        html.H1("See where the birds were and where they are going!"),
        html.H2("This is a final project of University of Virginia Data Engineering Course."),
        html.H3("M.Y."),
        # Side Bar
        html.Div([
            dcc.Markdown('Please select the bird you like!'),
            dcc.Dropdown(id='species_name', options=bird_list, value='Lesser Snow Goose'),
            # New Dropdown for States
            dcc.Markdown(mymarkdown)
        ], style={'width': '24%', 'float': 'left'}),
        ###### Main bar for birds###
        html.Div([
            dcc.Tabs([
                dcc.Tab(label = 'Bird Image', children = [
                    html.Div([html.Img(id = 'birdimage', style={'height':'100%', 'width':'100%'})], style = {'width': '24%', 'float':'left'}),
                    html.Div([dcc.Graph(id = 'heatmap')], style = {'width': '74%', 'float':'right'})
                 ]),

                # dcc.Tab(label = 'Birds Change Over Time', children = [
                #         # Checklist for iso_subdivisions
                #         dcc.Checklist(
                #             id='iso_subdivision_checklist',
                #             options=[{'label': i, 'value': i} for i in iso_subdivisions],
                #             value=list(iso_subdivisions),  # Default all selected
                #             labelStyle={'display': 'block'}
                #         ),
                        
                #         # Line plot
                #         dcc.Graph(id='line-plot')
                #                     ]),

                dcc.Tab(label='Population Over Time', value='tab1', children=[
                    dcc.Checklist(
                        id='state-checklist',
                        options=[{'label': i, 'value': i} for i in iso_subdivisions],
                        value=list(iso_subdivisions),
                        labelStyle={'display': 'block'}  # Select all states by default
                    ),
                    dcc.Graph(id='population-over-time')]),   
    
                dcc.Tab(label = 'Birds Distributed Over States', children = [
                    dcc.Graph(id = 'timeslider', style = {'height': '100%', 'width': '100%'})
            ])
            ])
        ], style = {'width': '74%', 'float': 'right'})
    ]
)


@app.callback(
    Output('state-checklist', 'options'),
    [Input('species_name', 'value')]
)
def update_state_checklist(selected_species):
    # You may want to replace this with logic to fetch states based on the selected species
    available_states = countbyyear[countbyyear['species_name'] == selected_species]['iso_subdivision'].unique()
    return [{'label': state, 'value': state} for state in iso_subdivisions]

# Callback to update the line plot based on selected states
@app.callback(
    Output('population-over-time', 'figure'),
    [Input('species_name', 'value'),
     Input('state-checklist', 'value')]
)
def update_population_plot(selected_species, selected_states):
    myquery = f'''
        WITH af_1_selected AS (
            SELECT
                event_year,
                species_id,
                iso_subdivision
            FROM
                bird
        ),
        
        count_per_year AS (
            SELECT
                event_year,
                species_id,
                iso_subdivision,
                COUNT(*) AS item_count
            FROM
                af_1_selected
            GROUP BY
                event_year,
                species_id,
                iso_subdivision
        )
        
        SELECT
            cpy.event_year,
            cpy.species_id,
            cpy.iso_subdivision,
            s.species_name,
            cpy.item_count
        FROM
            count_per_year cpy
        JOIN
            species s ON cpy.species_id = s.species_id
        Where
            s.species_name = '{selected_species}';
        
    '''
    data = pd.read_sql_query(myquery, con = engine)
    filtered_data = data[(data['species_name'] == selected_species) & (data['iso_subdivision'].isin(selected_states))]
    fig = px.line(filtered_data, x='event_year', y='item_count', color='iso_subdivision', title=f'Population of {selected_species} Over Time')
    return fig

#########################################################################
@app.callback([Output(component_id = 'birdimage', component_property = 'src')],
             [Input(component_id = 'species_name', component_property = 'value')])

def birdimage(b):
    OPENAI_API_KEY=openaikey
    client = OpenAI(api_key = OPENAI_API_KEY)
    response = client.images.generate(
      model="dall-e-3",
      prompt=f"a realistic photo of {b} with no text",
      size="1024x1024",
      quality="standard",
      n=1,
    )
    image_url = response.data[0].url

    return [image_url]

#########################################################################
# @app.callback([Output('line-plot', 'figure')],
#     [Input('species_name', 'value'),
#     Input('iso_subdivision_checklist', 'value')])
# def update_graph(selected_subdivisions, selected_species):
#     myquery = f'''
#     WITH af_1_selected AS (
#         SELECT
#             event_year,
#             species_id,
#             iso_subdivision
#         FROM
#             bird
#     ),
    
#     count_per_year AS (
#         SELECT
#             event_year,
#             species_id,
#             iso_subdivision,
#             COUNT(*) AS item_count
#         FROM
#             af_1_selected
#         GROUP BY
#             event_year,
#             species_id,
#             iso_subdivision
#     )
    
#     SELECT
#         cpy.event_year,
#         cpy.species_id,
#         cpy.iso_subdivision,
#         s.species_name,
#         cpy.item_count
#     FROM
#         count_per_year cpy
#     JOIN
#         species s ON cpy.species_id = s.species_id
#     Where
#         s.species_name = '{selected_species}';
    
# '''
#     df = pd.read_sql_query(myquery, con = engine)
#     # Filter dataframe based on selected checkboxes
#     filtered_df = df[df['iso_subdivision'].isin(selected_subdivisions)]
#     filtered_df["event_year"] = sorted(filtered_df["event_year"])
#     # Create the figure
#     fig = px.line(filtered_df, x="event_year", y="item_count", color='iso_subdivision')
#     return fig


#########################################################################
@app.callback([Output(component_id = 'timeslider', component_property = 'figure')],
             [Input(component_id = 'species_name', component_property = 'value')])

def timeslider(b):
    myquery = f'''
    WITH af_1_selected AS (
        SELECT
            event_year,
            species_id,
            iso_subdivision
        FROM
            bird
    ),
    
    count_per_year AS (
        SELECT
            event_year,
            species_id,
            iso_subdivision,
            COUNT(*) AS item_count
        FROM
            af_1_selected
        GROUP BY
            event_year,
            species_id,
            iso_subdivision
    )
    
    SELECT
        cpy.event_year,
        cpy.species_id,
        cpy.iso_subdivision,
        s.species_name,
        cpy.item_count
    FROM
        count_per_year cpy
    JOIN
        species s ON cpy.species_id = s.species_id
    WHERE 
        s.species_name = '{b}';
    '''
    df_sel = pd.read_sql_query(myquery, con = engine)
    #df =df[df.species_name == b]
    # Create a list of all years

    years = sorted(df_sel['event_year'].unique())
    # Creating the figure
    fig = go.Figure()
    
    # Add one trace for each year
    for year in years:
        df_year = df_sel[df_sel['event_year'] == year]
        fig.add_trace(
            go.Choropleth(
                locations=df_year['iso_subdivision'],
                z=df_year['item_count'],
                text=df_year['species_name'],
                colorscale='Viridis',
                autocolorscale=False,
                showscale=True,
                #geojson = 'us-states.json',
                geojson='https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json',
                featureidkey="properties.name"
            )
        )
    
    # Make all traces invisible at the start
    for trace in fig.data:
        trace.visible = False
    
    # Make the first trace visible
    fig.data[0].visible = True
    
    # Create and add slider
    steps = []
    for i, year in enumerate(years):
        step = dict(
            method="update",
            args=[{"visible": [False] * len(years)},
                  {"title": f"Item count for year: {year}"}],
            label=str(year)
        )
        step["args"][0]["visible"][i] = True  
        steps.append(step)
    
    sliders = [dict(
        active=0,
        currentvalue={"prefix": "Year: "},
        pad={"t": 50},
        steps=steps
    )]
    
    fig.update_layout(
        sliders=sliders,
        title='Birds population changes over time',
        height=600,  
        width=1000   )

    return [fig]

#########################################################################
@app.callback([Output(component_id = 'heatmap', component_property = 'figure')],
             [Input(component_id = 'species_name', component_property = 'value')])

def heatmap(b):
    myquery = f'''
    WITH af_1_selected AS (
    SELECT
        event_year,
        species_id,
        iso_subdivision,
        lat_dd,
        lon_dd
    FROM
        bird
),

count_per_year AS (
    SELECT
        event_year,
        species_id,
        iso_subdivision,
        lat_dd,
        lon_dd,
        COUNT(*) AS item_count
    FROM
        af_1_selected
    GROUP BY
        event_year,
        species_id,
        iso_subdivision,
        lat_dd, 
        lon_dd  
)

SELECT
    cpy.event_year,
    cpy.species_id,
    s.species_name,
    cpy.iso_subdivision,
    cpy.item_count,
    cpy.lat_dd,
    cpy.lon_dd
FROM
    count_per_year cpy
JOIN
    species s ON cpy.species_id = s.species_id
WHERE 
    s.species_name = '{b}';
    '''
    forheat = pd.read_sql_query(myquery, con = engine)
    target_time = 2022
    target_df = forheat[forheat['event_year'] == target_time]
    if not target_df.empty:
        # Create a Plotly Figure
        target_df['hover_text'] = target_df.apply(lambda row: f"Geo: {row['iso_subdivision']}, Lat: {row['lat_dd']}, Lon: {row['lon_dd']}", axis=1)
        fig = px.scatter_geo(target_df, lat='lat_dd', lon='lon_dd', 
                hover_name='hover_text', size='item_count', 
                projection='mercator', title=f'Heatmap of bird population at {target_time}')
        return [fig]
    else:
        # Handle empty DataFrame case
        return px.scatter_geo(title='No Data Available')
    
    
#########################################################################

if __name__ == "__main__":
    app.run_server(mode='external', host = '0.0.0.0', port = '8050', debug = True)

[1;31m---------------------------------------------------------------------------[0m
[1;31mAuthenticationError[0m                       Traceback (most recent call last)
File [1;32m/usr/local/lib/python3.11/site-packages/openai/resources/images.py:251[0m, in [0;36mImages.generate[1;34m(
    self=<openai.resources.images.Images object>,
    prompt='a realistic photo of Lesser Snow Goose with no text',
    model='dall-e-3',
    n=1,
    quality='standard',
    response_format=NOT_GIVEN,
    size='1024x1024',
    style=NOT_GIVEN,
    user=NOT_GIVEN,
    extra_headers=None,
    extra_query=None,
    extra_body=None,
    timeout=NOT_GIVEN
)[0m
[0;32m    193[0m [38;5;28;01mdef[39;00m [38;5;21mgenerate[39m(
[0;32m    194[0m     [38;5;28mself[39m,
[0;32m    195[0m     [38;5;241m*[39m,
[1;32m   (...)[0m
[0;32m    209[0m     timeout: [38;5;28mfloat[39m [38;5;241m|[39m httpx[38;5;241m.[39mTimeout [38;5;241m|[39m [38;5;28;01mNone[39;00m [38;5;241m|[39m NotGiv

In [11]:

    myquery = f'''
    WITH af_1_selected AS (
        SELECT
            event_year,
            species_id,
            iso_subdivision
        FROM
            bird
    ),
    
    count_per_year AS (
        SELECT
            event_year,
            species_id,
            iso_subdivision,
            COUNT(*) AS item_count
        FROM
            af_1_selected
        GROUP BY
            event_year,
            species_id,
            iso_subdivision
    )
    
    SELECT
        cpy.event_year,
        cpy.species_id,
        cpy.iso_subdivision,
        s.species_name,
        cpy.item_count
    FROM
        count_per_year cpy
    JOIN
        species s ON cpy.species_id = s.species_id;
    
'''
    data = pd.read_sql_query(myquery, con = engine)
    # Filter dataframe based on selected checkboxes
    filtered_df = df[df['iso_subdivision'].isin(selected_subdivisions)]
    filtered_df["event_year"] = sorted(filtered_df["event_year"])
    # Create the figure
    fig = px.line(filtered_df, x="event_year", y="item_count", color='iso_subdivision')
    fig

NameError: name 'df' is not defined

In [15]:
selected_subdivisions = ['Virginia']

In [12]:
data

Unnamed: 0,event_year,species_id,iso_subdivision,species_name,item_count
0,1960,1326,Delaware,Mallard X American Black Duck Hybrid,50
1,1960,1326,Illinois,Mallard X American Black Duck Hybrid,15
2,1960,1326,Indiana,Mallard X American Black Duck Hybrid,4
3,1960,1326,Kentucky,Mallard X American Black Duck Hybrid,2
4,1960,1326,Maine,Mallard X American Black Duck Hybrid,4
...,...,...,...,...,...
27597,2022,1770,Florida,Black-bellied Whistling-Duck,32
27598,2022,1770,Louisiana,Black-bellied Whistling-Duck,884
27599,2022,1770,Mississippi,Black-bellied Whistling-Duck,498
27600,2022,1770,Texas,Black-bellied Whistling-Duck,12
