In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

In [2]:
# Step 2. Load 2022 data
# The three timestamp columns are parsed as datetimes while reading the CSV.
date_columns = ['started_at', 'ended_at', 'date']
df = pd.read_csv("../Data/df_2022.csv", parse_dates=date_columns)

In [11]:
# Wrangling data: inspect the dtype of every column before transforming.
df.dtypes

end_lat                      float64
end_lng                      float64
end_station_id                object
end_station_name              object
ended_at              datetime64[ns]
member_casual                 object
ride_id                       object
rideable_type                 object
start_lat                    float64
start_lng                    float64
start_station_id              object
start_station_name            object
started_at            datetime64[ns]
start_station_type            object
end_station_type              object
date                  datetime64[ns]
PRCP                         float64
TAVG                         float64
TMAX                         float64
TMIN                         float64
dtype: object

In [12]:
# Create a month column
# Parse 'date' with an explicit format, store it back, and derive the
# calendar month from it as an integer column.

trip_dates = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['date'] = trip_dates
df['month'] = trip_dates.dt.month.astype('int')

In [13]:
# Create the season column
#
# Month -> season lookup; any month that is not listed falls back to "fall"
# (for calendar months that means October and November).
# NOTE(review): these boundaries differ from the usual meteorological
# seasons -- March and April are labelled winter and spring is May only.
# Confirm this is intentional before relying on the season labels.
season_by_month = {
    **dict.fromkeys((12, 1, 2, 3, 4), "winter"),
    5: "spring",
    **dict.fromkeys((6, 7, 8, 9), "summer"),
}

df['season'] = [season_by_month.get(month, "fall") for month in df['month']]

In [14]:
# Row/column count of the frame, including the derived month and season columns.
df.shape

(598215, 22)

In [15]:
# List the current column names (before they are renamed).
df.columns

Index(['end_lat', 'end_lng', 'end_station_id', 'end_station_name', 'ended_at',
       'member_casual', 'ride_id', 'rideable_type', 'start_lat', 'start_lng',
       'start_station_id', 'start_station_name', 'started_at',
       'start_station_type', 'end_station_type', 'date', 'PRCP', 'TAVG',
       'TMAX', 'TMIN', 'month', 'season'],
      dtype='object')

In [17]:
# Renaming column names
# Map the abbreviated source names to descriptive ones; columns that are not
# listed keep their current name.
column_renames = {
    # rider / bike attributes
    'member_casual': 'user_type',
    'rideable_type': 'bike_type',
    # coordinates
    'start_lat': 'start_latitude',
    'start_lng': 'start_longitude',
    'end_lat': 'end_latitude',
    'end_lng': 'end_longitude',
    # weather measurements
    'PRCP': 'precipitation',
    'TAVG': 'temp_avg',
    'TMAX': 'temp_max',
    'TMIN': 'temp_min',
}
df = df.rename(columns=column_renames)

In [19]:
# Verify the column names after renaming.
df.columns

Index(['end_latitude', 'end_longitude', 'end_station_id', 'end_station_name',
       'ended_at', 'user_type', 'ride_id', 'bike_type', 'start_latitude',
       'start_longitude', 'start_station_id', 'start_station_name',
       'started_at', 'start_station_type', 'end_station_type', 'date',
       'precipitation', 'temp_avg', 'temp_max', 'temp_min', 'month', 'season'],
      dtype='object')

In [20]:
# Step 3. Creating Plotly charts
## Count rows per start station

# Every row gets value = 1, so summing 'value' within a group counts its rows.
df['value'] = 1
df_groupby_bar = (
    df.groupby('start_station_name', as_index=False)['value']
      .sum()
)
# Keep the 20 stations with the largest counts.
top20 = df_groupby_bar.nlargest(20, 'value')

In [21]:
# Vertical bar chart: one bar per station in top20, bar height = its count.
station_bars = go.Bar(x=top20['start_station_name'], y=top20['value'])
fig = go.Figure(station_bars)
fig.show()

In [22]:
import plotly.graph_objects as go

# Count rows per start station: each row carries value = 1, so the per-group
# sum of 'value' is the number of rows for that station.
df['value'] = 1
df_groupby_bar = (
    df.groupby('start_station_name', as_index=False)
      .agg({'value': 'sum'})
)

# Take the 20 largest counts, then order them ascending; with a horizontal
# bar chart the last row is drawn at the top, so the busiest station leads.
top20 = (
    df_groupby_bar
    .nlargest(20, 'value')
    .sort_values('value', ascending=True)
)

# Horizontal bars leave room for the station names on the y axis.
horizontal_bars = go.Bar(
    x=top20['value'],
    y=top20['start_station_name'],
    orientation='h',
    marker_color='teal',  # change color as desired
)
fig = go.Figure(horizontal_bars)

# Titles, theme, and a wider left margin for long station names.
fig.update_layout(
    title='Top 20 Starting Stations by Number of Trips (2022)',
    xaxis_title='Number of Trips',
    yaxis_title='Start Station',
    yaxis=dict(tickmode='linear'),
    template='plotly_white',
    margin=dict(l=150),
)

fig.show()

In [23]:
# Write top20 to 'top20.csv' (relative to the working directory), omitting the index.
top20.to_csv('top20.csv', index=False)

In [24]:
# Step 4. Creating a dual axis chart

# Count rows per date: 'value' is 1 on every row, so its per-date sum is the
# daily count. Named aggregation gives the result column its final name.
df_daily = (
    df.groupby('date', as_index=False)
      .agg(bike_rides_daily=('value', 'sum'))
)

df_daily.head()


Unnamed: 0,date,bike_rides_daily
0,2022-01-01,592
1,2022-01-02,1248
2,2022-01-03,832
3,2022-01-04,934
4,2022-01-05,914


In [27]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Step 1: Count rows per date (each row carries value = 1)
df['value'] = 1
df_daily = (
    df.groupby('date', as_index=False)
      .agg(bike_rides_daily=('value', 'sum'))
)

# Step 2: Attach the per-date mean of temp_avg.
# Both frames are grouped on 'date', so the left merge keeps one row per date.
df_temp = df.groupby('date', as_index=False)['temp_avg'].mean()
df_daily = df_daily.merge(df_temp, on='date', how='left')

# Step 3: Build the dual-axis line chart
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Daily counts go on the primary (left) y axis.
rides_trace = go.Scatter(
    x=df_daily['date'],
    y=df_daily['bike_rides_daily'],
    name='Daily Trips',
    line=dict(color='teal'),
)
fig.add_trace(rides_trace, secondary_y=False)

# Temperature goes on the secondary (right) y axis.
temperature_trace = go.Scatter(
    x=df_daily['date'],
    y=df_daily['temp_avg'],
    name='Avg Temp',
    line=dict(color='orange'),
)
fig.add_trace(temperature_trace, secondary_y=True)

# Figure-level title, x-axis label, and theme.
fig.update_layout(
    title='Daily Bike Rides vs Average Temperature (2022)',
    xaxis_title='Date',
    template='plotly_white',
)

# One title per y axis.
# NOTE(review): the label states degrees Celsius, but nothing in this notebook
# establishes the unit of temp_avg -- confirm against the weather data source.
fig.update_yaxes(title_text='Number of Trips', secondary_y=False)
fig.update_yaxes(title_text='Average Temperature (°C)', secondary_y=True)

fig.show()