# Sourcing Weather Data using NOAA API

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

In [8]:
from dotenv import load_dotenv

# Load key=value pairs from a local .env file into the process environment so the
# os.getenv(...) lookups in later cells (e.g. NOAA_TOKEN, CITIBIKE_2022) can find them.
load_dotenv()

True

## Get weather data using NOAA's API

In [3]:
# NOAA API token, read from the environment rather than hard-coded in the notebook.
# A token can be requested at https://www.ncdc.noaa.gov/cdo-web/token

Token = os.environ.get("NOAA_TOKEN")


#### Extracting the Weather station ID of New York 
Source: https://www.ncdc.noaa.gov/cdo-web/datatools/findstation 
The station at LaGuardia Airport, New York, NY, USA was looked up for the year 2022.
The station ID found is GHCND:USW00014732.

#### Compiling the API URL

The base request: https://www.ncdc.noaa.gov/cdo-web/api/v2/data?

The extra parameters need to be added after the question mark:

1. The datasetid will be Global Historical Climatology Network Daily (GHCND);

2. The datatypeid is a list of the variables that I am interested in, which in this case are the average daily temperature (TAVG), measured in tenths of degrees Celsius (a value of 250 means 25.0°C), and the daily total precipitation (PRCP), measured in tenths of millimeters (0.1 mm; a PRCP value of 23 means 2.3 mm of precipitation that day);

3. The limit defines the maximum number of items to include in the response. The default limit is 25 and the maximum limit is 1000.

4. The stationid specifies which station the data should refer to. The station id used for the purpose of this Exercise is GHCND:USW00014732, as noted earlier.

5. The startdate and enddate define the date range the data should fall within, here 2022-01-01 to 2022-12-31 (01.01.2022 - 31.12.2022).

   Example: https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00094846&startdate=2018-01-01&enddate=2018-12-31


#### Compiled URL for this Project:

https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&datatypeid=PRCP&limit=1000&stationid=GHCND:USW00014732&startdate=2022-01-01&enddate=2022-12-31

In [6]:
# Request one year (2022) of daily TAVG and PRCP observations for station
# GHCND:USW00014732 from the NOAA CDO API. The token is sent in the 'token' header.

noaa_url = (
    'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'
    '?datasetid=GHCND&datatypeid=TAVG&datatypeid=PRCP'
    '&limit=1000&stationid=GHCND:USW00014732'
    '&startdate=2022-01-01&enddate=2022-12-31'
)
# timeout: without it a stalled connection would block the cell indefinitely.
r = requests.get(noaa_url, headers={'token': Token}, timeout=60)
# Fail here, with the HTTP status, if the request was rejected (e.g. missing or invalid
# token) instead of failing later when the response body has no 'results' key.
r.raise_for_status()

In [7]:
# Parse the JSON body of the API response into a Python dictionary

response_text = r.text
d = json.loads(response_text)

In [8]:
d

{'metadata': {'resultset': {'offset': 1, 'count': 730, 'limit': 1000}},
 'results': [{'date': '2022-01-01T00:00:00',
   'datatype': 'PRCP',
   'station': 'GHCND:USW00014732',
   'attributes': ',,W,2400',
   'value': 193},
  {'date': '2022-01-01T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 116},
  {'date': '2022-01-02T00:00:00',
   'datatype': 'PRCP',
   'station': 'GHCND:USW00014732',
   'attributes': ',,W,2400',
   'value': 10},
  {'date': '2022-01-02T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 114},
  {'date': '2022-01-03T00:00:00',
   'datatype': 'PRCP',
   'station': 'GHCND:USW00014732',
   'attributes': ',,W,2400',
   'value': 0},
  {'date': '2022-01-03T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 14},
  {'date': '2022-01-04T00:00:00',
   'datatype': 'PRCP',
   'station': 'GHCND:USW00014732',
   '

## Data Wrangling: Creating lists from dictionary 

In [9]:
# Step 1: Split the API results into one list of records per datatype

records = d['results']
tavg = list(filter(lambda record: record['datatype'] == 'TAVG', records))
prcp = list(filter(lambda record: record['datatype'] == 'PRCP', records))


In [16]:
# Step 2: Pull the two fields we need out of every record: its date and its value

tavg_dates, tavg_temp = [], []
for record in tavg:
    tavg_dates.append(record['date'])
    tavg_temp.append(record['value'])

prcp_dates, prcp_values = [], []
for record in prcp:
    prcp_dates.append(record['date'])
    prcp_values.append(record['value'])


In [17]:
tavg_dates

['2022-01-01T00:00:00',
 '2022-01-02T00:00:00',
 '2022-01-03T00:00:00',
 '2022-01-04T00:00:00',
 '2022-01-05T00:00:00',
 '2022-01-06T00:00:00',
 '2022-01-07T00:00:00',
 '2022-01-08T00:00:00',
 '2022-01-09T00:00:00',
 '2022-01-10T00:00:00',
 '2022-01-11T00:00:00',
 '2022-01-12T00:00:00',
 '2022-01-13T00:00:00',
 '2022-01-14T00:00:00',
 '2022-01-15T00:00:00',
 '2022-01-16T00:00:00',
 '2022-01-17T00:00:00',
 '2022-01-18T00:00:00',
 '2022-01-19T00:00:00',
 '2022-01-20T00:00:00',
 '2022-01-21T00:00:00',
 '2022-01-22T00:00:00',
 '2022-01-23T00:00:00',
 '2022-01-24T00:00:00',
 '2022-01-25T00:00:00',
 '2022-01-26T00:00:00',
 '2022-01-27T00:00:00',
 '2022-01-28T00:00:00',
 '2022-01-29T00:00:00',
 '2022-01-30T00:00:00',
 '2022-01-31T00:00:00',
 '2022-02-01T00:00:00',
 '2022-02-02T00:00:00',
 '2022-02-03T00:00:00',
 '2022-02-04T00:00:00',
 '2022-02-05T00:00:00',
 '2022-02-06T00:00:00',
 '2022-02-07T00:00:00',
 '2022-02-08T00:00:00',
 '2022-02-09T00:00:00',
 '2022-02-10T00:00:00',
 '2022-02-11T00:

#### Creating dataframes and cleaning them

In [22]:
# Build one two-column DataFrame (date + measurement) per datatype
tavg_columns = dict(date=tavg_dates, tavg=tavg_temp)
prcp_columns = dict(date=prcp_dates, prcp=prcp_values)
tavg_df = pd.DataFrame(tavg_columns)
prcp_df = pd.DataFrame(prcp_columns)

In [20]:
tavg_df.head()

Unnamed: 0,date,tavg
0,2022-01-01T00:00:00,116
1,2022-01-02T00:00:00,114
2,2022-01-03T00:00:00,14
3,2022-01-04T00:00:00,-27
4,2022-01-05T00:00:00,32


In [21]:
prcp_df.head()

Unnamed: 0,date,prcp
0,2022-01-01T00:00:00,193
1,2022-01-02T00:00:00,10
2,2022-01-03T00:00:00,0
3,2022-01-04T00:00:00,0
4,2022-01-05T00:00:00,61


In [37]:
# Merge the two frames on 'date' so each day is one row holding both measurements.
# validate='one_to_one' makes pandas raise a MergeError if either frame contains a
# repeated date, instead of silently multiplying rows in the merged result.
weather_df = tavg_df.merge(prcp_df, on='date', validate='one_to_one')

In [25]:
weather_df.head()

Unnamed: 0,date,tavg,prcp
0,2022-01-01T00:00:00,116,193
1,2022-01-02T00:00:00,114,10
2,2022-01-03T00:00:00,14,0
3,2022-01-04T00:00:00,-27,0
4,2022-01-05T00:00:00,32,61


In [28]:
weather_df.shape

(365, 3)

In [30]:
weather_df.dtypes

date    object
tavg     int64
prcp     int64
dtype: object

In [58]:
# Drop the time-of-day part of the date column: parse the ISO timestamp strings, then keep
# only the calendar date (the column then holds datetime.date objects, dtype object).
parsed_dates = pd.to_datetime(weather_df['date'])
weather_df['date'] = parsed_dates.dt.date

In [59]:
weather_df.head()

Unnamed: 0,date,tavg,prcp
0,2022-01-01,116,193
1,2022-01-02,114,10
2,2022-01-03,14,0
3,2022-01-04,-27,0
4,2022-01-05,32,61


In [60]:
# TAVG is reported in tenths of degrees Celsius and PRCP in tenths of millimeters;
# dividing by 10 gives degrees Celsius and millimeters.
# The division also turns the int64 columns into float64 (decimals).
for measurement in ('tavg', 'prcp'):
    weather_df[measurement] = weather_df[measurement] / 10

In [61]:
weather_df.head()

Unnamed: 0,date,tavg,prcp
0,2022-01-01,11.6,19.3
1,2022-01-02,11.4,1.0
2,2022-01-03,1.4,0.0
3,2022-01-04,-2.7,0.0
4,2022-01-05,3.2,6.1


In [62]:
# Save the table
# The original call passed the undefined name `weather_datat`, which raises NameError.
# The target path can be overridden with the WEATHER_DATA environment variable; the
# default is the location the next cell reads the file back from.
weather_data_path = os.getenv(
    "WEATHER_DATA",
    r"C:\Users\analy\Documents\Case-Study5--NewYork_Bike-sharing-service_Stategic_Analysis\Data\weather_data.csv",
)
# index=False: the row index carries no information, so only date/tavg/prcp are written.
weather_df.to_csv(weather_data_path, index=False)


In [3]:
# Reload the saved weather table. The path can be overridden with the WEATHER_DATA
# environment variable; the default is the original hard-coded location.
weather_data_path = os.getenv(
    "WEATHER_DATA",
    r"C:\Users\analy\Documents\Case-Study5--NewYork_Bike-sharing-service_Stategic_Analysis\Data\weather_data.csv",
)
# The file is written with index=False, so its first column is 'date', not a saved index.
# Reading without index_col keeps 'date' as a regular column (as 'YYYY-MM-DD' text),
# which is what the later merge on 'date' and the final CSV export expect.
weather_df = pd.read_csv(weather_data_path)

# Combining Weather DataFrame with New York City Bike 2022 DataFrame

In [12]:
# Load the raw Citi Bike 2022 trips; the file location comes from the CITIBIKE_2022
# environment variable.

citibike_2022_path = os.getenv("CITIBIKE_2022")
# NOTE(review): the station id columns come out as dtype 'object' although the values
# look numeric, which suggests they mix numeric-looking and text ids (pandas then infers
# types chunk by chunk and warns about mixed types). Reading both columns as strings
# keeps every id in one consistent type - confirm against the raw file.
citibike_df = pd.read_csv(
    citibike_2022_path,
    dtype={'start_station_id': str, 'end_station_id': str},
)


  citibike_df = pd.read_csv(citibike_2022_path)


In [13]:
citibike_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831,member
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831,member


In [16]:
citibike_df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [17]:
citibike_df.shape

(29838806, 13)

In [18]:
# Parse the ride start/end timestamps (loaded as object/text columns) into datetimes
for timestamp_col in ('started_at', 'ended_at'):
    citibike_df[timestamp_col] = pd.to_datetime(citibike_df[timestamp_col])


In [19]:
# Calendar date on which each ride started; used later as the key for merging the weather table
ride_start = citibike_df['started_at']
citibike_df['date'] = ride_start.dt.date


In [27]:
# Month name (e.g. "January") of the ride start
ride_start = citibike_df['started_at']
citibike_df['month'] = ride_start.dt.month_name()

In [28]:
# Weekday name (e.g. "Friday") of the ride start
ride_start = citibike_df['started_at']
citibike_df['weekday'] = ride_start.dt.day_name()

In [35]:
# Hour of the day (0-23) in which the ride started
ride_start = citibike_df['started_at']
citibike_df['start_hour'] = ride_start.dt.hour

In [36]:
# Ride duration as a timedelta: end timestamp minus start timestamp
citibike_df['duration'] = citibike_df['ended_at'].sub(citibike_df['started_at'])

In [25]:
citibike_df['duration'].head()

0   0 days 00:08:48.071000
1   0 days 00:10:49.260000
2   0 days 00:13:52.131000
3   0 days 00:35:02.228000
4   0 days 00:20:34.431000
Name: duration, dtype: timedelta64[ns]

In [37]:
# Ride duration in minutes (float): total seconds of the timedelta divided by 60
duration_seconds = citibike_df['duration'].dt.total_seconds()
citibike_df['duration_min'] = duration_seconds / 60

In [40]:
# Column order for the cleaned table: calendar fields first, then ride/rider
# identifiers, timing, stations and finally coordinates.
calendar_cols = ['date', 'month', 'weekday', 'start_hour']
ride_cols = ['ride_id', 'member_casual', 'rideable_type']
timing_cols = ['started_at', 'ended_at', 'duration', 'duration_min']
station_cols = [
    'start_station_name', 'start_station_id',
    'end_station_name', 'end_station_id',
]
coordinate_cols = ['start_lat', 'start_lng', 'end_lat', 'end_lng']

new_order = calendar_cols + ride_cols + timing_cols + station_cols + coordinate_cols


In [41]:
# Rearrange the columns into the order listed in new_order
citibike_df = citibike_df.loc[:, new_order]


In [43]:
citibike_df.head(5)

Unnamed: 0,date,month,weekday,start_hour,ride_id,member_casual,rideable_type,started_at,ended_at,duration,duration_min,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng
0,2022-01-21,January,Friday,13,BFD29218AB271154,member,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,0 days 00:08:48.071000,8.801183,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925
1,2022-01-10,January,Monday,11,7C953F2FD7BE1302,member,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,0 days 00:10:49.260000,10.821,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116
2,2022-01-26,January,Wednesday,10,95893ABD40CED4B8,member,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,0 days 00:13:52.131000,13.86885,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831
3,2022-01-03,January,Monday,8,F853B50772137378,member,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,0 days 00:35:02.228000,35.037133,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831
4,2022-01-22,January,Saturday,14,7590ADF834797B4B,member,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,0 days 00:20:34.431000,20.57385,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831


In [None]:
# Write the cleaned ride table to a pickle file in the project's Data folder
final_clean_pickle = r"C:\Users\analy\Documents\Case-Study5--NewYork_Bike-sharing-service_Stategic_Analysis\Data\finalcitibike_2022_clean.pkl"
citibike_df.to_pickle(final_clean_pickle)

In [None]:
# Save the cleaned frame (parsed timestamps plus the derived columns) as a pickle.
# The target path comes from the CITIBIKE_2022_clean environment variable. The original
# cell read that variable but then ignored it and always wrote to a literal file name;
# that file name is kept as the fallback when the variable is unset or empty.

citibike_2022_clean = os.getenv("CITIBIKE_2022_clean") or "citibike_2022_clean.pkl"
citibike_df.to_pickle(citibike_2022_clean)

In [None]:
# Build the slimmed-down ride table used for the weather merge.
# The derived columns (date, month, weekday, start_hour, duration, duration_min) exist
# only in the cleaned frame built in the cells above, not in the raw CSV behind
# CITIBIKE_2022, so requesting them from that file via usecols raises a ValueError.
# Select them from the cleaned in-memory frame instead.
merge_columns = [
    'date', 'month', 'weekday', 'start_hour',
    'ride_id', 'member_casual', 'rideable_type',
    'started_at', 'ended_at', 'duration', 'duration_min',
    'start_station_name', 'end_station_name'
]

# weather_df is read back from CSV, so its dates are 'YYYY-MM-DD' text. Store the ride
# date in the same form (str() of a datetime.date is its ISO form) so the merge keys
# compare equal instead of producing an empty inner join.
citibike_df1 = citibike_df[merge_columns].assign(
    date=lambda frame: frame['date'].astype(str)
)

In [64]:
# Attach the daily weather values to every ride: inner join of the weather table (left)
# with the ride table (right) on their shared 'date' key
CITIBIKE_2022_weather = weather_df.merge(citibike_df1, how='inner', on='date')


In [None]:
CITIBIKE_2022_weather.shape

In [None]:
CITIBIKE_2022_weather.head()

In [88]:
# Save combined new table
# The output path lives in its own variable: assigning it to CITIBIKE_2022_weather
# replaced the merged DataFrame with a string, so the .to_csv call raised AttributeError.
citibike_2022_weather_path = os.getenv("CITIBIKE_2022_WEATHER")
if not citibike_2022_weather_path:
    # With no path, to_csv would return the whole table as one in-memory string
    # instead of writing a file.
    raise RuntimeError("Environment variable CITIBIKE_2022_WEATHER is not set")
CITIBIKE_2022_weather.to_csv(citibike_2022_weather_path, index=False)

