## Importing Dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import os
import json
import warnings
from pprint import pprint
warnings.filterwarnings("ignore")
from test_api_key import weather_api_key

## Creating the URL for API

In [2]:

url = "http://api.openweathermap.org/data/2.5/weather?"
city = "New York"
units = "imperial"


## Getting the JSON to analyze the data format

In [3]:
# Build query URL and request your results in Celsius
query_url = f"{url}appid={weather_api_key}&q={city}&units={units}"

# Get weather data
weather_response = requests.get(query_url)
weather_json = weather_response.json()
pprint(weather_json)

{'base': 'stations',
 'clouds': {'all': 100},
 'cod': 200,
 'coord': {'lat': 40.7143, 'lon': -74.006},
 'dt': 1676247582,
 'id': 5128581,
 'main': {'feels_like': 37.58,
          'humidity': 49,
          'pressure': 1012,
          'temp': 43.29,
          'temp_max': 45.05,
          'temp_min': 39.45},
 'name': 'New York',
 'sys': {'country': 'US',
         'id': 4610,
         'sunrise': 1676202856,
         'sunset': 1676240785,
         'type': 1},
 'timezone': -18000,
 'visibility': 10000,
 'weather': [{'description': 'overcast clouds',
              'icon': '04n',
              'id': 804,
              'main': 'Clouds'}],
 'wind': {'deg': 60, 'speed': 10.36}}


## Reading the sample to be worked upon from Phase I

In [4]:
reduced_sample = pd.read_csv('Resources/reduced_sample.csv')
reduced_sample

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount,date,time,Distance (km),Distance (miles),Time Interval,fare_amount_group,Year,Month,Timestamp
0,2012-01-31 07:53:52,-73.957249,40.783054,-73.975848,40.755164,1,10.5,2012-01-31,07:53:52,3.472130,2.157487,7 am - 10 am,0 to 100,2012,Jan,1327996432
1,2011-01-26 01:14:51,-73.993449,40.727387,-73.980559,40.746704,1,6.1,2011-01-26,01:14:51,2.405388,1.494643,12 am- 4 am,0 to 100,2011,Jan,1296004491
2,2012-03-12 08:00:08,-73.981595,40.740777,-73.955755,40.785052,0,12.9,2012-03-12,08:00:08,5.379335,3.342572,7 am - 10 am,0 to 100,2012,March,1331539208
3,2013-03-15 10:27:56,-73.988791,40.749692,-73.971666,40.759037,1,9.5,2013-03-15,10:27:56,1.776659,1.103967,10 am - 1 pm,0 to 100,2013,March,1363343276
4,2009-03-27 17:46:27,-73.991982,40.754068,-73.985752,40.761649,1,6.7,2009-03-27,17:46:27,0.992324,0.616603,5 pm - 8 pm,0 to 100,2009,March,1238175987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,2013-02-20 16:53:00,-73.973503,40.779663,-73.975750,40.748960,1,16.5,2013-02-20,16:53:00,3.417112,2.123300,1 pm - 5 pm,0 to 100,2013,Feb,1361379180
2896,2010-06-09 14:46:34,-74.002181,40.728612,-74.011305,40.708356,1,8.1,2010-06-09,14:46:34,2.378511,1.477942,1 pm - 5 pm,0 to 100,2010,Jun,1276094794
2897,2011-05-19 02:15:24,-73.982342,40.727718,-74.005753,40.740153,4,6.5,2011-05-19,02:15:24,2.407402,1.495894,12 am- 4 am,0 to 100,2011,May,1305771324
2898,2012-06-27 07:31:00,-73.949298,40.776752,-73.953920,40.765150,6,6.1,2012-06-27,07:31:00,1.346674,0.836786,7 am - 10 am,0 to 100,2012,Jun,1340782260


## Creating a short sample of 5 rows to check the data retrieval code from API

In [5]:
short_sample = reduced_sample.sample(n=5).reset_index(drop=True)
short_sample

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount,date,time,Distance (km),Distance (miles),Time Interval,fare_amount_group,Year,Month,Timestamp
0,2014-01-07 11:43:59,-73.968299,40.791886,-73.99125,40.756338,1,13.5,2014-01-07,11:43:59,4.397164,2.732278,10 am - 1 pm,0 to 100,2014,Jan,1389095039
1,2014-11-17 09:06:57,-73.970677,40.783309,-73.953103,40.78863,1,8.5,2014-11-17,09:06:57,1.592501,0.989537,7 am - 10 am,0 to 100,2014,Nov,1416215217
2,2014-12-23 09:32:56,-73.954784,40.765706,-73.959419,40.760263,2,5.5,2014-12-23,09:32:56,0.719751,0.447233,7 am - 10 am,0 to 100,2014,Dec,1419327176
3,2014-12-15 20:42:00,-73.985187,40.72409,-73.997462,40.729037,1,5.0,2014-12-15,20:42:00,1.170814,0.727512,8 pm - 11 pm,0 to 100,2014,Dec,1418676120
4,2014-01-29 19:11:27,-73.98715,40.747905,-73.974196,40.791738,1,17.0,2014-01-29,19:11:27,4.991459,3.101557,5 pm - 8 pm,0 to 100,2014,Jan,1391022687


## Creating a dataframe with new columns of weather statistics where the data will be placed

In [7]:
short_sample['Clouds'] = " "
short_sample['Dew Point'] = " "
short_sample['Feels Like'] = " "
short_sample['Humidity'] = " "
short_sample['Pressure'] = " "
short_sample['Sunrise'] = " "
short_sample['Sunset'] = " "
short_sample['Temp'] = " "
short_sample['Visibility'] = " "
short_sample['Weather Description'] = " "
short_sample['Weather Icon'] = " "
short_sample['Weather ID'] = " "
short_sample['Weather Main'] = " "
short_sample['Wind Degree'] = " "
short_sample['Wind Speed'] = " " 
short_sample['Data Rain'] = " " 
short_sample['Data Snow'] = " " 


# Display sample data
short_sample

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount,date,time,Distance (km),...,Temp,Visibility,Weather Description,Weather Icon,Weather ID,Weather Main,Wind Degree,Wind Speed,Data Rain,Data Snow
0,2014-01-07 11:43:59,-73.968299,40.791886,-73.99125,40.756338,1,13.5,2014-01-07,11:43:59,4.397164,...,,,,,,,,,,
1,2014-11-17 09:06:57,-73.970677,40.783309,-73.953103,40.78863,1,8.5,2014-11-17,09:06:57,1.592501,...,,,,,,,,,,
2,2014-12-23 09:32:56,-73.954784,40.765706,-73.959419,40.760263,2,5.5,2014-12-23,09:32:56,0.719751,...,,,,,,,,,,
3,2014-12-15 20:42:00,-73.985187,40.72409,-73.997462,40.729037,1,5.0,2014-12-15,20:42:00,1.170814,...,,,,,,,,,,
4,2014-01-29 19:11:27,-73.98715,40.747905,-73.974196,40.791738,1,17.0,2014-01-29,19:11:27,4.991459,...,,,,,,,,,,


## Running the API code on the sample of 10 rows

In [8]:
print("Starting weather statistics search")

for index, row in short_sample.iterrows():
    
    lat = row["pickup_latitude"]
    lng = row["pickup_longitude"]
    dt = row["Timestamp"]
    
    
    base_url = f"https://api.openweathermap.org/data/3.0/onecall/timemachine?&lat={lat}&lon={lng}"
    units = "imperial"
    query_url = f"{base_url}&appid={weather_api_key}&units={units}&dt={dt}"


    response_json = requests.get(query_url).json()
    
    try:
        short_sample.loc[index, "Clouds"] =response_json["data"][0]["clouds"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Clouds"] = "Data not found"
    
    try:
        short_sample.loc[index, "Dew Point"] =response_json["data"][0]["dew_point"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Dew Point"] = "Data not found"
    
    try:
        short_sample.loc[index, "Feels Like"] =response_json["data"][0]["feels_like"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Feels Like"] = "Data not found"

    try:
        short_sample.loc[index, "Humidity"] =response_json["data"][0]["humidity"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Humidity"] = "Data not found"

    try:
         short_sample.loc[index, "Pressure"] =response_json["data"][0]["pressure"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Pressure"] = "Data not found"

    try:
        short_sample.loc[index, "Sunrise"] =response_json["data"][0]["sunrise"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Sunrise"] = "Data not found"

    try:
        short_sample.loc[index, "Sunset"] =response_json["data"][0]["sunset"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Sunset"] = "Data not found"

    try:
         short_sample.loc[index, "Temp"] =response_json["data"][0]["temp"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Temp"] = "Data not found"

    try:
        short_sample.loc[index, "Visibility"] =response_json["data"][0]["visibility"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Visibility"] = "Data not found"
    
    try:
        short_sample.loc[index, "Weather Description"] =response_json["data"][0]["weather"][0]["description"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Weather Description"] = "Data not found"

    try:
        short_sample.loc[index, "Weather Icon"] =response_json["data"][0]["weather"][0]["icon"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Weather Icon"] = "Data not found"

    try:
        short_sample.loc[index, "Weather ID"] =response_json["data"][0]["weather"][0]["id"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Weather ID"] = "Data not found"

    try:
        short_sample.loc[index, "Weather Main"] =response_json["data"][0]["weather"][0]["main"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Weather Main"] = "Data not found"

    try:
        short_sample.loc[index, "Wind Degree"] =response_json["data"][0]["wind_deg"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Wind degree"] = "Data not found"

    try:
        short_sample.loc[index, "Wind Speed"] =response_json["data"][0]["wind_speed"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Wind Speed"] = "Data not found"

    try:
        short_sample.loc[index, "Data Rain"] =response_json["data"][0]["data.rain"]
    except (KeyError, IndexError):
        short_sample.loc[index, "Data Rain"] = "Data not found"

    try:
        short_sample.loc[index, "Data Snow"] =response_json["data"][0]["data.snow"]
    except (KeyError, IndexError, ValueError):
        short_sample.loc[index, "Data Snow"] = "Data not found"
    
short_sample

Starting weather statistics search


Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount,date,time,Distance (km),...,Temp,Visibility,Weather Description,Weather Icon,Weather ID,Weather Main,Wind Degree,Wind Speed,Data Rain,Data Snow
0,2014-01-07 11:43:59,-73.968299,40.791886,-73.99125,40.756338,1,13.5,2014-01-07,11:43:59,4.397164,...,5.16,Data not found,clear sky,01n,800,Clear,260,25.32,Data not found,Data not found
1,2014-11-17 09:06:57,-73.970677,40.783309,-73.953103,40.78863,1,8.5,2014-11-17,09:06:57,1.592501,...,39.16,804,moderate rain,10n,501,Rain,0,0.0,Data not found,Data not found
2,2014-12-23 09:32:56,-73.954784,40.765706,-73.959419,40.760263,2,5.5,2014-12-23,09:32:56,0.719751,...,44.26,6437,mist,50n,701,Mist,60,11.5,Data not found,Data not found
3,2014-12-15 20:42:00,-73.985187,40.72409,-73.997462,40.729037,1,5.0,2014-12-15,20:42:00,1.170814,...,48.02,10000,clear sky,01d,800,Clear,70,5.82,Data not found,Data not found
4,2014-01-29 19:11:27,-73.98715,40.747905,-73.974196,40.791738,1,17.0,2014-01-29,19:11:27,4.991459,...,22.15,10000,clear sky,01d,800,Clear,280,12.75,Data not found,Data not found


## Code ran successfully to retrieve the data

## Creating the dataframe for complete data with the columns that we need

In [10]:
year_wise_df = pd.read_csv('Resources/reduced_sample.csv')
year_wise_df['Clouds'] = " "
year_wise_df['Dew Point'] = " "
year_wise_df['Feels Like'] = " "
year_wise_df['Humidity'] = " "
year_wise_df['Pressure'] = " "
year_wise_df['Sunrise'] = " "
year_wise_df['Sunset'] = " "
year_wise_df['Temp'] = " "
year_wise_df['Visibility'] = " "
year_wise_df['Weather Description'] = " "
year_wise_df['Weather Icon'] = " "
year_wise_df['Weather ID'] = " "
year_wise_df['Weather Main'] = " "
year_wise_df['Wind Degree'] = " "
year_wise_df['Wind Speed'] = " " 
year_wise_df['Data Rain'] = " " 
year_wise_df['Data Snow'] = " " 

year_wise_df

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount,date,time,Distance (km),...,Temp,Visibility,Weather Description,Weather Icon,Weather ID,Weather Main,Wind Degree,Wind Speed,Data Rain,Data Snow
0,2012-01-31 07:53:52,-73.957249,40.783054,-73.975848,40.755164,1,10.5,2012-01-31,07:53:52,3.472130,...,,,,,,,,,,
1,2011-01-26 01:14:51,-73.993449,40.727387,-73.980559,40.746704,1,6.1,2011-01-26,01:14:51,2.405388,...,,,,,,,,,,
2,2012-03-12 08:00:08,-73.981595,40.740777,-73.955755,40.785052,0,12.9,2012-03-12,08:00:08,5.379335,...,,,,,,,,,,
3,2013-03-15 10:27:56,-73.988791,40.749692,-73.971666,40.759037,1,9.5,2013-03-15,10:27:56,1.776659,...,,,,,,,,,,
4,2009-03-27 17:46:27,-73.991982,40.754068,-73.985752,40.761649,1,6.7,2009-03-27,17:46:27,0.992324,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,2013-02-20 16:53:00,-73.973503,40.779663,-73.975750,40.748960,1,16.5,2013-02-20,16:53:00,3.417112,...,,,,,,,,,,
2896,2010-06-09 14:46:34,-74.002181,40.728612,-74.011305,40.708356,1,8.1,2010-06-09,14:46:34,2.378511,...,,,,,,,,,,
2897,2011-05-19 02:15:24,-73.982342,40.727718,-74.005753,40.740153,4,6.5,2011-05-19,02:15:24,2.407402,...,,,,,,,,,,
2898,2012-06-27 07:31:00,-73.949298,40.776752,-73.953920,40.765150,6,6.1,2012-06-27,07:31:00,1.346674,...,,,,,,,,,,


## Dividing the data to run the APIs individually i.e. Karan (2009-10), Fahmida (2011-12) and Kevin (2013-14)

## Creating data set for Karan (2009-10) and writing into a CSV

In [11]:
df_2009_2010 = year_wise_df.loc[(year_wise_df["Year"] == 2009) | (year_wise_df["Year"] == 2010)]
df_2009_2010.to_csv('Resources/year_2009_2010_data.csv', index = False)

## Creating data set for Fahmida (2011-12) and writing into a CSV

In [12]:
df_2011_2012 = year_wise_df.loc[(year_wise_df["Year"] == 2011) | (year_wise_df["Year"] == 2012)]
df_2011_2012.to_csv('Resources/year_2011_2012_data.csv', index = False)

## Creating data set for Kevin (2013-14) and writing into a CSV

In [13]:
df_2013_2014 = year_wise_df.loc[(year_wise_df["Year"] == 2013) | (year_wise_df["Year"] == 2014)]
df_2013_2014.to_csv('Resources/year_2013_2014_data.csv', index = False)