In [1]:
#### BOOK FOR WEATHER SCRAPPING

### Using Open Weather's API - Free Account - Limited to 1000 free calls per day- only 2000 calls per day total
### After 1000 in a day billed at $0.15 per 100 calls

API_KEY = 'e85f2b341782ffd493fe2e354727db0b'

### Link to docs https://openweathermap.org/api/one-call-3#history-how

## File PATH

## Scraped data for games with HR stats with full team names
path = 'data/NCAA_D1/ESPN_HR_data_baseball_scrape.csv'



## Historical weather data
To learn about how get access to historical weather data for any timestamp from 1st January 1979 till now, please use this section of the documentation.

If you are interested in current weather data, forecasts and weather alerts please read the "Current and forecast weather data" section.
If you are interested in daily aggregated historical weather data, please read the "History Daily Aggregation" section of documentation.

## How to make an API call
### API call

https://api.openweathermap.org/data/3.0/onecall/timemachine?lat={lat}&lon={lon}&dt={time}&appid={API key}

### Parameters
#### lat	
required	Latitude, decimal (-90; 90). If you need the geocoder to automatic convert city names and zip-codes to geo coordinates and the other way around, please use our Geocoding API

#### lon	
required	Longitude, decimal (-180; 180). If you need the geocoder to automatic convert city names and zip-codes to geo coordinates and the other way around, please use our Geocoding API

#### dt
	required	Timestamp (Unix time, UTC time zone), e.g. dt=1586468027. Data is available from January 1st, 1979.

#### appid
	required	Your unique API key (you can always find it on your account page under the "API key" tab)

#### units
	optional	Units of measurement. standard, metric and imperial units are available. If you do not use the units parameter, standard units will be applied by default. Learn more
#### lang	
    optional	You can use the lang parameter to get the output in your language. Learn more

Please note that the one API response contains historical weather data for only one specified timestamp.

In [3]:
## Load Libraries
import requests
import json
import pandas as pd
import numpy as np
import datetime

from tqdm import tqdm

## Load Data
df = pd.read_csv(path)

In [None]:
## Look up a game from Founders Park in the data

# df.loc[df['location'] == 'Founders Park']

# df.head()

In [None]:
## Change the names of the problem parks that couldn't be found in the google api
# Assuming your DataFrame is named 'df'
substitutions = {
    'Mac Nease Baseball Park': 'Russ Chandler Stadium',
    'Jim Patterson Stadium': 'Jim Patterson Stadium Louisville',
    'Founders Park': 'Founders Park South Carolina',
    'Eddie Pellagrini Baseball Diamond': 'John Shea Field (Demolished)',
    'FedExPark Avron Fogelman Field': 'FedEx Park Memphis',
    'Riders Field': 'Riders Field Frisco'
}

df['location'] = df['location'].replace(substitutions)





In [None]:
# df.info()

In [None]:
print(len(df['location'].unique()))

df['location'].value_counts()


In [None]:
df.head()

## Convert date, time and year columns into the correct format
import pandas as pd
from datetime import datetime

# Assuming your DataFrame is named 'df'
df['datetime'] = pd.to_datetime(df['year'].astype(str) + ' ' + df['date'] + ' ' + df['time'])
df['unix_timestamp'] = df['datetime'].apply(lambda x: int(x.timestamp()))

# Alternatively, if you only want the date, year, and time columns as Unix timestamp
# without creating a new 'datetime' column, you can use the following code:
# df['unix_timestamp'] = pd.to_datetime(df['year'].astype(str) + ' ' + df['date'] + ' ' + df['time']).apply(lambda x: int(x.timestamp()))



In [None]:
# df.head()

In [None]:
### Get a list of all of the locations in the HR dataset and use google api to get a lat and long for each location

## get all unique locations
locations = df['location'].unique()

len(locations)

In [None]:


import googlemaps

# Your API Key goes here
gmaps = googlemaps.Client(key='AIzaSyA_BhlTupRdBPBhRptQuR6pYorMVYQnRMA')


# Create a list to store the results
results = []

# Loop through all locations
for location in tqdm(locations):
    # Geocode location
    geocode_result = gmaps.geocode(location)
    # If a result was returned, append the result as a dictionary to the results list
    if geocode_result:
        latitude = geocode_result[0]['geometry']['location']['lat']
        longitude = geocode_result[0]['geometry']['location']['lng']
        results.append({'location': location, 'latitude': latitude, 'longitude': longitude})
    else:
        print(f"Could not find coordinates for {location}.")

# Create a DataFrame from the results
df_locations = pd.DataFrame(results)

# Print the DataFrame

print(df_locations)

In [None]:
# df['latitude'].isnull().sum()

In [None]:
## Merge the lat and long data with the original dataset
df = df.merge(df_locations, on='location', how='left')


In [None]:
# df.head()

In [None]:
df['latitude'].isnull().sum()

len(df)

In [None]:
break

In [None]:
### Filter the dataframe to create a sample to send throught he API call
## drop any rows with not lat or long
df = df.dropna(subset=['latitude', 'longitude'])

# df = df.sample(5)


In [4]:
### TEMPORARY BLOCK - Load the partially filled dataframe, filter for those that don't have results and then run the API call on those rows
## Load the partial file

## jbancroftsmith - API KEY
# API_KEY = '9e69059f84d17c13de92f5bb94f4fb8'
df = pd.read_csv('data/NCAA_D1/ESPN_HR_data_baseball_scrape_with_weather_partial_v1.csv')

# df.columns
## Split into dataframes with reults and without
df_with_results = df.dropna(subset=['weather_data'])

print(len(df_with_results))

df_without_results = df[df['weather_data'].isnull()]
print(len(df_without_results))

1039
695


In [5]:
## rename the frame with no results to the default name to pass to the API call
df = df_without_results

In [6]:
### Make an API call to get the weather data
from time import sleep
import time

## Create a list to store the results
results = []

# Loop through all locations
for index, row in tqdm(df.iterrows()):
    ## Delay to limit rate of API calls
    time.sleep(.25)
    # Get the latitude and longitude for the current row
    latitude = row['latitude']
    longitude = row['longitude']
    # Get the Unix timestamp for the current row
    unix_timestamp = row['unix_timestamp']
    # Create the API request URL
    url = f'https://api.openweathermap.org/data/3.0/onecall/timemachine?lat={latitude}&lon={longitude}&dt={unix_timestamp}&appid={API_KEY}&units=imperial'
    # Make the API request
    response = requests.get(url)
    # If the response was successful, append the JSON object to the results list
    if response.status_code == 200:
        results.append(response.json())
    # If the response was unsuccessful, print the response code
    else:
        results.append('failed')
        print(f"Could not get data for row {index}. Response code: {response.status_code}")
        print(response.text)

    # Store the reults in original dataframe
df['weather_data'] = results

df.head()





695it [15:32,  1.34s/it]


Unnamed: 0,location,date,time,team_1,team_2,runs_1,hits_1,errors_1,home_runs_1,runs_2,hits_2,errors_2,home_runs_2,home_runs,year,datetime,unix_timestamp,latitude,longitude,weather_data
1039,Florida Ballpark at Alfred A. McKethan Field,June 3,1:00 PM,Liberty Flames,Oklahoma Sooners,3,10,2,0,16,17,0,3,3.0,2022,2022-06-03 13:00:00,1654261200,27.664827,-81.515754,"{'lat': 27.6648, 'lon': -81.5158, 'timezone': ..."
1040,Baum-Walker Stadium,June 2,8:00 PM,Arkansas Razorbacks,Southern Miss Golden Eagles,10,13,1,2,2,5,1,1,3.0,2018,2018-06-02 20:00:00,1527969600,36.049888,-94.182241,"{'lat': 36.0499, 'lon': -94.1822, 'timezone': ..."
1041,Jack Coombs Field,May 12,6:00 PM,Georgia Tech Yellow Jackets,Duke Blue Devils,8,9,1,3,5,10,0,1,4.0,2023,2023-05-12 18:00:00,1683914400,35.998086,-78.944236,"{'lat': 35.9981, 'lon': -78.9442, 'timezone': ..."
1042,Hawkins Field,April 28,7:00 PM,Kentucky Wildcats,Vanderbilt Commodores,4,6,0,0,6,9,0,1,1.0,2023,2023-04-28 19:00:00,1682708400,36.143398,-86.807393,"{'lat': 36.1434, 'lon': -86.8074, 'timezone': ..."
1043,Hawkins Field,April 11,7:00 PM,North Alabama Lions,Vanderbilt Commodores,2,2,1,0,14,13,1,0,0.0,2023,2023-04-11 19:00:00,1681239600,36.143398,-86.807393,"{'lat': 36.1434, 'lon': -86.8074, 'timezone': ..."


In [7]:
print(len(results))

# Assuming the API calls were successful for the first 1039 rows

# Check the length of the results
print(len(results))

## Apply the reults to the dataframe
df['weather_data'] = results

# Apply the results to the corresponding subset of rows in the DataFrame
# df.loc[:1038, 'weather_data'] = results



695
695


In [None]:
print(results)

In [8]:
### append the dataframe with new results to the one that already has them 
df = df.append(df_with_results)

len(df)

  df = df.append(df_with_results)


1734

In [24]:
df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1734 entries, 1039 to 1038
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   location         1734 non-null   object 
 1   date             1734 non-null   object 
 2   time             1734 non-null   object 
 3   team_1           1734 non-null   object 
 4   team_2           1734 non-null   object 
 5   runs_1           1734 non-null   int64  
 6   hits_1           1734 non-null   int64  
 7   errors_1         1734 non-null   int64  
 8   home_runs_1      1734 non-null   int64  
 9   runs_2           1734 non-null   int64  
 10  hits_2           1734 non-null   int64  
 11  errors_2         1734 non-null   int64  
 12  home_runs_2      1734 non-null   int64  
 13  home_runs        1734 non-null   float64
 14  year             1734 non-null   int64  
 15  datetime         1734 non-null   object 
 16  unix_timestamp   1734 non-null   int64  
 17  latitude   

TypeError: string indices must be integers

In [None]:
# df.sample(5)

In [15]:
## Show one of the weather data objects

print(json.dumps(results[1], indent=4, sort_keys=True))



{
    "data": [
        {
            "clouds": 75,
            "dew_point": 73.29,
            "dt": 1527969600,
            "feels_like": 95.31,
            "humidity": 63,
            "pressure": 1013,
            "sunrise": 1527937264,
            "sunset": 1527989325,
            "temp": 87.44,
            "visibility": 10000,
            "weather": [
                {
                    "description": "haze",
                    "icon": "50d",
                    "id": 721,
                    "main": "Haze"
                }
            ],
            "wind_deg": 30,
            "wind_speed": 9.22
        }
    ],
    "lat": 36.0499,
    "lon": -94.1822,
    "timezone": "America/Chicago",
    "timezone_offset": -18000
}


In [10]:
### Save the Dataframe to a csv file witht he results column as a json object
df.to_csv('data/NCAA_D1/ESPN_HR_data_baseball_scrape_with_weather.csv', index=False)

In [None]:
### Save the Dataframe to a csv apfter trying to parse the weather data
df.to_csv('data/NCAA_D1/ESPN_HR_data_baseball_scrape_with_weather_parsed.csv', index=False)