# NYC Weather Data 2023 Scraping and Cleaning Notebook
## Lauren Brodsky 

In [7]:
# Packages used:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from calendar import monthrange 

# Scrape data 

In [10]:
# NOTE: the scrape in this cell takes about 25-30 minutes to run.
# ALSO NOTE: a strong, stable internet connection is required for web scraping; otherwise page loads will time out.

# Year to scrape; weather_data() reads this module-level variable to build each date and URL.
year = 2023

def weather_data():
    """Scrape the daily weather summary for every day of ``year``.

    Launches Chrome through Selenium, opens the wunderground.com KLGA
    daily-history page for each calendar day of the module-level ``year``,
    and reads three cells of the summary table by absolute XPath.

    Returns:
        list[tuple]: One ``(year, month, day, temperature, precipitation,
        wind_speed)`` tuple per day that was scraped successfully. The three
        readings are the stripped cell text exactly as shown on the page.
        A day whose page fails to load, or whose cells cannot be located,
        is reported on stdout and skipped.
    """
    # Local import so this cell works without touching the notebook's import cell.
    from selenium.common.exceptions import WebDriverException

    # The three cells live in the same summary table, so the absolute XPath
    # prefix is shared and only the tail differs. Built once, outside the loops.
    summary_table = (
        "/html/body/app-root/app-history/one-column-layout/wu-header/sidenav"
        "/mat-sidenav-container/mat-sidenav-content/div[2]/section/div[2]"
        "/div[1]/div[3]/div[1]/div/lib-city-history-summary/div/div[2]/table"
    )
    temp_xpath = summary_table + "/tbody[1]/tr[3]/td[1]"    # average temperature
    precip_xpath = summary_table + "/tbody[2]/tr/td[1]"     # precipitation
    wind_xpath = summary_table + "/tbody[4]/tr[1]/td[1]"    # wind speed

    options = Options()  # browser options
    options.add_argument("--window-size=1920,1080")  # set window size
    service = Service(ChromeDriverManager().install())  # sets up the matching chromedriver
    driver = webdriver.Chrome(service=service, options=options)  # launches the chrome browser

    # list of per-day result tuples
    data_list = []

    try:
        driver.implicitly_wait(2)  # wait up to 2s for elements to appear

        for month in range(1, 13):  # loop through the 12 months
            days_in_month = monthrange(year, month)[1]  # number of days in this month

            for day in range(1, days_in_month + 1):
                url = f"https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/{year}-{month}-{day}"

                try:
                    # Page load is inside the try so a single timeout skips
                    # one day instead of aborting the whole run.
                    driver.get(url)
                    time.sleep(2)  # waits for the page to load completely

                    temperature = driver.find_element(By.XPATH, temp_xpath).text.strip()
                    precipitation = driver.find_element(By.XPATH, precip_xpath).text.strip()
                    wind_speed = driver.find_element(By.XPATH, wind_xpath).text.strip()
                except WebDriverException as e:
                    # Selenium-level failure (missing element, timeout, ...): report the date and move on
                    print(f"Failed to find data for {year}-{month}-{day}: {e}")  # Says which date
                else:
                    data_list.append((year, month, day, temperature, precipitation, wind_speed))
    finally:
        # Always close the browser, even when an unexpected error escapes the loop.
        driver.quit()

    return data_list

# Run the scraper and keep the rows it returns
collected_data = weather_data()

# Build a DataFrame with named columns, or report that the scrape came back empty
if not collected_data:
    print("No data collected")
else:
    df = pd.DataFrame(
        collected_data,
        columns=[
            'Year', 'Month', 'Day',
            'Avg Temperature', 'Precipitation(in)', 'Wind Speed(mph)',
        ],
    )
    print(df)  # quick visual check that the rows were collected correctly

     Year  Month  Day Avg Temperature Precipitation(in) Wind Speed(mph)
0    2023      1    1           51.77              0.33              18
1    2023      1    2           52.32              0.00              10
2    2023      1    3           49.17              0.02               9
3    2023      1    4           53.79              0.41              14
4    2023      1    5           45.52              0.03              13
..    ...    ...  ...             ...               ...             ...
360  2023     12   27            44.6              0.00              20
361  2023     12   28           49.71              1.31              29
362  2023     12   29            48.7              0.09              17
363  2023     12   30            43.5              0.00              20
364  2023     12   31           42.12              0.00              20

[365 rows x 6 columns]


In [12]:
# Write the raw scraped table to a csv file (no index column) so it can be cleaned next
df.to_csv(path_or_buf='scraped_data.csv', index=False)

# Clean Data

In [15]:
# Combine Year/Month/Day into one date string such as 1/2/23 (data types are fixed later).
# The un-padded month and day come from the datetime components instead of
# strftime('%-m/%-d/%y'): the '%-m' / '%-d' flags are a platform-specific
# extension and raise an error on Windows.
parsed_dates = pd.to_datetime(df[['Year', 'Month', 'Day']])
df['Date'] = (
    parsed_dates.dt.month.astype(str)
    + '/'
    + parsed_dates.dt.day.astype(str)
    + '/'
    + parsed_dates.dt.strftime('%y')  # two-digit year
)

# drop the old columns now that Date carries the same information
df.drop(['Year', 'Month', 'Day'], axis=1, inplace=True)

# save the new df to a new csv file
df.to_csv('formatted_scraped_data.csv', index=False)

In [17]:
# Reload the formatted data from disk and inspect the column data types
df = pd.read_csv(filepath_or_buffer='formatted_scraped_data.csv')
print(df.dtypes)

Avg Temperature      float64
Precipitation(in)    float64
Wind Speed(mph)        int64
Date                  object
dtype: object


In [19]:
# Parse the m/d/yy strings into datetimes; values that do not match the format become NaT
df = df.assign(
    Date=pd.to_datetime(df['Date'], format='%m/%d/%y', errors='coerce')
)
print(df.dtypes)

Avg Temperature             float64
Precipitation(in)           float64
Wind Speed(mph)               int64
Date                 datetime64[ns]
dtype: object


In [21]:
# Count missing values per column  >> none after checking
missing_data = df.isna().sum()
missing_data

Avg Temperature      0
Precipitation(in)    0
Wind Speed(mph)      0
Date                 0
dtype: int64

In [23]:
# Flag rows that repeat an earlier row, then count them  >> none after checking
duplicates = df.duplicated(keep='first')
num_duplicates = duplicates.sum()
num_duplicates

0

# Save the Final Scraped Data

In [26]:
# Write the cleaned table to the final csv file (no index column)
df.to_csv(path_or_buf='final_weather_data_scraped.csv', index=False)