In [None]:
# Importing Dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [None]:

# Input CSV locations (relative paths, resolved against the working directory)
stations_csv = "data_files/hawaii_stations.csv"
measurements_csv = "data_files/hawaii_measurements.csv"

In [None]:
# Load both CSVs into DataFrames; the measurement column types are pinned at
# read time, the stations file is left to pandas' type inference.
measurement_dtypes = {"station": "str", "date": "str", "prcp": "float", "tobs": "float"}
hawaii_meas_df = pd.read_csv(measurements_csv, header="infer", dtype=measurement_dtypes)
hawaii_stat_df = pd.read_csv(stations_csv, header="infer")

In [None]:
# Verifying the measurements data and spotting potential cleaning actions:
# include="all" summarises every column, not only the numeric ones, and the
# "count" row shows how many non-missing values each column has.
hawaii_meas_df.describe(include="all")

In [None]:
# Verifying the stations data and spotting potential cleaning actions:
# include="all" summarises every column, not only the numeric ones, and the
# "count" row shows how many non-missing values each column has.
hawaii_stat_df.describe(include="all")

In [None]:

# Index the stations frame by "station" (skipped when "station" is no longer a
# column, e.g. when this cell is re-run), then expose it under its "clean" name.
station_is_column = "station" in hawaii_stat_df.columns
if station_is_column:
    hawaii_stat_df.set_index("station", inplace=True)
clean_hawaii_stat_df = hawaii_stat_df  # same object as hawaii_stat_df, not a copy

In [None]:

# Build the working measurements frame used to average precipitation by
# station and calendar month.
#
# A later cell moves "station" into the index and rebinds hawaii_meas_df to
# that indexed frame. Restore "station" as a column when that has happened so
# this cell can be re-run; the previous fallback branch called
# Index.astype(..., errors=...), which Index.astype does not accept, and was
# unreachable anyway because the column selection below requires "station".
if "station" in hawaii_meas_df.columns:
    meas_source_df = hawaii_meas_df
else:
    meas_source_df = hawaii_meas_df.reset_index()

# Explicit copy: the columns added below must not touch the source frame.
hawaii_meas_avg_df = meas_source_df.loc[:, ["station", "date", "prcp", "tobs"]].copy()
hawaii_meas_avg_df = hawaii_meas_avg_df.astype({"prcp": float, "tobs": float})

# errors="coerce": dates that do not match the format become NaT instead of
# raising, so year/month/day are missing for those rows.
hawaii_meas_avg_df["date"] = pd.to_datetime(
    hawaii_meas_avg_df["date"], format="%Y-%m-%d", errors="coerce"
)
hawaii_meas_avg_df["year"] = hawaii_meas_avg_df["date"].dt.year
hawaii_meas_avg_df["month"] = hawaii_meas_avg_df["date"].dt.month
hawaii_meas_avg_df["day"] = hawaii_meas_avg_df["date"].dt.day

# Grouping key of the form "<station>_<month>".
hawaii_meas_avg_df["station_month"] = (
    hawaii_meas_avg_df["station"].astype(str)
    + "_"
    + hawaii_meas_avg_df["month"].astype(str)
)

# Two-column frame that the averaging step starts from.
avg_prcp_by_month = hawaii_meas_avg_df.loc[:, ["station_month", "prcp"]]

In [None]:

# Average "prcp" per "station_month", ignoring rows with missing precipitation.
# The result is a Series indexed by "station_month".
# It is derived directly from hawaii_meas_avg_df in one non-mutating chain, so
# re-running the cell does not depend on what avg_prcp_by_month currently
# holds (the name used to be a DataFrame before this cell and a Series after).
avg_prcp_by_month = (
    hawaii_meas_avg_df.loc[:, ["station_month", "prcp"]]
    .dropna()
    .groupby(["station_month"])["prcp"]
    .mean()
)

In [None]:
# Fill missing "prcp" values with the average for the same station and month,
# then round the whole column to two decimals.
station_month_avg_prcp = hawaii_meas_avg_df["station_month"].map(avg_prcp_by_month).astype(float)
hawaii_meas_avg_df["prcp"] = hawaii_meas_avg_df["prcp"].fillna(station_month_avg_prcp).round(2)

In [None]:
# Make "station" the index of the working frame unless it is no longer a
# column (which is taken to mean it has already been moved into the index).
if "station" not in hawaii_meas_avg_df.columns:
    print("Index has already been assigned to'station' column.")
else:
    hawaii_meas_avg_df.set_index("station", inplace=True)
    print("Index assigned to 'station' column.")

# From here on hawaii_meas_df is another name for the same, filled and indexed frame.
hawaii_meas_df = hawaii_meas_avg_df

In [None]:
# Keep the enriched measurements (including the added year / month / day /
# station_month columns) in their own DataFrame. .copy() makes this an
# independent object; plain assignment would only create a second name for
# hawaii_meas_avg_df, so later in-place changes to that frame would also show
# up here.
attr_hawaii_meas_df = hawaii_meas_avg_df.copy()

# Index by "station" unless that has already been done upstream.
if "station" in attr_hawaii_meas_df.columns:
    attr_hawaii_meas_df = attr_hawaii_meas_df.set_index("station")

In [None]:
# Final cleansing of the measurement data: make sure "station" is the index,
# then keep only the columns that belong in the cleaned output.
if "station" in hawaii_meas_df.columns:
    hawaii_meas_df.set_index("station", inplace=True)
    print("Index assigned to 'station' column.")
else:
    print("Index has already been assigned to'station' column.")

# Select after indexing so both branches produce the same layout (station as
# the index). Previously the first branch kept "station" as a regular column
# under a default integer index, which changed the shape of the exported CSV.
clean_hawaii_meas_df = hawaii_meas_df.loc[:, ["date", "prcp", "tobs"]]

In [None]:
# Write the cleaned frames into data_files/, with "CLEAN_" prefixed to each
# original file name.
clean_measurements_csv = measurements_csv.replace("data_files/", "data_files/CLEAN_")
clean_stations_csv = stations_csv.replace("data_files/", "data_files/CLEAN_")
clean_hawaii_meas_df.to_csv(clean_measurements_csv)
clean_hawaii_stat_df.to_csv(clean_stations_csv)

In [None]:
# Export the measurements together with the additional fields, for reference
# or later analysis.
attr_measurements_csv = "data_files/measurements_with_additional_attributes.csv"
attr_hawaii_meas_df.to_csv(attr_measurements_csv)