Install and import the required Python libraries first

In [None]:
%pip install pandas
%pip install numpy

In [None]:
import pandas as pd
import numpy as np

# ETL framework

## extract

Download the daily weather observations for the Santa Barbara Airport station (USW00023190) from NOAA's GHCN-Daily by-station archive

In [None]:
# Download the gzip-compressed station file and load it into a DataFrame.
# Passing `names` supplies the eight column labels, so pandas does not take
# them from the file. The U1-U4 columns are never used later in this notebook
# (presumably GHCN flag / observation-time fields -- confirm against NOAA's
# readme before relying on them).
df = pd.read_csv(
    filepath_or_buffer = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_station/USW00023190.csv.gz",
    names = ['station','date','datatype','value','U1','U2','U3','U4'],
    compression = 'gzip',
    low_memory = False,
)

## transform

Reshape the raw `df` DataFrame into a usable data table: one row per date, with high, low, and average temperature columns

In [None]:
# Turn the long-format station records into one row per day with high, low and
# average temperature columns. Steps, in order:
#   1. parse the date column (cast to str first) into timestamps
#   2. keep only TMIN/TMAX rows dated 1970-01-01 or later, and only the
#      date / datatype / value columns
#   3. divide the reading by 10, apply the Celsius-to-Fahrenheit formula
#      (x * 1.8 + 32) and round to a whole number
#      (presumably the raw values are tenths of a degree C -- confirm)
#   4. pivot so TMAX and TMIN become columns, dropping days missing either one
#   5. add TAVG as the midpoint of TMIN and TMAX, then give the columns
#      display-friendly names
df = (
    df
    .assign(date = lambda t: pd.to_datetime(t["date"].astype(str)))
    .loc[
        lambda t: t["datatype"].isin(["TMIN", "TMAX"]) & (t["date"] >= "1970-01-01"),
        ["date", "datatype", "value"],
    ]
    .assign(value = lambda t: np.round(np.array(t["value"]) / 10 * 1.8 + 32, 0))
    .pivot(index = "date", columns = "datatype", values = "value")
    .dropna()
    .rename_axis(None, axis = 1)   # drop the leftover "datatype" label on the columns axis
    .reset_index()
    .assign(TAVG = lambda t: (t["TMIN"] + t["TMAX"]) / 2)
    .rename(columns = {
        "date": "Date",
        "TMAX": "High Temperature",
        "TMIN": "Low Temperature",
        "TAVG": "Average Temperature",
    })
)

# Create aggregated DataFrame object

Select "Date" and "Average Temperature" columns

In [None]:
# Keep every row, but only the two columns the monthly aggregation needs.
df.loc[:, ["Date", "Average Temperature"]]

Drop NAs from selected columns

In [None]:
# Same two-column selection, minus any row that has a missing value in
# either column.
(
    df[["Date", "Average Temperature"]]
    .dropna()
)

Group the rows by calendar month, using `Date` as the grouping key (each group is labeled with its month-end date), then calculate the mean of Average Temperature for each monthly period

In [None]:
# Monthly mean of the daily average temperature.
# pd.Grouper buckets the rows by the "Date" column at month-end ('ME')
# frequency, so the result is indexed by the last day of each month.
# The `axis` keyword is intentionally not passed: it is deprecated in
# pd.Grouper (FutureWarning on recent pandas) and grouping along the rows is
# already the default.
(
    df[["Date", "Average Temperature"]]
    .dropna()
    .groupby(pd.Grouper(key = "Date", freq = "ME"))
    .mean()
)

Reset index of new grouped DataFrame object

In [None]:
# Monthly mean of the daily average temperature, with the month-end dates
# moved out of the index and back into a regular "Date" column.
# The `axis` keyword is intentionally not passed to pd.Grouper: it is
# deprecated (FutureWarning on recent pandas) and grouping along the rows is
# already the default.
(
    df[["Date", "Average Temperature"]]
    .dropna()
    .groupby(pd.Grouper(key = "Date", freq = "ME"))
    .mean()
    .reset_index()
)

Display most recent 120 monthly periods

In [None]:
# Monthly mean of the daily average temperature, limited to the last 120
# monthly rows of the result.
# The `axis` keyword is intentionally not passed to pd.Grouper: it is
# deprecated (FutureWarning on recent pandas) and grouping along the rows is
# already the default.
(
    df[["Date", "Average Temperature"]]
    .dropna()
    .groupby(pd.Grouper(key = "Date", freq = "ME"))   # 'ME' = month-end buckets
    .mean()
    .reset_index()
    .tail(120)
)

# Set index column to field in DataFrame

Create a table of overall high, low, and average temperatures by California city

In [None]:
# One record per California city: [station, high, low, average].
# The average in each record equals the midpoint of its high and low.
# Units are not stated in this cell (presumably degrees F -- confirm).
temps = [
    ["Bakersfield",   78, 53, 65.5],
    ["Burbank",       77, 55, 66],
    ["Fresno",        77, 51, 64],
    ["Long Beach",    74, 55, 64.5],
    ["Los Angeles",   70, 56, 63],
    ["Sacramento",    74, 48, 61],
    ["San Diego",     71, 57, 64],
    ["San Francisco", 66, 50, 58],
    ["Santa Barbara", 70, 49, 59.5],
]

# NOTE: this rebinds `df`, replacing whatever frame the name held before.
df = pd.DataFrame(
    data = temps,
    columns = ["Station", "High Temperature", "Low Temperature", "Average Temperature"],
)

# Preview up to the first 10 rows (the table has 9, so all are shown).
df.head(10)

In [None]:
# Show the table with "Station" as the row index. set_index returns a new
# frame here; `df` itself is left unchanged because the result is not assigned.
df.set_index(keys = "Station")