Install and import the Python libraries first

In [None]:
# Install the third-party packages imported below into the active kernel's environment.
%pip install pandas
%pip install numpy

In [None]:
import pandas as pd
import numpy as np

# ETL framework

## Extract

Santa Barbara Airport weather data comes from NOAA's public GHCN-Daily archive (station USW00023190), downloaded over HTTPS as a gzipped CSV.

In [None]:
# Location of the gzipped per-station CSV for station USW00023190.
url = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_station/USW00023190.csv.gz"

# Column names are supplied here, so pandas reads the first line of the file
# as data rather than as a header. low_memory=False makes pandas read the
# whole file in one pass before deciding each column's dtype.
df = pd.read_csv(
    url,
    names=["station", "date", "datatype", "value", "U1", "U2", "U3", "U4"],
    compression="gzip",
    low_memory=False,
)

## Transform

Turn the raw `df` DataFrame into a usable data table: one row per date with the daily high, low, and average temperature in °F, from 1970 onward.

In [None]:
# Reshape the raw long-format observations into one row per date with
# high / low / average temperature columns.
#
# The whole transform is a single method chain that never mutates the raw
# frame column by column; every step returns a new frame.
#
# NOTE(review): the U1-U4 columns are discarded without being inspected. If one
# of them carries a quality flag (see the NOAA GHCN-Daily readme), flagged
# readings are kept in the result - confirm this is intended.
df = (
    # Keep only the daily minimum / maximum temperature observations and the
    # three columns the rest of the pipeline needs.
    df.loc[df["datatype"].isin(["TMIN", "TMAX"]), ["date", "datatype", "value"]]
    .assign(
        # Parse dates with an explicit YYYYMMDD format instead of letting pandas
        # guess the layout: faster, and malformed values raise instead of being
        # silently misread.
        date=lambda obs: pd.to_datetime(obs["date"].astype(str), format="%Y%m%d"),
        # The raw value is divided by 10 and then converted Celsius -> Fahrenheit
        # (x * 1.8 + 32), rounded to whole degrees.
        value=lambda obs: (obs["value"] / 10 * 1.8 + 32).round(0),
    )
    # Restrict the analysis window to 1970-01-01 onward.
    .loc[lambda obs: obs["date"] >= "1970-01-01"]
    # Long -> wide: one TMAX and one TMIN column per date.
    .pivot(index="date", columns="datatype", values="value")
    # Drop dates that are missing either the high or the low reading.
    .dropna()
    # Remove the leftover "datatype" label from the column axis.
    .rename_axis(None, axis=1)
    .reset_index()
    # Average of the (already rounded) daily low and high.
    .assign(TAVG=lambda daily: (daily["TMIN"] + daily["TMAX"]) / 2)
    .rename(
        columns={
            "date": "Date",
            "TMAX": "High Temperature",
            "TMIN": "Low Temperature",
            "TAVG": "Average Temperature",
        }
    )
)

# Excel translations start here

Analyze the transformed DataFrame, `df`

Get first 5 rows

In [None]:
# Preview the top of the table (5 rows is also the pandas default).
df.head(n=5)

Get last 1000 rows

In [None]:
# Select the final 1000 rows of the table.
df.tail(n=1000)

Determine dimensions (number of rows and columns)

In [None]:
# Tuple of (number of rows, number of columns).
df.shape

Get column names

In [None]:
# Index object holding the column labels.
df.columns

Get data types for each column

In [None]:
# Series mapping each column label to its dtype.
df.dtypes