Install and import the Python libraries first

In [None]:
%pip install pandas
%pip install numpy
%pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

ETL: load the daily station data from NOAA, convert the units, and reshape it to one row per date

In [None]:
# ---- Configuration ----------------------------------------------------------
# Gzip-compressed CSV for a single station; the file is read without a header
# row, so the column names are supplied here. U1-U4 are not used below
# (presumably the flag / observation-time fields -- confirm against NOAA's readme).
GHCN_URL = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_station/USW00023190.csv.gz"
GHCN_COLUMNS = ['station','date','datatype','value','U1','U2','U3','U4']
KEPT_DATATYPES = ["TMIN","TMAX","PRCP"]
START_DATE = "1970-01-01"
N_RECENT_ROWS = 1000 # to match with Objects in Excel

# ---- Extract ----------------------------------------------------------------
df = pd.read_csv(GHCN_URL,
                 compression = 'gzip',
                 names = GHCN_COLUMNS,
                 low_memory = False)

# ---- Transform --------------------------------------------------------------
# Parse the date column with an explicit YYYYMMDD format instead of relying on
# pandas' format inference.
df["date"] = pd.to_datetime(df["date"].astype(str), format = "%Y%m%d")

# Keep only the three measurements of interest from START_DATE onwards.
# .copy() makes the selection an independent frame before a column is overwritten.
keep_rows = df["datatype"].isin(KEPT_DATATYPES) & (df["date"] >= START_DATE)
df = df.loc[keep_rows, ["date", "datatype", "value"]].copy()

# Unit conversion, chosen per row by datatype:
#   PRCP      -> value / 100 / 2.54, rounded to 2 decimals
#   TMIN/TMAX -> value / 10 * 1.8 + 32 (Celsius-to-Fahrenheit formula), rounded to a whole number
# NOTE(review): this assumes raw values are tenths of mm / tenths of deg C -- confirm against NOAA's readme.
raw_values = df["value"].to_numpy()
is_precipitation = df["datatype"].to_numpy() == "PRCP"
df["value"] = np.where(is_precipitation,
                       np.round(raw_values / 100 / 2.54, 2),
                       np.round(raw_values / 10 * 1.8 + 32, 0))

# Long -> wide: one row per date, one column per datatype. Dates missing any of
# the three measurements are dropped.
df = (
    df.pivot(index = "date", columns = "datatype", values = "value")
      .dropna()
      .rename_axis(None, axis = 1)
      .reset_index()
)

# Average temperature is the midpoint of the (already rounded) low and high.
df["TAVG"] = (df["TMIN"] + df["TMAX"]) / 2
df = df.rename(columns = {"date": "Date", "TMAX": "High Temperature", "TMIN": "Low Temperature", "TAVG": "Average Temperature", "PRCP": "Rainfall"})
df = df.tail(N_RECENT_ROWS)

Store Average Temperature in NumPy array object

In [None]:
# DBSCAN expects a 2-D (n_samples, n_features) input, so the single
# temperature column is copied out and given a trailing feature axis.
avg_temperature = df["Average Temperature"].to_numpy(copy=True)
X = avg_temperature[:, np.newaxis]
print(X)

Run the anomaly detection algorithm (DBSCAN): it groups the values into clusters, and points that fall outside every cluster are treated as anomalies

In [None]:
# Cluster the values in X with DBSCAN. eps and min_samples control the
# neighbourhood radius and the density required to form a cluster; samples
# that end up in no cluster receive the label -1 (see scikit-learn's DBSCAN docs).
detector = DBSCAN(eps=2, min_samples=14)
clusters = detector.fit(X).labels_
print(clusters)

Check object type for cluster output

In [None]:
# Show the Python type of the `clusters` object.
type(clusters)

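Not an anomaly: label 0 or higher. DBSCAN numbers the clusters it finds starting at 0, so the maximum label below is 0 when every non-anomalous point falls in a single cluster.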

In [None]:
# Largest label present in `clusters`.
clusters.max()

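Anomaly: -1. DBSCAN assigns the label -1 to points that belong to no cluster; the minimum label below is -1 if any such points exist.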

In [None]:
# Smallest label present in `clusters`.
clusters.min()

Add new column to flag anomalies

In [None]:
# Attach the label computed for each row as a new "Anomaly" column.
# `clusters` was produced from df["Average Temperature"], so it lines up with
# df row by row. assign() returns a new frame rather than writing into the
# existing one. Note the column holds the raw labels, not a boolean flag.
df = df.assign(Anomaly = clusters)
# Preview the last five rows, including the new column.
df.tail()