In [None]:
# Read Datasets
import pandas as pd

temperature = pd.read_csv("historical-hourly-weather-data/temperature.csv")
wind = pd.read_csv("historical-hourly-weather-data/wind_speed.csv")
description = pd.read_csv("historical-hourly-weather-data/weather_description.csv")
cities = pd.read_csv("historical-hourly-weather-data/city_attributes.csv")["City"].tolist()

In [None]:
# Count Values
# Just temperature comparison for start, get that working
print("Temperature")
print(temperature.count())
print("Wind")
print(wind.count())
print("Description")
print(description.count())

In [None]:
# Handle Missing Values
drop_cities = ["Vancouver", "San Francisco", "Miami", "New York", "Beersheba", "Tel Aviv District", "Eilat", "Haifa", "Nahariyya", "Jerusalem"]
temperature = temperature.drop(drop_cities, axis=1).fillna(method="bfill")
wind = wind.drop(drop_cities, axis=1).fillna(method="bfill")
description = description.drop(drop_cities, axis=1).fillna(method="bfill")
cities = [city for city in cities if city not in drop_cities]
print("Temperature")
print(temperature.count())
print("Wind")
print(wind.count())
print("Description")
print(description.count())

In [None]:
# Functions

def delta(df, city_a, city_b, index):
    return abs(df[city_a][index] - df[city_b][index])

def weight_description(city_a, city_b, index):
    if description[city_a][index] == description[city_b][index]:
        return 0
    return 5

def difference(city_a, city_b, index):
    d_temperature = delta(temperature, city_a, city_b, index)
    d_wind = delta(wind, city_a, city_b, index)
    d_description = weight_description(city_a, city_b, index)
    return (d_temperature/5) + (d_wind/2) + d_description

def dtw_difference(a, b):
    d_temperature = abs(a["temperature"] - b["temperature"])
    d_wind = abs(a["wind"] - b["wind"])
    if a["description"] == b["description"]:
        d_description = 0
    else:
        d_description = 5
    return (d_temperature/5) + (d_wind/2) + d_description

def compare_linear(city_a, city_b):
    diff = 0
    for i in range(0, len(temperature["datetime"])):
        diff += difference(city_a, city_b, i)
    return diff

In [None]:
# DTW Main
import dtw

city_pairs = []
for i in range(0, len(cities)):
    for j in range(i+1, len(cities)):
        city_pairs.append([cities[i], cities[j]])

# ~15 minutes to run on i9-9980HK
#for pair in city_pairs:
#    pair.append(compare_linear(pair[0], pair[1]))

#print(city_pairs)

In [None]:
# create dictionary for each city, stored as dictionaries
data = {}
for city in cities:
    data[city] = [{"temperature": temperature[city][i], "wind": wind[city][i], "description": description[city][i]} for i in range(0, len(temperature[city])) if i % 24 == 0]

In [None]:
from dtw import dtw
#for pair in sorted(city_pairs, key=lambda x: x[2]):
i = 0.0
for pair in city_pairs:
    print(pair)
    pair.append(dtw(data[pair[0][0:500]], data[pair[1][0:500]], dtw_difference, s=.5))
    i += 1.0
    print(i / 325.0)
print(city_pairs)

In [None]:
from dtw import dtw
dtw(data["Portland"], data["Seattle"], dtw_difference, s=.5)
# Can't solve one in 5 minutes, need to use less data. 1,000 in a few seconds, 2,000 is doable as well, 4500 achieved in 1 minute

In [None]:
for pair in city_pairs:
    if len(pair) == 4:
        pair = [pair[0], pair[1], pair[3]]

In [None]:
for pair in sorted(city_pairs, key=lambda x: x[2]):
    print(pair[0], pair[1])

In [None]:
from fastdtw import fastdtw
fastdtw(data["Portland"], data["Seattle"], dist=dtw_difference)

In [1]:
from dtw import dtw, accelerated_dtw
from fastdtw import fastdtw
import pandas as pd

temperature = pd.read_csv("historical-hourly-weather-data/temperature.csv")
cities = pd.read_csv("historical-hourly-weather-data/city_attributes.csv")["City"].tolist()
drop_cities = ["Vancouver", "San Francisco", "Miami", "New York", "Beersheba", "Tel Aviv District", "Eilat", "Haifa", "Nahariyya", "Jerusalem"]
temperature = temperature.drop(drop_cities, axis=1).fillna(method="bfill")
cities = [city for city in cities if city not in drop_cities]

print("Data Loaded")

def compare_linear(city_a, city_b):
    diff = 0
    for i in range(0, len(temperature["datetime"])):
        diff += abs(city_a[i] - city_b[i])
    return diff

city_pairs = []
for i in range(0, len(cities)):
    for j in range(i+1, len(cities)):
        city_pairs.append([cities[i], cities[j]])

print("Pairs Made")


for pair in city_pairs:
    pair.append(fastdtw(temperature[pair[0]], temperature[pair[1]], radius=120))
    print(pair[0:2])

print("done")
    
print(sorted(city_pairs, key=lambda x: x[-1]))

Data Loaded
Pairs Made
['Portland', 'Seattle']
['Portland', 'Los Angeles']
['Portland', 'San Diego']
['Portland', 'Las Vegas']
['Portland', 'Phoenix']
['Portland', 'Albuquerque']
['Portland', 'Denver']
['Portland', 'San Antonio']
['Portland', 'Dallas']
['Portland', 'Houston']
['Portland', 'Kansas City']
['Portland', 'Minneapolis']
['Portland', 'Saint Louis']
['Portland', 'Chicago']
['Portland', 'Nashville']
['Portland', 'Indianapolis']
['Portland', 'Atlanta']
['Portland', 'Detroit']
['Portland', 'Jacksonville']
['Portland', 'Charlotte']
['Portland', 'Pittsburgh']
['Portland', 'Toronto']
['Portland', 'Philadelphia']
['Portland', 'Montreal']
['Portland', 'Boston']
['Seattle', 'Los Angeles']
['Seattle', 'San Diego']
['Seattle', 'Las Vegas']
['Seattle', 'Phoenix']
['Seattle', 'Albuquerque']
['Seattle', 'Denver']
['Seattle', 'San Antonio']
['Seattle', 'Dallas']
['Seattle', 'Houston']
['Seattle', 'Kansas City']
['Seattle', 'Minneapolis']
['Seattle', 'Saint Louis']
['Seattle', 'Chicago']
['Se

['Jacksonville', 'Toronto']
['Jacksonville', 'Philadelphia']
['Jacksonville', 'Montreal']
['Jacksonville', 'Boston']
['Charlotte', 'Pittsburgh']
['Charlotte', 'Toronto']
['Charlotte', 'Philadelphia']
['Charlotte', 'Montreal']
['Charlotte', 'Boston']
['Pittsburgh', 'Toronto']
['Pittsburgh', 'Philadelphia']
['Pittsburgh', 'Montreal']
['Pittsburgh', 'Boston']
['Toronto', 'Philadelphia']
['Toronto', 'Montreal']
['Toronto', 'Boston']
['Philadelphia', 'Montreal']
['Philadelphia', 'Boston']


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
compare_linear(temperature["Seattle"], temperature["Portland"])

In [None]:
dtw(temperature["Seattle"], temperature["Portland"], dist=lambda x, y: abs(x - y))

In [None]:
accelerated_dtw(temperature["Seattle"][:1900].values, temperature["Portland"][:1900].values, dist="euclidean")

In [None]:
fastdtw(temperature["Seattle"], temperature["Portland"], radius=1)

In [None]:
691321.55 / 45253

from dtw import dtw

def simple_dtw(series_a, series_b, dist=lambda x, y: abs(x - y), warp_weight=1):
    # DTW requires equal-length lists
    if len(series_a) != len(series_b):
        raise IndexError("Lists must be of equal length")
    if warp_weight <= 0:
        raise ValueError("Warp weight must be positive")
    # Initialize Accumulated Cost Matrix
    distances = [[0 for _ in range(len(series_a))] for _ in range(len(series_b))]
    # Initialize first cell
    distances[0][0] = dist(series_a[0], series_b[0])
    # Initialize first column
    for i in range(1, len(series_b)):
        distances[i][0] = distances[i-1][0] * warp_weight + dist(series_a[i], series_b[0])
    # Initialize first row
    for j in range(1, len(series_a)):
        distances[0][j] = distances[0][j-1] * warp_weight + dist(series_a[0], series_b[j])
    # Construct the Accumulated Cost Matrix
    for i in range(1, len(series_a)):
        for j in range(1, len(series_b)):
            distances[i][j] = min(distances[i-1][j-1], distances[i-1][j] * warp_weight, distances[i][j-1] * warp_weight) + dist(series_a[i], series_b[j])
    return distances[-1][-1], distances
    

def compare_linear(series_a, series_b, dist=lambda x, y: abs(x - y)):
    # Linear comparison requires equal-length lists
    if len(series_a) != len(series_b):
        raise IndexError("Lists must be of equal length")
    diff = 0
    for i in range(0, len(series_a)):
        diff += dist(series_a[i], series_b[i])
    return diff

x = [1, 2, 8, 8, 3]
y = [1, 4, 4, 8, 4]

d, cost_matrix, acc_cost_matrix, path = dtw(x, y, dist=lambda x, y: abs(x - y))

s_d, s_acc_cost_matrix = simple_dtw(x, y, dist=lambda x, y: abs(x - y))


print("Point-by-point", compare_linear(x, y))
print(d)
print(acc_cost_matrix)
print(s_d)
for s in s_acc_cost_matrix:
    print(s)

import matplotlib.pyplot as plt

plt.imshow(acc_cost_matrix.T, origin='lower', cmap='gray', interpolation='nearest')
plt.plot(path[0], path[1], 'w')
plt.show()

import random

for _ in range(100):
    x = []
    y = []
    for _ in range(10):
        x.append(random.randint(0, 10))
        y.append(random.randint(0, 10))
    d, cost_matrix, acc_cost_matrix, path = dtw(x, y, dist=lambda x, y: abs(x - y), s=2)
    s_d, s_acc_cost_matrix = simple_dtw(x, y, dist=lambda x, y: abs(x - y), warp_weight=2)
    assert(d == s_d)

print("Passed")


In [2]:
p = sorted(city_pairs, key=lambda x: x[-1])
for pr in p:
    print(pr[0:2])

['Los Angeles', 'San Diego']
['Portland', 'Seattle']
['San Antonio', 'Houston']
['Atlanta', 'Charlotte']
['Detroit', 'Toronto']
['Chicago', 'Detroit']
['Detroit', 'Pittsburgh']
['Indianapolis', 'Pittsburgh']
['San Antonio', 'Dallas']
['Chicago', 'Indianapolis']
['Nashville', 'Atlanta']
['Saint Louis', 'Indianapolis']
['Pittsburgh', 'Philadelphia']
['Kansas City', 'Saint Louis']
['Philadelphia', 'Boston']
['Nashville', 'Charlotte']
['Indianapolis', 'Detroit']
['Dallas', 'Houston']
['Chicago', 'Toronto']
['Toronto', 'Montreal']
['Houston', 'Jacksonville']
['Detroit', 'Boston']
['Toronto', 'Boston']
['Chicago', 'Pittsburgh']
['Indianapolis', 'Philadelphia']
['Saint Louis', 'Nashville']
['Pittsburgh', 'Boston']
['Chicago', 'Boston']
['Pittsburgh', 'Toronto']
['Detroit', 'Philadelphia']
['San Antonio', 'Jacksonville']
['Chicago', 'Philadelphia']
['Saint Louis', 'Philadelphia']
['Minneapolis', 'Detroit']
['Kansas City', 'Indianapolis']
['Indianapolis', 'Boston']
['Detroit', 'Montreal']
['Min