In [1]:
import pandas as pd
import plotly.graph_objects as go

In [11]:
data = pd.DataFrame({
    "temperature":[0.1,0.5,1.0]*4,
    "scores":[0.305, 0.384, 0.382]+[0.327, 0.390, 0.380]+[0.328, 0.395, 0.370]+[0.309, 0.398, 0.373],
    "name":["base"]*3 + ["batch_size=128"]*3 + ["batch_size=128 & <br>layers=(256,128)"]*3 + ["batch_size=512 & <br>layers=(256,128)"]*3
}).sort_values("temperature")

In [12]:
fig = go.Figure(data=[go.Scatter(
    x=data[data["name"]==name]["temperature"],
    y=data[data["name"]==name]["scores"],
    mode='markers+lines',
    name=name,
    marker=dict(size=10)
) for name in data["name"].unique()])
fig.update_layout(
    xaxis_title="Temperature",
    yaxis_title="F1 Score (macro avg)",
    title="Temperature Parameter Tuning",
    legend_title="Parameter Set",
    width=800,
    height=600
)
fig.show()

In [17]:
data = pd.DataFrame({
    "batch_size":[64,128,256,512,1024]*2,
    "scores":[0.385, 0.390, 0.384, 0.383, 0.377]+[0.392, 0.395, 0.395, 0.398, 0.384],
    "name":["temp=0.5"]*5+["temp=0.5 &<br>layers=(256,128)"]*5
}).sort_values("batch_size")

In [18]:
fig = go.Figure(data=[go.Scatter(
    x=data[data["name"]==name]["batch_size"],
    y=data[data["name"]==name]["scores"],
    mode='markers+lines',
    name=name,
    marker=dict(size=10)
) for name in data["name"].unique()])
fig.update_layout(
    xaxis_title="Batch Size",
    yaxis_title="F1 Score (macro avg)",
    title="Batch Size Tuning",
    legend_title="Parameter Set",
    width=800,
    height=600,
    xaxis=dict(tickmode = 'array',tickvals = [64,128,256,512,1024])
)
fig.update_xaxes(type="log")
fig.show()

In [20]:
data = pd.DataFrame({
    "emb_dim":[16,32,64,128,256]*2,
    "scores":[0.375, 0.362, 0.390, 0.381, 0.376]+[0.352, 0.376, 0.398, 0.397, 0.384],
    "name":["temp=0.5 &<br>batch_size=128"]*5+["temp=0.5 &<br>batch_size=512 &<br>layers=(256,128)"]*5
}).sort_values("emb_dim")

In [21]:
fig = go.Figure(data=[go.Scatter(
    x=data[data["name"]==name]["emb_dim"],
    y=data[data["name"]==name]["scores"],
    mode='markers+lines',
    name=name,
    marker=dict(size=10)
) for name in data["name"].unique()])
fig.update_layout(
    xaxis_title="Embedding Dimensionality",
    yaxis_title="F1 Score (macro avg)",
    title="Tuning of Embedding Dimensionality",
    legend_title="Parameter Set",
    width=800,
    height=600,
    xaxis=dict(tickmode = 'array',tickvals = [16,32,64,128,256,512,1024])
)
fig.update_xaxes(type="log")
fig.show()

In [24]:
data = pd.DataFrame({
    "hidden":["(512, 256)","(256, 256)","(256, 128)","(128, 128)","(128, 64)","(256, 256, 256)","(128, 128, 128)","(256, 128, 64)"]*2,
    "scores":[0.390, 0.388, 0.395, 0.390, 0.389, 0.351, 0.379, 0.366]+[0.387, 0.379, 0.398, 0.383, 0.397, 0.381, 0.380, 0.382],
    "name":["temp=0.5 &<br>batch_size=128"]*8+["temp=0.5 &<br>batch_size=512"]*8
})

In [30]:
fig = go.Figure(data=[go.Bar(
    x=data[data["name"]==name]["hidden"],
    y=data[data["name"]==name]["scores"],
    text=data[data["name"]==name]["scores"],
    name=name,
) for name in data["name"].unique()])
fig.update_layout(
    xaxis_title="Hidden Layers",
    yaxis_title="F1 Score (macro avg)",
    title="Encoder Architecture Tuning",
    legend_title="Parameter Set",
    width=800,
    height=600,
    showlegend=True,
    yaxis_range=[0.34,0.4]
)
fig.show()

In [31]:
data = pd.DataFrame({
    "lr":[1e-2,1e-3,1e-4],
    "scores":[0.354,0.398,0.283],
    "name":["temp=0.5 &<br>batch_size=512 &<br>layers=(256,128)"]*3
}).sort_values("lr")

In [32]:
fig = go.Figure(data=[go.Scatter(
    x=data[data["name"]==name]["lr"],
    y=data[data["name"]==name]["scores"],
    mode='markers+lines',
    name=name,
    marker=dict(size=10)
) for name in data["name"].unique()])
fig.update_layout(
    xaxis_title="Learning Rate",
    yaxis_title="F1 Score (macro avg)",
    title="Learning Rate Tuning",
    legend_title="Parameter Set",
    width=800,
    height=600,
    showlegend=True,
    xaxis=dict(tickmode = 'array',tickvals = [1e-2,1e-3,1e-4], tickformat="1.1e")
)
fig.update_xaxes(type="log")
fig.show()

In [77]:
data = pd.DataFrame({
    "hidden":["12 categories, <br>500m squares", "12 categories, <br>1000m squares", "427 categories, <br>500m squares", "427 categories, <br>1000m squares", "427 categories, <br>1000m squares, <br>Worldwide cities"]*2,
    "scores":[0.3539,0.3459,0.3588,0.3815,0.4130]+[0.3512,0.3502,0.3447,0.3707,0.3840],
    "name":["No check-in data"]*5+["With check-in data"]*5
})

In [78]:
fig = go.Figure(data=[go.Bar(
    x=data[data["name"]==name]["hidden"],
    y=data[data["name"]==name]["scores"],
    text=data[data["name"]==name]["scores"],
    textangle=90,
    name=name,
) for name in data["name"].unique()])
fig.update_layout(
    xaxis_title="Dataset",
    yaxis_title="F1 Score (macro avg)",
    title="Testing different datasets",
    # legend_title="Parameter Set",
    width=800,
    height=600,
    showlegend=True,
    yaxis_range=[0.3,0.42]
)
fig.show()

In [189]:
import numpy as np
np.random.seed(0)
import torch
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)
import pandas as pd
from geopy import distance

from sklearn.manifold import TSNE
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics.pairwise import cosine_similarity

from contrastive_learning import Encoder, GridDataset

In [190]:
categories = "lvl2"
square_size = 1000
neighbors = False
check_ins = False
cities = "WORLD"


encoder = Encoder(405 if check_ins else 404,64,(256,128))
encoder.load_state_dict(torch.load(rf"models\encoder_{categories}_cats_{square_size}m{'_neighbors' if neighbors else ''}{'_checkins' if check_ins else ''}{'_'+cities if cities != 'DEFAULT' else ''}.pth"))

org_data = pd.read_csv(
        f"data/squares_{categories}_cats_{square_size}m{'_neighbors' if neighbors else ''}{'_checkins' if check_ins else ''}{'_'+cities if cities != 'DEFAULT' else ''}.csv"
    )
org_data = org_data.loc[:, org_data.columns[6:]]

cities = "DEFAULT"
data = pd.read_csv(
        f"data/squares_{categories}_cats_{square_size}m{'_neighbors' if neighbors else ''}{'_checkins' if check_ins else ''}{'_'+cities if cities != 'DEFAULT' else ''}.csv"
    )
data = data.loc[:, data.columns[6:]]

data = data.assign(**{col: 0 for col in org_data.columns if col not in data.columns})

batch_size = 512

dataset = GridDataset(data)
full_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
train_data, val_data, test_data = torch.utils.data.random_split(
    dataset, [0.7, 0.15, 0.15]
)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



In [191]:
with torch.no_grad():
    embeddings = torch.tensor([])
    labels = []
    for batch_data, batch_labels in test_loader:
        embeddings = torch.cat((embeddings, encoder(batch_data)))
        labels += batch_labels
    labels = torch.tensor(labels)

# SVM classifier
svm_classifier = LinearSVC(class_weight="balanced")
svm_classifier.fit(embeddings.detach().numpy(), labels.numpy())
predictions = svm_classifier.predict(embeddings.detach().numpy())

print(classification_report(labels.numpy(), predictions))
print(f1_score(labels.numpy(), predictions, average="macro"))

              precision    recall  f1-score   support

           0       0.53      0.62      0.57       454
           1       0.33      0.18      0.23       151
           2       0.59      0.46      0.52       454
           3       0.26      0.46      0.33       115

    accuracy                           0.49      1174
   macro avg       0.43      0.43      0.41      1174
weighted avg       0.50      0.49      0.48      1174

0.412953234031247


In [192]:
emb_df = pd.DataFrame(embeddings)
emb_df["label"] = labels
avg_emb = emb_df.groupby("label").mean()

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(avg_emb)

In [193]:
label_mapping = list(full_loader.dataset.label_mapping.keys())
print(
    pd.DataFrame(similarity_matrix, index=label_mapping, columns=label_mapping)
    .to_latex(float_format="%.3f")
)

\begin{tabular}{lrrrr}
\toprule
 & Beijing & Nanjing & Shanghai & Xi’an \\
\midrule
Beijing & 1.000 & 0.689 & 0.861 & 0.576 \\
Nanjing & 0.689 & 1.000 & 0.673 & 0.458 \\
Shanghai & 0.861 & 0.673 & 1.000 & 0.663 \\
Xi’an & 0.576 & 0.458 & 0.663 & 1.000 \\
\bottomrule
\end{tabular}



In [194]:

# Create a heatmap
fig = go.Figure(data=go.Heatmap(
    z=similarity_matrix,
    x=list(label_mapping),
    y=list(label_mapping),
    colorscale='reds',
))
fig.update_layout(
    title="Cosine Similarity between categories",
    width=800,
    height=800,
    yaxis=dict(autorange='reversed')
)
fig.show()

In [195]:
city_df = pd.read_csv("worldcities.csv")

In [196]:
distances = pd.DataFrame([
        [
            distance.distance(city_df[city_df["city"]==a].loc[:,["lat", "lng"]].values,city_df[city_df["city"]==b].loc[:,["lat", "lng"]].values).km
            for b in label_mapping
        ] 
        for a in label_mapping
    ],
    index=label_mapping,
    columns=label_mapping
)

similarities = pd.DataFrame(similarity_matrix, index=label_mapping, columns=label_mapping)

pairs = distances.reset_index().melt(id_vars="index")
pairs = pairs.rename(columns={"index":"city1","variable":"city2","value":"distance"})
pairs["similarity"] = similarities.reset_index().melt(id_vars="index")["value"]
pairs[pairs["city1"]!=pairs["city2"]].corr(numeric_only=True)

Unnamed: 0,distance,similarity
distance,1.0,0.067926
similarity,0.067926,1.0


In [198]:
print(
    distances
    .to_latex(float_format="%.1f")
)

\begin{tabular}{lrrrr}
\toprule
 & Beijing & Nanjing & Shanghai & Xi’an \\
\midrule
Beijing & 0.0 & 896.6 & 1066.7 & 911.5 \\
Nanjing & 896.6 & 0.0 & 271.8 & 949.0 \\
Shanghai & 1066.7 & 271.8 & 0.0 & 1220.8 \\
Xi’an & 911.5 & 949.0 & 1220.8 & 0.0 \\
\bottomrule
\end{tabular}



In [200]:
np.random.seed(0)
torch.manual_seed(0)

cities = "WORLD"
data = pd.read_csv(
        f"data/squares_{categories}_cats_{square_size}m{'_neighbors' if neighbors else ''}{'_checkins' if check_ins else ''}{'_'+cities if cities != 'DEFAULT' else ''}.csv"
    )
data = data.loc[:, data.columns[6:]]

dataset = GridDataset(data)
full_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
train_data, val_data, test_data = torch.utils.data.random_split(
    dataset, [0.7, 0.15, 0.15]
)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [201]:
with torch.no_grad():
    embeddings = torch.tensor([])
    labels = []
    for batch_data, batch_labels in test_loader:
        embeddings = torch.cat((embeddings, encoder(batch_data)))
        labels += batch_labels
    labels = torch.tensor(labels)
label_mapping = full_loader.dataset.label_mapping.keys()

In [None]:
emb_df = pd.DataFrame(embeddings)
emb_df["label"] = labels
avg_emb = emb_df.groupby("label").mean()

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(avg_emb)

# Create a heatmap
fig = go.Figure(data=go.Heatmap(
    z=similarity_matrix,
    x=list(label_mapping),
    y=list(label_mapping),
    colorscale='balance',
    zmin=-1,
    zmax=1
))
fig.update_layout(
    title="Cosine Similarity between categories",
    width=1200,
    height=1200,
    yaxis=dict(autorange='reversed')
)
fig.write_html("similarities.html")
fig.show()

In [203]:
cities_country = [f'{city} ({city_df[city_df["city"]==city].iloc[0]["iso3"]})' for city in list(label_mapping)]
similarities = pd.DataFrame(
    similarity_matrix-np.eye(*similarity_matrix.shape), 
    index=cities_country,
    columns=cities_country
)

In [204]:
print(
    similarities
    .reset_index()
    .melt(id_vars="index")
    .sort_values("value", ascending=False)
    .head(10)
    .iloc[range(0,10,2),:]
    .to_latex(index=False, float_format="%.3f")
)

\begin{tabular}{llr}
\toprule
index & variable & value \\
\midrule
Jakarta (IDN) & Surabaya (IDN) & 0.953 \\
Nagoya (JPN) & Ōsaka (JPN) & 0.936 \\
Delhi (IND) & Bangalore (IND) & 0.900 \\
Ōsaka (JPN) & Tokyo (JPN) & 0.897 \\
Guangzhou (CHN) & Shenzhen (CHN) & 0.877 \\
\bottomrule
\end{tabular}



In [205]:
print(
    similarities
    .reset_index()
    .melt(id_vars="index")
    .sort_values("value", ascending=False)
    .tail(10)
    .iloc[range(0,10,2),:]
    .to_latex(index=False, float_format="%.3f")
)

\begin{tabular}{llr}
\toprule
index & variable & value \\
\midrule
Chattogram (BGD) & Madrid (ESP) & -0.333 \\
Bangkok (THA) & Bangalore (IND) & -0.357 \\
Bangkok (THA) & Delhi (IND) & -0.357 \\
Seoul (KOR) & Lahore (PAK) & -0.389 \\
Mumbai (IND) & Bangkok (THA) & -0.390 \\
\bottomrule
\end{tabular}



In [206]:
distances = pd.DataFrame([
        [
            distance.distance(city_df[city_df["city"]==a].loc[:,["lat", "lng"]].values[0],city_df[city_df["city"]==b].loc[:,["lat", "lng"]].values[0]).km
            for b in label_mapping
        ] 
        for a in label_mapping
    ],
    index=label_mapping,
    columns=label_mapping
)
similarities = pd.DataFrame(similarity_matrix, index=label_mapping, columns=label_mapping)

pairs = distances.reset_index().melt(id_vars="index")
pairs = pairs.rename(columns={"index":"city1","variable":"city2","value":"distance"})
pairs["similarity"] = similarities.reset_index().melt(id_vars="index")["value"]
pairs[pairs["city1"]!=pairs["city2"]].corr(numeric_only=True)

Unnamed: 0,distance,similarity
distance,1.0,-0.255391
similarity,-0.255391,1.0
