In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pickle
from shapely import wkt
import os
import seaborn as sns
import matplotlib

# Font setting: select the sans-serif family and make Arial its only candidate face
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Arial"]  # Arial, Times New Roman

# Distribution of thresholds

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from matplotlib import scale

# Base font size; rcParams is process-wide, so this also applies to figures drawn in later cells
plt.rcParams["font.size"] = 12


def draw_hist(
    df,
    cbg_counts=[6288, 2509, 2183, 972, 1336, 845, 946, 583],
    year="",
    save=False,
    suffix="",
):
    """
    Plot one kernel-density curve per column of ``df`` on a log-scaled x axis.

    The first three columns of ``df`` are skipped. Every remaining column gets
    a KDE curve plus a dashed vertical line at the matching entry of
    ``cbg_counts``; a red dashed reference line is drawn at x = 130.

    :param df: pandas DataFrame; columns from the fourth onward are plotted
        (presumably one column of threshold values per city -- confirm against
        the input CSV). The frame itself is left unmodified.
    :param cbg_counts: sequence of x positions for the dashed lines, one per
        plotted column, in column order. Only read, never mutated.
    :param year: unused; kept so existing calls keep working
    :param save: bool, if True the figure is written to
        "Distribution of k across cities_<suffix>.png" at 500 dpi
    :param suffix: str, inserted into the output file name when ``save`` is True
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Set a color palette
    palette = sns.color_palette()

    # Drop the leading columns on a new frame instead of in place: mutating the
    # caller's DataFrame made a second call on the same frame lose three more
    # columns and plot the wrong data.
    plot_df = df.drop(columns=df.columns[:3])

    for i, col in enumerate(plot_df.columns):
        print(i, col)
        # Colors are taken from the palette in reverse order, starting at index 7
        color = palette[7 - i]
        sns.kdeplot(
            plot_df[col].to_list(), label=col, color=color, ax=ax, bw_adjust=1
        )
        ax.axvline(
            cbg_counts[i],
            linestyle="dashed",
            color=color,
            linewidth=1,
            label=f"{col} CBG count",
        )
    ax.axvline(130, color="r", linestyle="dashed", linewidth=2, label="k=130")

    # Limits are set before switching to the log scale, as in the original order
    ax.set_xlim(5, 10500)
    ax.set_ylim(0, 0.016)
    ax.set_xscale("log")

    ax.set_xlabel("Number of CBG", fontsize=13)
    ax.set_ylabel("Probability density function", fontsize=13)
    ax.legend(loc="upper left", fontsize=11)
    fig.tight_layout()
    if save:
        fig.savefig("Distribution of k across cities_{}.png".format(suffix), dpi=500)
    plt.show()

In [None]:
# Load the percolation-threshold table and plot its per-column distributions
k_result_path = r".\percolation_thresholds_US_2percent.csv"
df = pd.read_csv(k_result_path)
# File name without its directory part or extension; tags the saved figure
file_stem = k_result_path.split("\\")[-1].split(".")[0]
draw_hist(df, save=True, suffix=file_stem)

# CCDF of degree distribution

In [None]:
import numpy as np
import networkx as nx
import geopandas as gpd
import pandas as pd
import networkx as nx


def extract_water_in_NY(
    cbgs_path=r"D:\Dropbox\urban cup\arcgis project\cbgs\cbgs_of_cities\NewYork\New_York_city.shp",
    id_dict_path=r"D:\Dropbox\urban cup\data\Mobility\id_dict_1.pkl",
):
    """
    List the keys of the pickled id dictionary whose value is absent from the
    "CBG_Code" column of the CBG shapefile.

    Judging by the function name, those entries are census block groups that
    lie in water -- TODO confirm. Callers in this notebook use the returned
    keys as row/column indices of a flow matrix, so the keys are presumably
    integer matrix positions -- verify against how the pickle was produced.

    :param cbgs_path: str, path of the shapefile read with geopandas
    :param id_dict_path: str, path of the pickled dictionary
    :return: list of dictionary keys, in dictionary order
    """
    cbgs = gpd.read_file(cbgs_path)

    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    with open(id_dict_path, "rb") as f:
        id_dict = pickle.load(f)

    # A set makes each membership test O(1) instead of scanning the whole column
    known_codes = set(cbgs["CBG_Code"].values)

    # Iterate over (key, value) pairs directly. The previous code used each key
    # as a position in the key list, which is only correct when the keys are
    # exactly 0..n-1 in insertion order.
    return [key for key, code in id_dict.items() if code not in known_codes]


def preprocess_flows(flow_matrix, percentile=2):
    """
    Prune weakly connected nodes from a flow matrix and keep its largest
    strongly connected component.

    Steps:
      1. Zero the diagonal (self-flows) on a copy of the input.
      2. Compute each node's degree as the number of other nodes it exchanges
         a positive flow with in either direction.
      3. Remove every node whose degree is at or below the ``percentile``-th
         percentile of those degrees (ties at the threshold are removed too).
      4. Build a directed graph from the remaining matrix and keep only its
         largest strongly connected component.

    :param flow_matrix: square numpy array, entry [i, j] is the flow from i to j.
        It is not modified.
    :param percentile: int or float, the percentile threshold (default 2)
    :return: float numpy array, the weighted adjacency matrix of the largest
        strongly connected component, with nodes in ascending order of their
        position in the pruned matrix. A (0, 0) array is returned when the
        input is empty or pruning removes every node.
    """
    # Work on a copy: fill_diagonal writes in place and would otherwise alter
    # the caller's matrix.
    flows = np.array(flow_matrix, copy=True)
    if flows.size == 0:
        return np.zeros((0, 0))

    # Set the diagonal elements to zero
    np.fill_diagonal(flows, 0)

    # Binary connectivity; a node's degree counts every distinct neighbour it
    # has a positive flow with, inbound or outbound. (Converting to a graph and
    # back first yields the same matrix, so that round-trip is skipped.)
    flows_binary = (flows > 0).astype(int)
    degrees_all = np.sum(np.logical_or(flows_binary, flows_binary.T), axis=1)

    # Determine the degree threshold
    percentile_threshold = np.percentile(degrees_all, percentile)

    # Find the nodes whose degree is at or below the threshold
    nodes_to_remove = np.where(degrees_all <= percentile_threshold)[0]
    print("Number of nodes to remove: ", len(nodes_to_remove))

    # Remove the nodes from the matrix (rows, then columns)
    reduced_matrix = np.delete(flows, nodes_to_remove, axis=0)
    reduced_matrix = np.delete(reduced_matrix, nodes_to_remove, axis=1)

    # E.g. when all nodes share the same degree, everything is at the threshold
    # and nothing is left; max() below would fail on an empty sequence.
    if reduced_matrix.shape[0] == 0:
        return np.zeros((0, 0))

    # Create directed graph from the reduced matrix
    reduced_G = nx.from_numpy_array(reduced_matrix, create_using=nx.DiGraph)

    # Largest strongly connected component (first one found on ties)
    largest_scc = max(nx.strongly_connected_components(reduced_G), key=len)

    # An explicit sorted nodelist pins the row/column order of the result;
    # without it the order follows the subgraph view's iteration order.
    strongly_connected_G = reduced_G.subgraph(largest_scc)
    processed_flows = nx.to_numpy_array(
        strongly_connected_G, nodelist=sorted(largest_scc)
    )
    return processed_flows


# Example usage
# flows = np.load('path_to_flows.npy')
# processed_flows = preprocess_flows(flows)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import powerlaw
import pickle

plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Arial"]

cities = [
    "New York, NY",
    "Los Angeles, CA",
    "Chicago, IL",
    "Phoenix, AZ",
    "Philadelphia, PA",
    "San Diego, CA",
    "Dallas, TX",
    "San Jose, CA",
]

# Read threshold values
k_values = pd.read_csv(r".\k_select_US_2percent.csv")

cbgs_in_water = extract_water_in_NY()


def _top_k_outflow_mask(flows, k):
    """
    Return an array shaped like ``flows`` holding 1 at the ``k`` largest
    entries of every row and 0 elsewhere.

    A row with fewer than ``k`` non-zero entries keeps only its non-zero
    entries. ``k <= 0`` yields an all-zero mask: slicing with ``[-0:]`` would
    otherwise select the whole row.
    """
    mask = np.zeros_like(flows)
    if k <= 0:
        return mask
    for row_idx in range(flows.shape[0]):
        row = flows[row_idx]
        n_keep = min(k, np.count_nonzero(row))
        if n_keep == 0:
            continue
        mask[row_idx, np.argpartition(row, -n_keep)[-n_keep:]] = 1
    return mask


def _ccdf_points(degrees):
    """
    Return (x, y) for a rank-based CCDF: x is ``degrees`` in ascending order
    and y[i] is the fraction of values ranked above position i.

    The largest value gets y = 0, and tied values get distinct y values.
    """
    ascending = sorted(degrees)
    n = len(ascending)
    return ascending, [(n - 1 - pos) / n for pos in range(n)]


# Year and month of the mobility snapshot to plot
year = 2018
month = 5
# Create a figure with 8 subplots
fig, axs = plt.subplots(4, 2, figsize=(8, 15))
axs = axs.flatten()  # Convert the 2D array of subplots to 1D for easy iteration
colors = ["#eb7c77", "#f0a015", "#329291", "#2c4eaa"]
for city_idx, city in enumerate(cities):

    print(city_idx + 1, city)

    # Look up the k value for this year and month in the CSV table.
    # int() guards against the column being read as float, which cannot be
    # used as a slice bound.
    k = int(
        k_values.loc[
            (k_values["year"] == year) & (k_values["month"] == month), city
        ].values[0]
    )

    flows = np.load(
        r".\data\Mobility\cbg_visit_{year_}-{month_}_{city_}.npy".format(
            year_=year, month_=str(month).zfill(2), city_=city_idx + 1
        )
    )
    if city_idx == 0:
        # First city only: drop the rows/columns returned by extract_water_in_NY()
        flows = np.delete(flows, cbgs_in_water, axis=0)
        flows = np.delete(flows, cbgs_in_water, axis=1)
    flows = preprocess_flows(flows)

    # Binary graphs keeping each row's top-k, top-2k and top-k/2 outflows
    top_k_flows = _top_k_outflow_mask(flows, k)
    top_2k_flows = _top_k_outflow_mask(flows, 2 * k)
    top_k2_flows = _top_k_outflow_mask(flows, k // 2)

    # extract biggest connected component for top_k2_flows
    G = nx.from_numpy_array(top_k2_flows, create_using=nx.DiGraph)
    Gc = max(nx.strongly_connected_components(G), key=len)
    top_k2_flows = nx.to_numpy_array(G.subgraph(Gc))

    # Column sums: number of rows with a (selected) flow into each column
    flows_binary = (flows > 0).astype(int)
    degrees_all = np.sum(flows_binary, axis=0)
    degrees_k = np.sum(top_k_flows, axis=0)
    degrees_2k = np.sum(top_2k_flows, axis=0)
    degrees_k2 = np.sum(top_k2_flows, axis=0)

    # One CCDF series per network variant: (degrees, marker, legend label)
    series = [
        (degrees_all, "o", "All Flows"),
        (degrees_k, "s", "Top-k Outflows"),
        (degrees_2k, "^", "Top-2k Outflows"),
        (degrees_k2, "o", "Top-k/2 Outflows"),
    ]

    # Draw the subplot for this city
    ax = axs[city_idx]
    for (degrees, marker, label), color in zip(series, colors):
        x_vals, y_vals = _ccdf_points(degrees)
        ax.plot(
            x_vals,
            y_vals,
            marker,
            color=color,
            label=label,
            markersize=3,
        )

    ax.set_xlabel("In-degree")
    ax.set_ylabel("Pc(In-degree)")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.tick_params(direction="in", length=6, width=0.5, colors="k", which="both")
    ax.set_title("{city}".format(city=city))
    ax.legend()

plt.tight_layout()
plt.savefig(
    r".\results\ccdf_in_degree_distribution_{year}_{month}_5.png".format(
        year=year, month=month
    ),
    dpi=300,
)
plt.show()

# Temporal trend of k*

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from matplotlib.lines import Line2D

# Inputs for the k-over-time figure
cities = [
    "New York, NY",
    "Los Angeles, CA",
    "Chicago, IL",
    "Phoenix, AZ",
    "Philadelphia, PA",
    "San Diego, CA",
    "Dallas, TX",
    "San Jose, CA",
]
cbg_counts = [6493, 2509, 2183, 972, 1336, 845, 946, 583]
years = range(2018, 2022)
city_list = list(range(1, 9))
month_year_labels = []

palette = sns.color_palette()

import matplotlib.pyplot as plt

k_result_path = r".\k_select_US_newNY_5percent.csv"
df = pd.read_csv(k_result_path, index_col=0)
# Last underscore-separated token of the file name, without the extension
suffix = k_result_path.split("_")[-1].split(".")[0]

# One "YYYY-MM" label for every month of every year in `years`
for year in years:
    for month in range(1, 13):
        month_year_labels.append(f"{year}-{month:02d}")

# Keep only the value columns, then add their row-wise mean and standard deviation
df = df.drop(["year", "month"], axis=1)
row_mean = df.mean(axis=1)
row_std = df.std(axis=1)
df["mean"] = row_mean
df["std"] = row_std

# Create a figure and axis
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(1, 1, 1)

# Draw one line per city; the two names below are never plotted
excluded_cities = ("Houston, TX", "San Antonio, TX")
for city in cities:
    if city in excluded_cities:
        continue
    ax.plot(df[city].to_list(), label=city)

# Axis labels
ax.set_xlabel("Date")
ax.set_ylabel("K")

# The x axis has one position per month label
num_months = len(month_year_labels)
month_positions = range(num_months)

# Draw the mean line with error bars of one standard deviation
ax.errorbar(
    month_positions,
    df["mean"].to_list(),
    yerr=df["std"].to_list(),
    label="Mean",
    color="black",
    linewidth=3,
    elinewidth=1,
    capsize=5,
    capthick=1,
)

# Label every third month
ax.set_xticks(list(range(0, num_months, 3)))
print(month_year_labels)
ax.set_xticklabels(month_year_labels[::3], rotation=45)

ax.legend()
ax.set_ylim(0, 800)
plt.tight_layout()
plt.savefig("k_across_time_with_error_bar_{}.png".format(suffix), dpi=300)
plt.show()

## Depiction of power-law exponents across cities.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors, gridspec
import pandas as pd

df = pd.read_csv(r".\results\power_law_results_in_degree.csv")
sorted_cities = [
    "New York, NY",
    "Los Angeles, CA",
    "Chicago, IL",
    "Philadelphia, PA",
    "Phoenix, AZ",
    "Dallas, TX",
    "San Diego, CA",
    "San Jose, CA",
]

# CBG count per city, keyed by name so each bar is tied to its heatmap row.
# The previous positional list held nine values for eight cities (a stray 929),
# which made barh() fail with a shape mismatch.
cbg_count_by_city = {
    "New York, NY": 6493,
    "Los Angeles, CA": 2509,
    "Chicago, IL": 2183,
    "Philadelphia, PA": 1336,
    "Phoenix, AZ": 972,
    "Dallas, TX": 946,
    "San Diego, CA": 845,
    "San Jose, CA": 583,
}
cbg_counts = [cbg_count_by_city[city] for city in sorted_cities]

# Rows = cities in `sorted_cities` order, columns = (year, month), values = alpha
df["city"] = pd.Categorical(df["city"], categories=sorted_cities, ordered=True)
matrix = df.pivot(index="city", columns=["year", "month"], values="alpha")
print(matrix)
matrix = matrix.reindex(sorted_cities)
matrix_array = matrix.to_numpy()

# Create the figure
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Arial"]
plt.rcParams["font.size"] = 14

# Set TwoSlopeNorm with center=2.7
norm = colors.TwoSlopeNorm(vcenter=2.7)

fig = plt.figure(figsize=(15, 6))
gs = gridspec.GridSpec(1, 3, width_ratios=[5, 0.5, 0.1])  # Adjusting width ratios

# Create subplot for the heatmap
ax1 = fig.add_subplot(gs[0])

# Draw the heatmap using imshow; origin="lower" puts the first city at the bottom
im = ax1.imshow(
    matrix_array, cmap="Spectral_r", aspect="auto", origin="lower", norm=norm
)

# Set y-axis tick labels to city names (state suffix removed)
row_positions = np.arange(len(sorted_cities))
ax1.set_yticks(row_positions)
sorted_cities = [name.split(",")[0] for name in sorted_cities]
ax1.set_yticklabels(sorted_cities, rotation=45)

# Create subplot for the bar chart (a separate axis, aligned manually below)
ax2 = fig.add_subplot(gs[1])

# Draw a horizontal bar chart at the heatmap's row positions and copy its
# y-limits, so every bar sits beside the row of the same city
ax2.barh(row_positions, cbg_counts, color="gray")
ax2.set_ylim(ax1.get_ylim())
ax2.set_xlim([0, max(cbg_counts) * 1.1])  # Add some padding to the right
ax2.set_xlabel("CBG Counts")
ax2.set_xticks(
    ticks=[1000, 3000, 5000], labels=["1k", "3k", "5k"]
)  # Customize x tick labels for ax2
ax2.set_yticks([])  # Hide y ticks for ax2; the heatmap already names the rows

# Set x-axis tick labels. "MS" (month start) gives one entry per month from
# 2018-01 through 2021-12; month-end "M" stopped one month short of the end date.
date_range = pd.date_range(start="2018-01-01", end="2021-12-01", freq="MS")
x_ticks = np.arange(0, len(date_range))
x_labels = [date.strftime("%Y-%m") for date in date_range]
ax1.set_xticks(x_ticks[::3])
ax1.set_xticklabels(x_labels[::3], rotation=45)

# Add a color bar, place it on the far right, and make it narrower
ax_cb = fig.add_subplot(gs[2])
cbar = fig.colorbar(im, cax=ax_cb, label="Alpha Value")

# Display the figure
plt.tight_layout()
plt.savefig("powerlaw_exponents.png", dpi=300)
plt.show()