<a href="https://colab.research.google.com/github/LikeRainDay/colab-demo/blob/main/%E7%9F%A5%E8%AF%86%E5%9B%BE%E8%B0%B1/%E7%A4%BE%E6%81%90%E5%88%86%E6%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 介绍

本章主要介绍如何使用 StellarGraph 运行图相关的算法


In [None]:
# Download the terrorism dataset spreadsheet (globalterrorismdb_0919dist.xlsx)
!wget https://gtd.terrorismdata.com/app/uploads/_mediavault/2019/09/globalterrorismdb_0919dist.xlsx


In [None]:
# Install and import the required dependencies

In [None]:
import sys
if 'google.colab' in sys.modules:
  %pip install -q stellargraph[demos]==1.2.1

import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator, GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UniformRandomWalk
from stellargraph.data import UnsupervisedSampler
from sklearn.model_selection import train_test_split

from tensorflow import keras

from stellargraph import globalvar


import pandas as pd
import numpy as np
import networkx as nx

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import random

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%matplotlib inline

import warnings
warnings.filterwarnings('ignore') # 

读取数据集


In [None]:
# Read the spreadsheet from the working directory, which is where the wget
# cell saves it. A relative path (instead of the Colab-only "/content/...")
# keeps the notebook runnable outside Colab as well.
dt_raw = pd.read_excel("globalterrorismdb_0919dist.xlsx")

In [None]:
# Preview the first rows of the raw dataset
dt_raw.head()

导入特征文件内容

In [None]:
from functools import reduce
def _count_events_by_category(input_data, category_column):
    """Count events per (gname, category) and return them as a wide table.

    Args:
        input_data: DataFrame with a ``gname`` column and ``category_column``.
        category_column: name of the categorical column to count.

    Returns:
        DataFrame indexed by ``gname`` with one column per category value.
        Combinations that never occur are 0; the "Unknown" category, when
        present, is removed.
    """
    counts_wide = (
        input_data[["gname", category_column]]
        .groupby(["gname", category_column])[category_column]
        .count()
        .unstack()
        .fillna(0)
    )
    # Not every input contains an "Unknown" category, so do not fail without it.
    return counts_wide.drop(columns=["Unknown"], errors="ignore")


def load_features(input_data):
    """Summarise the raw event table into one feature row per group (gname).

    Args:
        input_data: DataFrame with the columns ``eventid``, ``nperps``,
            ``success``, ``suicide``, ``nkill``, ``nwound``, ``gname``,
            ``attacktype1_txt`` and ``targtype1_txt``. It is not modified.

    Returns:
        DataFrame with ``gname``, ``n_attacks``, ``n_nperp``, ``n_nkil``,
        ``n_nwound``, ``success_ratio`` and one count column per attack type
        and per target type (the "Unknown" categories are dropped).
    """
    # Summarise features by terrorist group.
    # Only the numeric columns are filled with 0; gname is left as is so that
    # rows without a group name do not form an artificial group called 0.
    numeric_columns = ["eventid", "nperps", "success", "suicide", "nkill", "nwound"]
    dt_collect = input_data[numeric_columns + ["gname"]].fillna(
        dict.fromkeys(numeric_columns, 0)
    )
    # Negative perpetrator counts are reset to 0 before summing.
    # .loc on this freshly created frame avoids chained assignment, which
    # pandas may silently ignore.
    dt_collect.loc[dt_collect["nperps"] < 0, "nperps"] = 0

    summarize_by_gname = (
        dt_collect.groupby("gname")
        .agg(
            {
                "eventid": "count",
                "nperps": "sum",
                "nkill": "sum",
                "nwound": "sum",
                "success": "sum",
            }
        )
        .reset_index()
        .rename(
            columns={
                "eventid": "n_attacks",
                "nperps": "n_nperp",
                "nkill": "n_nkil",
                "nwound": "n_nwound",
                "success": "n_success",
            }
        )
    )
    summarize_by_gname["success_ratio"] = (
        summarize_by_gname["n_success"] / summarize_by_gname["n_attacks"]
    )
    summarize_by_gname = summarize_by_gname.drop(columns=["n_success"])

    # Collect counts of each attack type and of each target type
    gname_attacktypes_wide = _count_events_by_category(input_data, "attacktype1_txt")
    gname_targtypes_wide = _count_events_by_category(input_data, "targtype1_txt")

    # Combine all features
    data_frames = [summarize_by_gname, gname_attacktypes_wide, gname_targtypes_wide]
    gnames_features = reduce(
        lambda left, right: pd.merge(left, right, on=["gname"], how="outer"),
        data_frames,
    )
    return gnames_features


# Build the per-group feature table from the raw events and preview it
gnames_features = load_features(input_data=dt_raw)
gnames_features.head()

In [None]:
# Build the NetworkX graph of groups that is later converted to a StellarGraph

def load_network(input_data):
    """Build a graph whose nodes are group names (gname).

    Two kinds of edges are created and then merged into one graph:

    * groups that have events in the same country during the same decade;
    * groups whose events are linked through the ``related`` column.

    Events attributed to the group "Unknown" never produce an edge.

    Args:
        input_data: DataFrame with the columns ``eventid``, ``country_txt``,
            ``iyear``, ``gname`` and ``related``. It is not modified.

    Returns:
        networkx graph combining both edge sets.
    """
    # Create country_decade feature.
    # Work on an explicit copy so that adding columns never targets a slice
    # of the caller's frame.
    dt_collect = input_data[["eventid", "country_txt", "iyear", "gname"]].copy()
    dt_collect["decade"] = (dt_collect["iyear"] // 10) * 10
    dt_collect["country_decade"] = (
        dt_collect["country_txt"] + "_" + dt_collect["decade"].map(str) + "s"
    )
    dt_collect = dt_collect[dt_collect.gname != "Unknown"]

    # Create a country_decade edgelist: self-join on country_decade pairs up
    # every two groups that share a country and decade.
    gnames_country_decade = (
        dt_collect.groupby(["gname", "country_decade"])
        .agg({"eventid": "count"})
        .reset_index()
    )
    gnames_country_decade_edgelist = (
        pd.merge(
            gnames_country_decade,
            gnames_country_decade,
            on="country_decade",
            how="left",
        )
        .drop(columns=["eventid_x", "eventid_y"])
        .rename(columns={"gname_x": "source", "gname_y": "target"})
    )
    # The self-join also pairs each group with itself; drop those rows.
    gnames_country_decade_edgelist = gnames_country_decade_edgelist[
        gnames_country_decade_edgelist.source != gnames_country_decade_edgelist.target
    ]

    G_country_decade = nx.from_pandas_edgelist(
        gnames_country_decade_edgelist, source="source", target="target"
    )

    # Create edgelist from the related column.
    # dropna() returns a new Series; an in-place dropna would alter a column
    # that belongs to the caller's frame.
    related_events = input_data["related"].dropna()
    gname_event_mapping = (
        input_data[["eventid", "gname"]]
        .drop_duplicates()
        .assign(eventid=lambda frame: frame["eventid"].astype(str))
    )

    # Each `related` value is parsed as one adjacency-list line of event ids
    # separated by ", " (attacks that are related).
    G_related = nx.parse_adjlist(related_events.values, delimiter=", ")
    # Remove stray spaces left inside the parsed event ids
    df_related = nx.to_pandas_edgelist(G_related).replace(" ", "", regex=True)

    # Attach the group name of the source event, then of the target event
    df_source_gname = pd.merge(
        df_related,
        gname_event_mapping,
        how="left",
        left_on="source",
        right_on="eventid",
    ).rename(columns={"gname": "gname_source"})
    df_target_gname = pd.merge(
        df_source_gname,
        gname_event_mapping,
        how="left",
        left_on="target",
        right_on="eventid",
    ).rename(columns={"gname": "gname_target"})

    # Filtering and cleaning: keep links between two different, known groups
    keep = (
        (df_target_gname.gname_source != df_target_gname.gname_target)
        & (df_target_gname.gname_source != "Unknown")
        & (df_target_gname.gname_target != "Unknown")
    )
    gnames_relations_edgelist = df_target_gname.loc[
        keep, ["gname_source", "gname_target"]
    ].dropna()

    G_rel = nx.from_pandas_edgelist(
        gnames_relations_edgelist, source="gname_source", target="gname_target"
    )

    # Merging two graphs
    G = nx.compose(G_country_decade, G_rel)

    return G



# Build the group graph from the raw events
G = load_network(input_data=dt_raw)

In [None]:
# nx.info() was removed in NetworkX 3.0, so print the summary explicitly;
# these accessors are available in every NetworkX release.
print(f"Type: {type(G).__name__}")
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [None]:
# Number of connected components in the graph

print(nx.number_connected_components(G))

In [None]:
# One subgraph per connected component, ordered from largest to smallest
subGraph = (G.subgraph(component) for component in nx.connected_components(G))
Gcc = sorted(subGraph, key=len, reverse=True)
# Node count of each component, in the same order
cc_sizes = [component_graph.number_of_nodes() for component_graph in Gcc]
print(cc_sizes)

In [None]:
# Keep only the feature rows of groups that are nodes of G, indexed by group name
is_graph_node = gnames_features["gname"].isin(list(G.nodes()))
filtered_features = gnames_features[is_graph_node].set_index("gname")
filtered_features.shape

In [None]:
# Preview the filtered feature table
filtered_features.head()

In [None]:
# Apply log(1 + x) to every feature value
node_features = filtered_features.transform(np.log1p)

In [None]:
# Sanity check: graph nodes that have no row in node_features
# (e.g. left over because of misspelled group names)
set(G.nodes()).difference(node_features.index)

# Unsupervised graphSAGE 无监督graphSAGE 的功能实现内容

In [None]:
# Create a StellarGraph from the NetworkX graph and the per-node feature table
Gs = sg.StellarGraph.from_networkx(G, node_features=node_features)
print(Gs.info())

In [None]:
# Model hyper-parameters
number_of_walks = 3  # passed to UnsupervisedSampler as number_of_walks
length = 5  # passed to UnsupervisedSampler as length
batch_size = 50  # batch size given to both GraphSAGE generators
epochs = 10  # number of epochs passed to model.fit
num_samples = [20, 20]  # passed to the generators; must have one entry per layer
layer_sizes = [100, 100]  # passed to GraphSAGE as layer_sizes
learning_rate = 1e-2  # learning rate of the Adam optimizer

In [None]:
# Unsupervised sampling: sampler over every node of G, configured with the
# walk length and number of walks defined above
unsupervisedSamples = UnsupervisedSampler(
    Gs, nodes=G.nodes(), length=length, number_of_walks=number_of_walks
)

In [None]:
# Link generator over Gs; train_gen draws its training samples from the unsupervised sampler
generator = GraphSAGELinkGenerator(Gs, batch_size, num_samples)
train_gen = generator.flow(unsupervisedSamples)

In [None]:
# Each GraphSAGE layer needs its own neighbour sample size
assert len(layer_sizes) == len(num_samples)

# GraphSAGE model with biases, no dropout and L2-normalised output
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.0, normalize="l2"
)

In [None]:
# Link prediction: take the model's input/output tensors and add a link
# classification layer with a single sigmoid output on top of the outputs
# NOTE(review): edge_embedding_method="ip" presumably means inner product — confirm in the StellarGraph docs
x_inp, x_out = graphsage.in_out_tensors()

prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

In [None]:
# Keras model mapping the GraphSAGE inputs to the link prediction output
model = keras.Model(inputs=x_inp, outputs=prediction)

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)


In [None]:
# Train the model on the generated link samples; the returned object is kept in `history`
history = model.fit(
    train_gen,
    epochs=epochs,
    verbose=2,
    use_multiprocessing=False,
    workers=1,
    shuffle=True,
)

In [None]:
# Node generator over every node of Gs, used below to compute the embeddings
node_ids = list(Gs.nodes())
node_gen = GraphSAGENodeGenerator(Gs, batch_size, num_samples).flow(node_ids)

In [None]:
# Embedding model built from the trained tensors: every other input tensor
# and the first output tensor
# NOTE(review): presumably x_inp interleaves the inputs of the two nodes of a
# link, so [::2] selects one side — confirm against the StellarGraph docs
embedding_model = keras.Model(inputs=x_inp[::2], outputs=x_out[0])
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)


In [None]:
# Shape of the embedding matrix that is visualised below
node_embeddings.shape

In [None]:
# Reduce the embeddings to two dimensions for plotting (only needed when they
# have more than two columns) and remember how the 2-D coordinates were made.
X = node_embeddings
if X.shape[1] > 2:
    transform = TSNE  # PCA

    trans = transform(n_components=2, random_state=123)
    emb_transformed = pd.DataFrame(trans.fit_transform(X), index=node_ids)
    projection_name = transform.__name__
else:
    # Already two-dimensional: plot the raw coordinates. The title must not
    # rely on `transform`, which is only bound in the branch above.
    emb_transformed = pd.DataFrame(X, index=node_ids)
    projection_name = "Raw"

alpha = 0.7

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(emb_transformed[0], emb_transformed[1], alpha=alpha)
ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
ax.set_title("{} visualization of GraphSAGE embeddings".format(projection_name))
plt.show()

In [None]:
# Colour the 2-D embedding plot by the community assigned to each node
# NOTE(review): infomap_com_dict is not defined anywhere in the visible
# notebook; presumably a node -> community id mapping computed elsewhere.
# Define it before this cell, otherwise this cell raises NameError.
emb_transformed["infomap_clusters"] = emb_transformed.index.map(infomap_com_dict)
plt.scatter(
    emb_transformed[0],
    emb_transformed[1],
    c=emb_transformed["infomap_clusters"],
    cmap="Spectral",
    edgecolors="black",
    alpha=0.3,
    s=100,
)
plt.title("t-sne with colors corresponding to infomap communities")

In [None]:
# Search over DBSCAN hyper-parameters on the node embeddings
# NOTE(review): `utils` is never imported in the visible notebook; presumably
# a local helper module — import it before this cell or it raises NameError.
# The cell below expects the result to have n_noise and n_clusters columns.
db_dt = utils.dbscan_hyperparameters(
    node_embeddings, e_lower=0.1, e_upper=0.9, m_lower=5, m_upper=15
)

In [None]:
# Keep the settings that produce more than one cluster, least noise first.
# Filtering before sorting applies the mask to a frame in the same row order,
# which avoids pandas' "Boolean Series key will be reindexed" warning.
db_dt.loc[db_dt.n_clusters > 1].sort_values(by=["n_noise"])

In [None]:
# Cluster the node embeddings with DBSCAN
# NOTE(review): eps=0.1 / min_samples=5 were presumably picked from the hyper-parameter table above — confirm
db = DBSCAN(eps=0.1, min_samples=5).fit(node_embeddings)

In [None]:
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
# silhouette_score raises ValueError unless the number of distinct labels is
# between 2 and n_samples - 1, so only compute it when it is defined.
n_distinct_labels = len(set(labels))
if 2 <= n_distinct_labels <= len(labels) - 1:
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(node_embeddings, labels))
else:
    print("Silhouette Coefficient: undefined for %d distinct label(s)" % n_distinct_labels)

In [None]:
# Attach the DBSCAN labels to the 2-D coordinates and drop the noise points (label -1)
emb_transformed["dbacan_clusters"] = labels
X = emb_transformed[emb_transformed["dbacan_clusters"] != -1]


# Scatter plot of the remaining points, coloured by DBSCAN cluster
plt.scatter(
    X[0],
    X[1],
    c=X["dbacan_clusters"],
    cmap="Spectral",
    edgecolors="black",
    alpha=0.3,
    s=100,
)
plt.title("t-sne with colors corresponding to dbscan cluster. Without noise points")