In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from collections import defaultdict
import random
import scipy.sparse as sp

In [2]:
# Define the path to the data
PATH = '/afs/csail.mit.edu/u/l/leihuang/project/HeCo/data/huya/'

Two ways to obtain the feature:
- convert_to_idx: 1D feature (if there are too many different categories like userip)
- one hot: 2D feature

In [3]:
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse_output=False)

In [4]:
# convert the continuous features to discrete features
def convert_to_index(a):
    a_to_index = {}
    index = 0
    result = []

    for e in a:
        if e not in a_to_index:
            a_to_index[e] = index
            index += 1
        result.append(a_to_index[e])

    return a_to_index

In [5]:
# load the data
df = pd.read_csv('/afs/csail.mit.edu/u/l/leihuang/project/HeCo/data/huya/注册数据 - 5k.csv', encoding='latin-1')

In [6]:
print(df.columns)

Index(['num', 'userip', 'mobile', 'link_all_userip_province',
       'link_all_userip_city', 'appver', 'link_all_mobile_province',
       'link_all_mobile_city', 'link_all_mobile_operators', 'terminal',
       'model', 'os_version', 'screenheight', 'screenwidth', 'memory',
       'label_7'],
      dtype='object')


## Obtain the feature

In [7]:
link_all_userip_province = df['link_all_userip_province']
province_feature  = label_encoder.fit_transform(link_all_userip_province)
province_feature = province_feature.reshape(-1, 1)
province_feature = onehot_encoder.fit_transform(province_feature)
print(province_feature)
print(province_feature.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(5558, 31)


In [8]:
link_all_userip_city = df['link_all_userip_city']
city_feature  = label_encoder.fit_transform(link_all_userip_city)
city_feature = city_feature.reshape(-1, 1)
city_feature = onehot_encoder.fit_transform(city_feature)
print(city_feature)
print(city_feature.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(5558, 339)


In [9]:
appver = df['appver']
appver_feature  = label_encoder.fit_transform(appver)
appver_feature = appver_feature.reshape(-1, 1)
appver_feature = onehot_encoder.fit_transform(appver_feature)
print(appver_feature)
print(appver_feature.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(5558, 100)


In [10]:
link_all_mobile_operators = df['link_all_mobile_operators']
mobile_operators_feature  = label_encoder.fit_transform(link_all_mobile_operators)
mobile_operators_feature = mobile_operators_feature.reshape(-1, 1)
mobile_operators_feature = onehot_encoder.fit_transform(mobile_operators_feature)
print(mobile_operators_feature)
print(mobile_operators_feature.shape)

[[1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]]
(5558, 6)


In [11]:
os_version = df['os_version']
os_version_feature  = label_encoder.fit_transform(os_version)
os_version_feature = os_version_feature.reshape(-1, 1)
os_version_feature = onehot_encoder.fit_transform(os_version_feature)
print(os_version_feature)
print(os_version_feature.shape)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
(5558, 95)


In [8]:
ip = df['userip'].unique()
print(len(ip))
ip_dict = convert_to_index(ip)
print(ip_dict)

5479
{'103.8.207.203': 0, '113.243.179.21': 1, '222.140.140.82': 2, '36.5.76.87': 3, '113.78.160.221': 4, '183.227.14.71': 5, '223.104.115.210': 6, '111.29.220.196': 7, '123.147.246.181': 8, '125.85.233.130': 9, '1.182.249.116': 10, '183.208.83.149': 11, '123.183.228.34': 12, '120.245.24.217': 13, '222.173.220.233': 14, '117.171.173.104': 15, '49.118.139.185': 16, '171.90.26.18': 17, '117.140.183.101': 18, '39.144.210.91': 19, '113.83.68.224': 20, '42.91.167.121': 21, '114.246.97.179': 22, '58.100.4.236': 23, '1.204.96.203': 24, '120.243.36.204': 25, '180.162.213.181': 26, '183.198.214.72': 27, '113.13.42.125': 28, '119.123.56.239': 29, '123.101.234.119': 30, '42.92.136.175': 31, '111.122.152.19': 32, '183.46.201.34': 33, '120.231.42.16': 34, '139.204.37.51': 35, '113.8.85.253': 36, '60.14.130.125': 37, '112.37.149.144': 38, '103.242.215.40': 39, '222.189.94.131': 40, '221.225.135.217': 41, '182.118.236.47': 42, '113.58.148.53': 43, '171.94.222.56': 44, '39.144.134.242': 45, '114.138.2

In [9]:
terminal = df['terminal'].unique()
print(len(terminal))
terminal_dict = convert_to_index(terminal)
print(terminal_dict)

2
{'android': 0, 'ios': 1}


In [7]:
# features like this are too many, we need to just use the index to represent them
mobile = df['mobile'].unique()
print(len(mobile))
mobile_dict = convert_to_index(mobile)
print(mobile_dict)

5254
{'0861816762****': 0, '0861552620****': 1, '0861873861****': 2, '0861569566****': 3, '0861767327****': 4, '0861345296****': 5, '0861333932****': 6, '0861528978****': 7, '0861911554****': 8, '0861911278****': 9, '0861394748****': 10, '0861980502****': 11, '0861813203****': 12, '0861516735****': 13, '0861955350****': 14, '0861837032****': 15, '0861869013****': 16, '0861919896****': 17, '0861517751****': 18, '0861539365****': 19, '0861816992****': 20, '0861537902****': 21, '0861368335****': 22, '0861826716****': 23, '0861778541****': 24, '0861994463****': 25, '0861385600****': 26, '0861570313****': 27, '0861911142****': 28, '0861768527****': 29, '0861361987****': 30, '0861891911****': 31, '0861805180****': 32, '0861732593****': 33, '0861986002****': 34, '0861914061****': 35, '0861830456****': 36, '0861380367****': 37, '0861379136****': 38, '0861788528****': 39, '0861776631****': 40, '0861891358****': 41, '0861346274****': 42, '0861303498****': 43, '0861570068****': 44, '0861587872***

#### Obtain the label

In [39]:
label = df['label_7'].tolist()
print(len(label))
label = np.array(label, dtype=np.int32)
np.save(PATH+'labels.npy', label)

5558


### Construct the neighbor

Please edit the name of the neighbour dic, for example, the neighbour of ip should be nei_ip

In [16]:
### ip node nighbour
nei_ip = []

### terminal node neighbour
nei_terminal = []

In [10]:
def get_nei(node, df, node_dict):
    nei = []
    for index, row in df.iterrows():
        nei.append(node_dict[row[node]])
    return nei


In [None]:
nei_ip = get_nei('userip', df, ip_dict)
nei_terminal = get_nei('terminal', df, terminal_dict)

In [11]:
nei_mobile = get_nei('mobile', df, mobile_dict) 
np.save(PATH+'nei_mobile.npy', nei_mobile)


In [17]:
# for index, row in df.iterrows():
#     nei_ip.append(np.array([ip_dict[row['userip']]]))
#     nei_terminal.append(np.array([terminal_dict[row['terminal']]]))


In [19]:
print(nei_ip[:10])
np.save(PATH+'nei_i.npy', nei_ip)
np.save(PATH+'nei_t.npy', nei_terminal)

[array([0]), array([1]), array([2]), array([3]), array([4]), array([5]), array([6]), array([7]), array([8]), array([9])]


In [20]:
nei_ip = np.load(PATH+'nei_i.npy')
print(nei_ip[:10])

[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]


In [49]:
feat_account = np.concatenate((province_feature, city_feature, appver_feature, mobile_operators_feature, os_version_feature), axis=1)
print(feat_account.shape) 

saprse_feat_account = sp.csr_matrix(feat_account)
sp.save_npz(PATH+'feat_a.npz', saprse_feat_account)

(5558, 571)


In [18]:
import socket
import struct

def ip_to_int(ip):
    return struct.unpack("!I", socket.inet_aton(ip))[0]


# 转换为整数,可以选择这种方式，相当于直接使用它的value，也可以选择前面写的convert_to_index函数直接使用或者搭配上onehotencoder
ip_int_list = [ip_to_int(i) for i in ip]

print(ip_int_list)

feat_ip = np.array(ip_int_list).reshape(-1, 1)



[1728630731, 1911796501, 3733752914, 604326999, 1900978397, 3085110855, 3748164562, 1864228036, 2073294517, 2102782338, 28768628, 3083883413, 2075649058, 2029328601, 3735936233, 1974185320, 829852601, 2874808850, 1972156261, 663802459, 1901282528, 710649721, 1928749491, 979633388, 30171339, 2029200588, 3030570421, 3083261512, 1896688253, 2004564207, 2070276727, 710707375, 1870305299, 3073296674, 2028415504, 2345411891, 1896371709, 1007583869, 1881511312, 1743968040, 3736952451, 3722545113, 3061247023, 1899664437, 2875121208, 663783154, 1921703895, 3659480905, 3748166420, 1909899395, 719614440, 1863305778, 2075747909, 1743968189, 1881983892, 1851048133, 1975374765, 2111008537, 1710838701, 1919498591, 3670644780, 3748184852, 1882437422, 3748177318, 2028150838, 1971859087, 1879687919, 826684590, 3748150483, 3722319226, 1993730088, 462770940, 3748042359, 1866233538, 2028312490, 1864192124, 3662086005, 1946818201, 2871592987, 1904258422, 3085035765, 465460378, 1881779315, 3684175825, 465248

In [19]:
feat_terminal = label_encoder.fit_transform(nei_terminal)
feat_terminal = feat_terminal.reshape(-1, 1)
feat_terminal = onehot_encoder.fit_transform(feat_terminal)
print(feat_terminal)
print(feat_terminal.shape)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]
(5558, 2)


In [14]:
feat_mobile = np.array(nei_mobile).reshape(-1, 1)
print(feat_mobile.shape)

(5558, 1)


In [41]:
# np.save(PATH+'feat_i.npy', feat_ip)
# np.save(PATH+'feat_t.npy', feat_terminal)

In [15]:
accounts = df['num'].unique()
account_index = {account: idx for idx, account in enumerate(accounts)}
terminal = df['terminal'].unique()
terminal_index = {terminal: idx for idx, terminal in enumerate(terminal)}
aia = np.zeros((len(accounts), len(accounts)), dtype=int) #account-ip-account meta path
ata = np.zeros((len(accounts), len(accounts)), dtype=int) #account-terminal-account meta path
ama = np.zeros((len(accounts), len(accounts)), dtype=int) #account-mobile-account meta path
ip_to_accounts = defaultdict(list)
terminal_to_accounts = defaultdict(list)
mobile_to_accounts = defaultdict(list)
for _, row in df.iterrows():
    ip_to_accounts[row['userip']].append(row['num'])
    terminal_to_accounts[row['terminal']].append(row['num'])
    mobile_to_accounts[row['mobile']].append(row['num'])

for accounts in ip_to_accounts.values():
    for i in range(len(accounts)):
        for j in range(i + 1, len(accounts)):
            idx_i = account_index[accounts[i]]
            idx_j = account_index[accounts[j]]
            aia[idx_i][idx_j] += 1
            aia[idx_j][idx_i] += 1

for accounts in terminal_to_accounts.values():
    for i in range(len(accounts)):
        for j in range(i + 1, len(accounts)):
            idx_i = account_index[accounts[i]]
            idx_j = account_index[accounts[j]]
            ata[idx_i][idx_j] += 1
            ata[idx_j][idx_i] += 1

for accounts in mobile_to_accounts.values():
    for i in range(len(accounts)):
        for j in range(i + 1, len(accounts)):
            idx_i = account_index[accounts[i]]
            idx_j = account_index[accounts[j]]
            ama[idx_i][idx_j] += 1
            ama[idx_j][idx_i] += 1

print(aia.shape)
print(ata.shape)
print(ama.shape)

(5558, 5558)
(5558, 5558)
(5558, 5558)


In [16]:
sparse_aia = sp.csr_matrix(aia)
sparse_ata = sp.csr_matrix(ata)
sparse_ama = sp.csr_matrix(ama)
sp.save_npz(PATH+'aia.npz', sparse_aia)
sp.save_npz(PATH+'ata.npz', sparse_ata)
sp.save_npz(PATH+'ama.npz', sparse_ama)

!!!! 后面的要相应替换一下，把不用的注释掉,比如我下面这里注释掉了ata

In [17]:
def get_meta_path_neighbors(df, column):
    # 创建一个字典，键为中间节点（IP或Terminal），值为访问该节点的账户列表
    node_to_accounts = defaultdict(list)
    for _, row in df.iterrows():
        node_to_accounts[row[column]].append(row['num'])

    # 创建一个字典，键为账户，值为通过中间节点连接的其他账户及其出现次数
    account_meta_path = defaultdict(lambda: defaultdict(int))
    for accounts in node_to_accounts.values():
        for i in range(len(accounts)):
            for j in range(len(accounts)):
                if i != j:
                    account_meta_path[accounts[i]][accounts[j]] += 1

    return account_meta_path

# 获取 account-ip-account 的邻居统计
aia_neighbors = get_meta_path_neighbors(df, 'userip')

# # 获取 account-terminal-account 的邻居统计
# ata_neighbors = get_meta_path_neighbors(df, 'terminal')

# # 获取 account-mobile-account 的邻居统计
ama_neighbors = get_meta_path_neighbors(df, 'mobile')

# 打印结果
print("Account-IP-Account Neighbors:")
for account, neighbors in aia_neighbors.items():
    print(f"Account {account}: {dict(neighbors)}")

# print("\nAccount-Terminal-Account Neighbors:")
# for account, neighbors in ata_neighbors.items():
#     print(f"Account {account}: {dict(neighbors)}")

print("\nAccount-Mobile-Account Neighbors:")
for account, neighbors in ama_neighbors.items():
    print(f"Account {account}: {dict(neighbors)}")

Account-IP-Account Neighbors:
Account 37: {2170: 1}
Account 2170: {37: 1}
Account 188: {5298: 1}
Account 5298: {188: 1}
Account 246: {1085: 1}
Account 1085: {246: 1}
Account 262: {1805: 1}
Account 1805: {262: 1}
Account 308: {3535: 1}
Account 3535: {308: 1}
Account 317: {4588: 1}
Account 4588: {317: 1}
Account 319: {2888: 1}
Account 2888: {319: 1}
Account 347: {4840: 1}
Account 4840: {347: 1}
Account 359: {3771: 1, 5049: 1}
Account 3771: {359: 1, 5049: 1}
Account 5049: {359: 1, 3771: 1}
Account 438: {1532: 1}
Account 1532: {438: 1}
Account 467: {1191: 1, 5524: 1}
Account 1191: {467: 1, 5524: 1}
Account 5524: {467: 1, 1191: 1}
Account 586: {4833: 1}
Account 4833: {586: 1}
Account 605: {619: 1, 1554: 1}
Account 619: {605: 1, 1554: 1}
Account 1554: {605: 1, 619: 1}
Account 607: {1512: 1, 2223: 1, 2261: 1, 3125: 1, 3894: 1, 4693: 1}
Account 1512: {607: 1, 2223: 1, 2261: 1, 3125: 1, 3894: 1, 4693: 1}
Account 2223: {607: 1, 1512: 1, 2261: 1, 3125: 1, 3894: 1, 4693: 1}
Account 2261: {607: 1, 

In [18]:
combined_neighbors = defaultdict(lambda: defaultdict(int))

for account, neighbors in aia_neighbors.items():
    for neighbor, count in neighbors.items():
        combined_neighbors[account][neighbor] += count

# for account, neighbors in ata_neighbors.items():
#     for neighbor, count in neighbors.items():
#         combined_neighbors[account][neighbor] += count

for account, neighbors in ama_neighbors.items():
    for neighbor, count in neighbors.items():
        combined_neighbors[account][neighbor] += count

# 将结果转换为列表并排序
sorted_neighbors = {}
for account, neighbors in combined_neighbors.items():
    sorted_neighbors[account] = sorted(neighbors.items(), key=lambda item: item[1], reverse=True)

# 打印结果
print("Combined Meta Path Neighbors (sorted by count):")
for account, neighbors in sorted_neighbors.items():
    print(f"Account {account}: {neighbors}")

Combined Meta Path Neighbors (sorted by count):
Account 37: [(2170, 1)]
Account 2170: [(37, 1)]
Account 188: [(5298, 1)]
Account 5298: [(188, 1)]
Account 246: [(1085, 1)]
Account 1085: [(246, 1), (120, 1)]
Account 262: [(1805, 1)]
Account 1805: [(262, 1)]
Account 308: [(3535, 1)]
Account 3535: [(308, 1)]
Account 317: [(4588, 1), (492, 1)]
Account 4588: [(317, 1)]
Account 319: [(2888, 1)]
Account 2888: [(319, 1), (2261, 1)]
Account 347: [(4840, 1), (690, 1)]
Account 4840: [(347, 1)]
Account 359: [(3771, 1), (5049, 1)]
Account 3771: [(359, 1), (5049, 1)]
Account 5049: [(359, 1), (3771, 1)]
Account 438: [(1532, 1)]
Account 1532: [(438, 1)]
Account 467: [(1191, 1), (5524, 1), (2720, 1)]
Account 1191: [(467, 1), (5524, 1), (5382, 1)]
Account 5524: [(467, 1), (1191, 1), (2825, 1), (3212, 1), (3599, 1), (5472, 1)]
Account 586: [(4833, 1)]
Account 4833: [(586, 1)]
Account 605: [(619, 1), (1554, 1)]
Account 619: [(605, 1), (1554, 1)]
Account 1554: [(605, 1), (619, 1)]
Account 607: [(1512, 1), (

In [19]:
count=0
most_n = 0
n_list = []
for account, neighbors in sorted_neighbors.items():
    _, most_c = neighbors[0]
    n_n = len(neighbors)
    if n_n>most_n:
        most_n = n_n
    if most_c>1:
        count+=1
    n_list.append(n_n)

# print(count)
print(most_n)
print(np.mean(n_list))
print(np.min(n_list))

17
3.514705882352941
1


In [20]:
accounts = list(combined_neighbors.keys())
account_index = {account: idx for idx, account in enumerate(accounts)}

# 初始化零矩阵
n = len(accounts)
adj_matrix = np.zeros((n, n), dtype=int)

# 填充邻接矩阵
for account, neighbors in combined_neighbors.items():
    for neighbor, count in neighbors.items():
        i = account_index[account]
        j = account_index[neighbor]
        adj_matrix[i, j] = count

# 打印邻接矩阵
print("Adjacency Matrix:")
print(adj_matrix)

Adjacency Matrix:
[[0 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]]


In [21]:
adj_matrix[adj_matrix > 0] = 1
pos = adj_matrix
print(pos.shape)

(544, 544)


In [47]:
sparse_matrix = sp.csr_matrix(pos)
sp.save_npz(PATH+'pos.npz', sparse_matrix)

In [37]:
positive_idx = []
negative_idx = []

for i, l in enumerate(label):
    if l == 0:
        negative_idx.append(i)
    else:
        positive_idx.append(i)

print(len(positive_idx))
print(len(negative_idx))

random.seed(0)
train_idx = random.sample(positive_idx, int(len(positive_idx) * 0.7)) + random.sample(negative_idx, int(len(negative_idx) * 0.7))
val_test_idx = list(set(range(len(label))) - set(train_idx))
val_idx = random.sample(val_test_idx, len(val_test_idx) // 2)
test_idx = list(set(val_test_idx) - set(val_idx))

print(len(train_idx))
print(val_idx)



587
4971
3889
[2149, 1287, 4338, 790, 4192, 4588, 4711, 1665, 2339, 5446, 2662, 5226, 689, 1884, 5272, 2580, 3748, 2382, 218, 4357, 2210, 1572, 2626, 230, 5150, 2834, 3633, 1845, 5301, 5447, 1087, 1795, 303, 5076, 1826, 4922, 516, 2225, 4433, 2804, 51, 5267, 4643, 1821, 1533, 4481, 5303, 4844, 1217, 1745, 2469, 3547, 4335, 774, 1159, 3388, 1338, 3823, 4126, 4007, 493, 2731, 1950, 4740, 5402, 279, 1777, 39, 207, 4104, 1072, 2194, 1164, 2702, 1599, 3940, 3693, 1328, 215, 4387, 3667, 5130, 5552, 3061, 3688, 4499, 3030, 4004, 4200, 5415, 2297, 2443, 794, 118, 2820, 2428, 1272, 3288, 1553, 4919, 4651, 4992, 4354, 5101, 4932, 4310, 2567, 4045, 106, 4558, 1283, 4169, 5202, 3378, 2769, 5156, 2615, 1111, 1036, 4645, 2900, 2538, 2998, 5006, 4915, 5281, 5102, 1450, 1740, 430, 4784, 5311, 3831, 27, 2047, 5275, 1264, 671, 5473, 1721, 1195, 2656, 1773, 3535, 4412, 5472, 2146, 3640, 4959, 2858, 3349, 3148, 807, 1722, 4095, 2123, 1774, 5122, 4918, 1948, 2505, 4118, 3480, 168, 3406, 4420, 5001, 3422, 1

In [48]:
np.save(PATH+'train_70.npy', np.array(train_idx, dtype=np.int32))
np.save(PATH+'val_70.npy', np.array(val_idx, dtype=np.int32))
np.save(PATH+'test_70.npy', np.array(test_idx, dtype=np.int32))