In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from collections import defaultdict
import random
import scipy.sparse as sp

In [3]:
# Define the path to the data
PATH = '/afs/csail.mit.edu/u/l/leihuang/project/HeCo/data/huya_1w/'

Two ways to obtain the feature:
- convert_to_idx: 1D feature (if there are too many different categories like userip)
- one hot: 2D feature

In [4]:
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse_output=False)

In [5]:
# convert the continuous features to discrete features
def convert_to_index(a):
    a_to_index = {}
    index = 0
    result = []

    for e in a:
        if e not in a_to_index:
            a_to_index[e] = index
            index += 1
        result.append(a_to_index[e])

    return a_to_index

In [6]:
# load the data
df = pd.read_csv('/afs/csail.mit.edu/u/l/leihuang/project/HeCo/data/huya/注册数据-1w+.csv', encoding='latin-1')
df['num'] = range(len(df))

In [7]:
print(df.columns)

Index(['Unnamed: 0', 'time', 'event', 'userip', 'link_all_userip_country',
       'link_all_userip_province', 'link_all_userip_city', 'devicename',
       'appid', 'appver', 'user_agent', 'page', 'behavior', 'account_type',
       'mobile', 'link_all_mobile_country', 'link_all_mobile_province',
       'link_all_mobile_city', 'link_all_mobile_operators', 'terminal', 'emu',
       'serial', 'fingerprint', 'model', 'brand', 'os_version', 'sdk_version',
       'memory', 'extennalavail', 'externaltotal', 'internalavail',
       'internaltotal', 'disktotal', 'diskavailable', 'batterlevel',
       'screenheight', 'screenwidth', 'canvas', 'timezone', 'language',
       'fonts', 'plugins', 'screenresolution', 'system_name', 'cpu_des',
       'cpu_name', 'diskinfo', 'network_product', 'video_product',
       'board_product', 'totalvirtualmemorysize', 'numberofprocesses',
       'freephysicalmemory', 'totalvisiblememorysize', 'machine',
       'user_action', 'applist', 'dt', 'label_1', 'label_7',

## Obtain the feature

In [8]:
link_all_userip_province = df['link_all_userip_province']
province_feature  = label_encoder.fit_transform(link_all_userip_province)
province_feature = province_feature.reshape(-1, 1)
province_feature = onehot_encoder.fit_transform(province_feature)
print(province_feature)
print(province_feature.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(11196, 58)


In [9]:
link_all_userip_city = df['link_all_userip_city']
city_feature  = label_encoder.fit_transform(link_all_userip_city)
city_feature = city_feature.reshape(-1, 1)
city_feature = onehot_encoder.fit_transform(city_feature)
print(city_feature)
print(city_feature.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(11196, 339)


In [10]:
appver = df['appver']
appver_feature  = label_encoder.fit_transform(appver)
appver_feature = appver_feature.reshape(-1, 1)
appver_feature = onehot_encoder.fit_transform(appver_feature)
print(appver_feature)
print(appver_feature.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(11196, 224)


In [11]:
link_all_mobile_operators = df['link_all_mobile_operators']
mobile_operators_feature  = label_encoder.fit_transform(link_all_mobile_operators)
mobile_operators_feature = mobile_operators_feature.reshape(-1, 1)
mobile_operators_feature = onehot_encoder.fit_transform(mobile_operators_feature)
print(mobile_operators_feature)
print(mobile_operators_feature.shape)

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
(11196, 7)


In [12]:
os_version = df['os_version']
os_version_feature  = label_encoder.fit_transform(os_version)
os_version_feature = os_version_feature.reshape(-1, 1)
os_version_feature = onehot_encoder.fit_transform(os_version_feature)
print(os_version_feature)
print(os_version_feature.shape)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]]
(11196, 119)


In [13]:
ip = df['userip'].unique()
print(len(ip))
ip_dict = convert_to_index(ip)
print(ip_dict)

6810
{'182.118.237.129': 0, '112.51.183.46': 1, '223.104.165.166': 2, '121.230.76.74': 3, '117.136.46.143': 4, '111.29.80.124': 5, '113.57.182.98': 6, '116.10.22.153': 7, '183.39.33.112': 8, '27.190.92.154': 9, '112.39.10.251': 10, '112.21.143.86': 11, '222.139.108.32': 12, '113.71.249.125': 13, '117.136.12.203': 14, '39.144.218.177': 15, '117.136.12.149': 16, '120.242.30.3': 17, '218.77.18.59': 18, '39.154.106.252': 19, '60.180.233.105': 20, '171.222.77.35': 21, '27.128.107.52': 22, '106.7.138.97': 23, '112.252.2.204': 24, '114.217.165.32': 25, '112.224.22.9': 26, '222.209.102.91': 27, '112.48.42.168': 28, '119.41.192.45': 29, '116.116.82.146': 30, '61.146.77.186': 31, '120.228.187.58': 32, '183.230.249.87': 33, '114.234.32.14': 34, '153.0.2.89': 35, '222.209.164.161': 36, '119.179.165.78': 37, '171.41.91.41': 38, '219.133.100.245': 39, '121.26.8.118': 40, '144.12.103.170': 41, '42.100.225.132': 42, '125.86.106.254': 43, '223.104.148.56': 44, '118.254.23.101': 45, '112.47.179.226': 46

In [14]:
terminal = df['terminal'].unique()
print(len(terminal))
terminal_dict = convert_to_index(terminal)
print(terminal_dict)

5
{nan: 0, 'ios': 1, 'android': 2, 'pc': 3, 'WEB': 4}


In [15]:
# features like this are too many, we need to just use the index to represent them
mobile = df['mobile'].unique()
print(len(mobile))
mobile_dict = convert_to_index(mobile)
print(mobile_dict)

5533
{'0861378239****': 0, '0861825960****': 1, '0861785887****': 2, '0861626486****': 3, '0861572065****': 4, '0861702640****': 5, '0861822291****': 6, '0861752755****': 7, '0861897775****': 8, '0861800172****': 9, '0861863342****': 10, '0861702680****': 11, '0861702045****': 12, '0861394056****': 13, '0861570615****': 14, '0861702079****': 15, '0861622395****': 16, '0861523881****': 17, '0861700065****': 18, '0861678545****': 19, '0861343545****': 20, '0861347976****': 21, '0861362847****': 22, '0861599240****': 23, '0861815427****': 24, '0861820667****': 25, '0861804787****': 26, '0861505894****': 27, '0861519673****': 28, '0861993062****': 29, '0861517967****': 30, '0861556434****': 31, '0861625780****': 32, '0861660625****': 33, '0861856022****': 34, '0861938160****': 35, '0861817675****': 36, '0861652141****': 37, '0861702327****': 38, '0861761495****': 39, '0861812897****': 40, '0861561663****': 41, '0861832524****': 42, '0861657135****': 43, '0861701791****': 44, '0861702326***

#### Obtain the label

In [16]:
label = df['label_7'].tolist()
print(len(label))
label = np.array(label, dtype=np.int32)
np.save(PATH+'labels.npy', label)

11196


### Construct the neighbor

Please edit the name of the neighbour dic, for example, the neighbour of ip should be nei_ip

In [17]:
### ip node nighbour
nei_ip = []

### terminal node neighbour
nei_terminal = []

In [18]:
def get_nei(node, df, node_dict):
    nei = []
    for index, row in df.iterrows():
        nei.append([node_dict[row[node]]])
    return nei


In [19]:
nei_ip = get_nei('userip', df, ip_dict)
nei_terminal = get_nei('terminal', df, terminal_dict)
print(nei_terminal[:10])

[[0], [1], [1], [0], [1], [0], [1], [2], [1], [1]]


In [20]:
nei_mobile = get_nei('mobile', df, mobile_dict) 
np.save(PATH+'nei_mobile.npy', nei_mobile)


In [17]:
# for index, row in df.iterrows():
#     nei_ip.append(np.array([ip_dict[row['userip']]]))
#     nei_terminal.append(np.array([terminal_dict[row['terminal']]]))


In [21]:
print(nei_ip[:10])
print(np.max(nei_ip))
print(nei_terminal[:10])
np.save(PATH+'nei_i.npy', nei_ip)
np.save(PATH+'nei_t.npy', nei_terminal)

[[0], [1], [2], [3], [4], [3], [5], [6], [7], [8]]
6809
[[0], [1], [1], [0], [1], [0], [1], [2], [1], [1]]


In [23]:
nei_ip = np.load(PATH+'nei_i.npy')
print(nei_ip[:10])

[[0]
 [1]
 [2]
 [3]
 [4]
 [3]
 [5]
 [6]
 [7]
 [8]]


In [24]:
feat_account = np.concatenate((province_feature, city_feature, appver_feature, mobile_operators_feature, os_version_feature), axis=1)
print(feat_account.shape) 

saprse_feat_account = sp.csr_matrix(feat_account)
sp.save_npz(PATH+'feat_a.npz', saprse_feat_account)

(11196, 747)


In [38]:
import socket
import struct

def ip_to_int(ip):
    return struct.unpack("!I", socket.inet_aton(ip))[0]


# 转换为整数,可以选择这种方式，相当于直接使用它的value，也可以选择前面写的convert_to_index函数直接使用或者搭配上onehotencoder
# ip_int_list = [ip_to_int(i) for i in ip]
print(len(ip))
ip_int_list = convert_to_index(ip)
print(len(ip_int_list))



ip_int_list = np.array(list(ip_int_list.values()))

feat_ip = ip_int_list.reshape(-1, 1)
print(feat_ip)
print(feat_ip.shape)



6810
6810
[[   0]
 [   1]
 [   2]
 ...
 [6807]
 [6808]
 [6809]]


In [26]:
feat_terminal = label_encoder.fit_transform(nei_terminal)
feat_terminal = feat_terminal.reshape(-1, 1)
feat_terminal = onehot_encoder.fit_transform(feat_terminal)
print(feat_terminal)
print(feat_terminal.shape)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]]
(11196, 5)


  y = column_or_1d(y, warn=True)


In [39]:
feat_mobile = np.array(nei_mobile).reshape(-1, 1)
print(feat_mobile.shape)

(11196, 1)


In [41]:
# np.save(PATH+'feat_i.npy', feat_ip)
# np.save(PATH+'feat_t.npy', feat_terminal)

In [42]:
accounts = df['num'].unique()
account_index = {account: idx for idx, account in enumerate(accounts)}
terminal = df['terminal'].unique()
terminal_index = {terminal: idx for idx, terminal in enumerate(terminal)}
aia = np.zeros((len(accounts), len(accounts)), dtype=int) #account-ip-account meta path
ata = np.zeros((len(accounts), len(accounts)), dtype=int) #account-terminal-account meta path
ama = np.zeros((len(accounts), len(accounts)), dtype=int) #account-mobile-account meta path
ip_to_accounts = defaultdict(list)
terminal_to_accounts = defaultdict(list)
mobile_to_accounts = defaultdict(list)
for _, row in df.iterrows():
    ip_to_accounts[row['userip']].append(row['num'])
    terminal_to_accounts[row['terminal']].append(row['num'])
    mobile_to_accounts[row['mobile']].append(row['num'])

for accounts in ip_to_accounts.values():
    for i in range(len(accounts)):
        for j in range(i + 1, len(accounts)):
            idx_i = account_index[accounts[i]]
            idx_j = account_index[accounts[j]]
            aia[idx_i][idx_j] += 1
            aia[idx_j][idx_i] += 1

for accounts in terminal_to_accounts.values():
    for i in range(len(accounts)):
        for j in range(i + 1, len(accounts)):
            idx_i = account_index[accounts[i]]
            idx_j = account_index[accounts[j]]
            ata[idx_i][idx_j] += 1
            ata[idx_j][idx_i] += 1

for accounts in mobile_to_accounts.values():
    for i in range(len(accounts)):
        for j in range(i + 1, len(accounts)):
            idx_i = account_index[accounts[i]]
            idx_j = account_index[accounts[j]]
            ama[idx_i][idx_j] += 1
            ama[idx_j][idx_i] += 1

print(aia.shape)
print(ata.shape)
print(ama.shape)

(11196, 11196)
(11196, 11196)
(11196, 11196)


In [43]:
sparse_aia = sp.csr_matrix(aia)
sparse_ata = sp.csr_matrix(ata)
sparse_ama = sp.csr_matrix(ama)
sp.save_npz(PATH+'aia.npz', sparse_aia)
sp.save_npz(PATH+'ata.npz', sparse_ata)
sp.save_npz(PATH+'ama.npz', sparse_ama)

!!!! 后面的要相应替换一下，把不用的注释掉,比如我下面这里注释掉了ata

In [44]:
def get_meta_path_neighbors(df, column):
    # 创建一个字典，键为中间节点（IP或Terminal），值为访问该节点的账户列表
    node_to_accounts = defaultdict(list)
    for _, row in df.iterrows():
        node_to_accounts[row[column]].append(row['num'])

    # 创建一个字典，键为账户，值为通过中间节点连接的其他账户及其出现次数
    account_meta_path = defaultdict(lambda: defaultdict(int))
    for accounts in node_to_accounts.values():
        for i in range(len(accounts)):
            for j in range(len(accounts)):
                if i != j:
                    account_meta_path[accounts[i]][accounts[j]] += 1

    return account_meta_path

# 获取 account-ip-account 的邻居统计
aia_neighbors = get_meta_path_neighbors(df, 'userip')
print(len(aia_neighbors))

# 获取 account-terminal-account 的邻居统计
ata_neighbors = get_meta_path_neighbors(df, 'terminal')
print(len(ata_neighbors))

# # 获取 account-mobile-account 的邻居统计
ama_neighbors = get_meta_path_neighbors(df, 'mobile')
print(len(ama_neighbors))

# # 打印结果
# print("Account-IP-Account Neighbors:")
# for account, neighbors in aia_neighbors.items():
#     print(f"Account {account}: {dict(neighbors)}")

# # print("\nAccount-Terminal-Account Neighbors:")
# # for account, neighbors in ata_neighbors.items():
# #     print(f"Account {account}: {dict(neighbors)}")

# print("\nAccount-Mobile-Account Neighbors:")
# for account, neighbors in ama_neighbors.items():
#     print(f"Account {account}: {dict(neighbors)}")

4533
11196
5779


In [45]:
combined_neighbors = defaultdict(lambda: defaultdict(int))

for account, neighbors in aia_neighbors.items():
    for neighbor, count in neighbors.items():
        combined_neighbors[account][neighbor] += count

for account, neighbors in ata_neighbors.items():
    for neighbor, count in neighbors.items():
        combined_neighbors[account][neighbor] += count

for account, neighbors in ama_neighbors.items():
    for neighbor, count in neighbors.items():
        combined_neighbors[account][neighbor] += count

# 将结果转换为列表并排序
sorted_neighbors = {}
for account, neighbors in combined_neighbors.items():
    sorted_neighbors[account] = sorted(neighbors.items(), key=lambda item: item[1], reverse=True)

# # 打印结果
# print("Combined Meta Path Neighbors (sorted by count):")
# for account, neighbors in sorted_neighbors.items():
#     print(f"Account {account}: {neighbors}")

In [46]:
count=0
most_n = 0
n_list = []
for account, neighbors in sorted_neighbors.items():
    _, most_c = neighbors[0]
    n_n = len(neighbors)
    if n_n>most_n:
        most_n = n_n
    if most_c>1:
        count+=1
    n_list.append(n_n)

# print(count)
print(most_n)
print(np.mean(n_list))
print(np.min(n_list))

5599
3799.1457663451233
3


In [47]:
accounts = list(combined_neighbors.keys())
account_index = {account: idx for idx, account in enumerate(accounts)}

# 初始化零矩阵
n = len(accounts)
adj_matrix = np.zeros((n, n), dtype=int)

# 填充邻接矩阵
for account, neighbors in combined_neighbors.items():
    for neighbor, count in neighbors.items():
        i = account_index[account]
        j = account_index[neighbor]
        adj_matrix[i, j] = count

# 打印邻接矩阵
print("Adjacency Matrix:")
print(adj_matrix)
print(adj_matrix.shape)

Adjacency Matrix:
[[0 2 2 ... 0 0 0]
 [2 0 2 ... 0 0 0]
 [2 2 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]]
(11196, 11196)


In [48]:
adj_matrix[adj_matrix > 0] = 1
pos = adj_matrix
print(pos.shape)

(11196, 11196)


In [49]:
sparse_matrix = sp.csr_matrix(pos)
sp.save_npz(PATH+'pos.npz', sparse_matrix)

In [50]:
positive_idx = []
negative_idx = []

for i, l in enumerate(label):
    if l == 0:
        negative_idx.append(i)
    else:
        positive_idx.append(i)

print(len(positive_idx))
print(len(negative_idx))




4759
6437


### Split 70% training set

In [None]:
random.seed(0)
train_idx = random.sample(positive_idx, int(len(positive_idx) * 0.7)) + random.sample(negative_idx, int(len(negative_idx) * 0.7))
val_test_idx = list(set(range(len(label))) - set(train_idx))
val_idx = random.sample(val_test_idx, len(val_test_idx) // 2)
test_idx = list(set(val_test_idx) - set(val_idx))

print(len(train_idx))
print(val_idx)


In [48]:
np.save(PATH+'train_70.npy', np.array(train_idx, dtype=np.int32))
np.save(PATH+'val_70.npy', np.array(val_idx, dtype=np.int32))
np.save(PATH+'test_70.npy', np.array(test_idx, dtype=np.int32))

### Split 20% training set

In [51]:
random.seed(0)
train_idx = random.sample(positive_idx, int(len(positive_idx) * 0.2)) + random.sample(negative_idx, int(len(negative_idx) * 0.2))
val_test_idx = list(set(range(len(label))) - set(train_idx))
val_idx = random.sample(val_test_idx, len(val_test_idx) // 2)
test_idx = list(set(val_test_idx) - set(val_idx))

print(len(train_idx))
print(val_idx)

2238
[489, 171, 5500, 1573, 6421, 6498, 5898, 1005, 5083, 10586, 2896, 5299, 1157, 7960, 8578, 5749, 4597, 5181, 6414, 10910, 10027, 9200, 2550, 8679, 7673, 3471, 10667, 1057, 4710, 9774, 7852, 647, 4864, 1673, 6322, 2032, 3740, 2666, 8966, 318, 7425, 10760, 8484, 2583, 10795, 5042, 10252, 5769, 5482, 8287, 8848, 9640, 1205, 3365, 10736, 7314, 4143, 5090, 3114, 4514, 6160, 3003, 3813, 5113, 581, 3713, 4738, 8554, 3217, 4974, 2668, 3441, 5067, 10731, 6133, 447, 9438, 4087, 1286, 8444, 3438, 7802, 10366, 7946, 4341, 7876, 10919, 10773, 10105, 2126, 2628, 1770, 11174, 244, 1689, 224, 4967, 8121, 9238, 5643, 8452, 7075, 8129, 5275, 1582, 3235, 4957, 4293, 943, 1101, 10482, 1378, 9150, 5086, 9243, 2784, 10446, 5931, 281, 3048, 7195, 7327, 7524, 6093, 10290, 5182, 2767, 480, 9819, 4084, 8143, 6958, 9697, 6174, 9028, 2883, 9733, 6066, 6806, 7154, 7795, 11076, 8232, 10285, 6103, 4795, 3864, 8845, 4414, 5410, 10543, 3429, 11050, 2900, 10600, 10884, 10381, 9744, 7003, 5289, 8885, 6182, 9345, 107

In [52]:
print(len(label))

11196


In [53]:
print(len(test_idx))
print(len(val_idx))

4479
4479


In [54]:
np.save(PATH+'train_20.npy', np.array(train_idx, dtype=np.int32))
np.save(PATH+'val_20.npy', np.array(val_idx, dtype=np.int32))
np.save(PATH+'test_20.npy', np.array(test_idx, dtype=np.int32))

In [56]:
feat_a = sp.load_npz('/afs/csail.mit.edu/u/l/leihuang/project/HeCo/data/huya_1w/pos.npz')
print(feat_a.shape)

(11196, 11196)
