# 说明，以下代码环境均是针对于kaggle平台

1. 首先导入相关的库、读取数据文件，并确认 Python 与 networkx 的版本是否符合要求

In [None]:
import platform
import pickle
import networkx as nx

In [None]:
# Report the interpreter and NetworkX versions this notebook is running on.
print(f"python version:\t\t {platform.python_version()}")
print(f"networkx version:\t {nx.__version__}")

In [None]:
def load_pickle(fname):
    """Deserialize and return the object stored in the pickle file *fname*.

    The file is opened in binary mode and is closed by the time the
    function returns.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(fname, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Load the pickled graph from the Kaggle dataset directory and print a summary of it.
# NOTE(review): nx.info() was deprecated in NetworkX 2.7 and removed in 3.0 —
# confirm that the networkx version printed above still provides it.
G = load_pickle('../input/ethereum-phishing-transaction-network/Ethereum Phishing Transaction Network/MulDiGraph.pkl')
print(nx.info(G))

2. 提取 node data 并做初步筛选：按每个节点的交易数（transactions，即该节点所有出边与入边的总数）过滤，交易数大于 1000 或小于 10 的节点一律剔除

In [None]:
# Keep every node whose total number of incident edges (outgoing plus incoming,
# parallel edges counted individually) lies in [10, 1000].
# Each retained node is stored together with that edge count as (node, count).
legal_node_set = set()
for nd in nx.nodes(G):
    outgoing = sum(G.number_of_edges(nd, nbr) for nbr in G.successors(nd))
    incoming = sum(G.number_of_edges(nbr, nd) for nbr in G.predecessors(nd))
    count = outgoing + incoming
    if 10 <= count <= 1000:
        legal_node_set.add((nd, count))
print(len(legal_node_set))

3. 生成label的csv文件并统计信息

In [None]:
import csv
# Freeze the retained (node, count) pairs into a list so later cells can rely on
# a fixed order, then write one "account, isp" row per retained node.
legal_node_list = list(legal_node_set)
with open('label.csv', 'w', newline='') as label_file:
    label_writer = csv.writer(label_file)
    label_writer.writerow(['account', 'isp'])
    for account, _edge_count in legal_node_list:
        label_writer.writerow([account, G.nodes[account]['isp']])

In [None]:
# Number of retained nodes whose 'isp' attribute equals 1, their share of the
# retained nodes, and the number of remaining nodes.
phishingCount = sum(
    1 for account, _edge_count in legal_node_list if G.nodes[account]['isp'] == 1
)
print(phishingCount)
print(phishingCount / len(legal_node_list))
print(len(legal_node_list) - phishingCount)

4. 生成对应的node_data特征文件

In [None]:
import statistics
def _summarize_transfers(timestamps, amounts, partner_count):
    """Return the ten features describing one direction (out or in) of a node.

    Order of the returned list: timestamp population std-dev, timestamp range
    (max - min), amount sum, amount max, amount min, amount mean, amount
    population std-dev, number of transfers, number of distinct counterparties,
    counterparties / transfers. All ten values are 0 when there are no transfers.
    """
    if not timestamps:
        return [0] * 10
    transfer_count = len(timestamps)
    return [
        statistics.pstdev(timestamps),
        # min()/max() replace the former 99999999 / 0 sentinels, which silently
        # capped the minimum for any timestamp >= 99999999 (e.g. epoch seconds).
        max(timestamps) - min(timestamps),
        sum(amounts),
        max(amounts),
        min(amounts),
        statistics.mean(amounts),
        statistics.pstdev(amounts),
        transfer_count,
        partner_count,
        partner_count / transfer_count,
    ]


# One row per retained node: [account, 10 outgoing features, 10 incoming features].
node_data = []
for account, _edge_count in legal_node_list:
    # Outgoing side: every parallel edge from `account` to each successor.
    out_times, out_amounts = [], []
    successors = set()
    for successor, parallel_edges in G[account].items():
        successors.add(successor)
        for attrs in parallel_edges.values():
            out_times.append(attrs['timestamp'])
            out_amounts.append(attrs['amount'])

    # Incoming side: every parallel edge from each predecessor to `account`.
    # Timestamps go to the time list and amounts to the amount list (they were
    # previously swapped, so the in_* columns described the wrong quantity).
    in_times, in_amounts = [], []
    predecessors = list(G.predecessors(account))
    for predecessor in predecessors:
        for attrs in G[predecessor][account].values():
            in_times.append(attrs['timestamp'])
            in_amounts.append(attrs['amount'])

    node_data.append(
        [account]
        + _summarize_transfers(out_times, out_amounts, len(successors))
        # in_unique counts distinct predecessors, not the successor set.
        + _summarize_transfers(in_times, in_amounts, len(predecessors))
    )




In [None]:
import csv
# Header names, in the same order as the values inside each node_data row.
feature_columns = [
    'account',
    'out_block_std', 'out_time_ptp', 'out_amount_sum', 'out_amount_max',
    'out_amount_min', 'out_amount_mean', 'out_amount_std', 'out_count',
    'out_unique', 'out_unique_ratio',
    'in_block_std', 'in_time_ptp', 'in_amount_sum', 'in_amount_max',
    'in_amount_min', 'in_amount_mean', 'in_amount_std', 'in_count',
    'in_unique', 'in_unique_ratio',
]
with open('node_data_features.csv', 'w', newline='') as features_file:
    features_writer = csv.writer(features_file)
    features_writer.writerow(feature_columns)
    features_writer.writerows(node_data)

5. 验证生成是否有误

In [None]:
# Sanity check: every feature row must belong to the node at the same position in
# legal_node_list, and its out_count (column 8) plus in_count (column 18) must equal
# the edge count stored with that node during filtering.
error_found = False
if len(legal_node_list) != len(node_data):
    # A length mismatch used to surface as an IndexError inside the loop.
    error_found = True
    print("出现错误")
for (account, expected_count), row in zip(legal_node_list, node_data):
    if account != row[0] or expected_count != row[8] + row[18]:
        error_found = True
        print("出现错误")
# Only report success when no mismatch was seen (it used to be printed always).
if not error_found:
    print("没问题")

# 2 针对本地电脑提取特征


In [1]:
import platform
import pickle
import networkx as nx

In [None]:
# Report the interpreter and NetworkX versions of the local environment.
print(f"python version:\t\t {platform.python_version()}")
print(f"networkx version:\t {nx.__version__}")

In [5]:
def load_pickle(fname):
    """Read the pickle file at *fname* and return the object it contains.

    NOTE: unpickling can execute arbitrary code; use only on trusted files.
    """
    with open(fname, 'rb') as source:
        payload = pickle.load(source)
    return payload

# Load the pickled graph from the current working directory and print a summary.
# The output recorded below this cell is a MemoryError.
# NOTE(review): nx.info() was deprecated in NetworkX 2.7 and removed in 3.0 —
# confirm that the installed version still provides it.
G = load_pickle('./MulDiGraph.pkl')
print(nx.info(G))

MemoryError: 

In [3]:
import random
# Draw ten reproducible pseudo-random integers in [1, 2973489] from a fixed seed.
random.seed(2973489)
random_list = [random.randint(1, 2973489) for _ in range(10)]
print(random_list)

[1921733, 2917988, 2275260, 1829473, 2554247, 1503928, 1300414, 1314603, 2495979, 1681630]


# 3 采用随机游走算法提取特征

In [None]:
import platform
import pickle
import networkx as nx

In [None]:
def load_pickle(fname):
    """Open *fname* in binary mode, unpickle its content and return it.

    NOTE: unpickling can execute arbitrary code; use only on trusted files.
    """
    with open(fname, 'rb') as stream:
        restored = pickle.load(stream)
    return restored

In [None]:
import random
# Ten reproducible pseudo-random integers in [1, 2973489]; the first one is used
# by RandomSelectNode as the index of the node where sampling starts.
random.seed(2973489)
random_list = [random.randint(1, 2973489) for _ in range(10)]
print(random_list)

In [None]:
import random

def RandomSelectNode(G, target_size=2600, start_index=None):
    """Sample about *target_size* nodes around a start node with a random walk.

    The node at position *start_index* of the graph's node iteration order is
    taken together with its successors and predecessors. While the sample is
    still too small, the walk moves to a randomly chosen successor or
    predecessor of the current node and absorbs that node's neighbours as well.

    Args:
        G: directed graph offering iteration over its nodes plus
            ``successors(node)`` and ``predecessors(node)``.
        target_size: number of nodes at which sampling stops (default 2600,
            the value that used to be hard-coded).
        start_index: position of the start node; defaults to the module-level
            ``random_list[0]``, which is what the function always used before.

    Returns:
        set: the sampled nodes; it may be smaller than *target_size* when the
        node iteration is exhausted first.

    A node without any neighbour used to raise ValueError from
    ``random.randint(0, -1)``; the walk now stops there and sampling resumes
    from the next node in iteration order.

    NOTE(review): a walk confined to a region with fewer than *target_size*
    reachable nodes and no dead end still never terminates — confirm this
    cannot happen for the graphs this is used on.
    """
    if start_index is None:
        start_index = random_list[0]

    legal_node_set = set()

    def absorb(neighbours):
        """Add *neighbours* in order until the sample holds target_size nodes."""
        for neighbour in neighbours:
            if len(legal_node_set) >= target_size:
                break
            legal_node_set.add(neighbour)

    # Iterating the graph yields its nodes in the same order as nx.nodes(G).
    for ind, nd in enumerate(G):
        if ind < start_index:
            continue

        legal_node_set.add(nd)
        suc = list(G.successors(nd))
        pre = list(G.predecessors(nd))
        absorb(suc)
        absorb(pre)

        while len(legal_node_set) < target_size:
            if not suc and not pre:
                # Dead end: nothing to walk to from the current node.
                break
            # The direction is drawn before the emptiness overrides so the
            # sequence of random draws matches the previous implementation.
            step_back = random.randint(0, 1) == 1
            if not suc:
                step_back = True
            if not pre:
                step_back = False
            pool = pre if step_back else suc
            node_select = pool[random.randint(0, len(pool) - 1)]

            legal_node_set.add(node_select)
            suc = list(G.successors(node_select))
            pre = list(G.predecessors(node_select))
            absorb(suc)
            absorb(pre)

        if len(legal_node_set) >= target_size:
            break
    return legal_node_set

In [None]:
# Load the pickled graph from the Kaggle dataset directory and sample a node
# set from it with RandomSelectNode.
G = load_pickle('../input/ethereum-phishing-transaction-network/Ethereum Phishing Transaction Network/MulDiGraph.pkl')
legal_node_set=RandomSelectNode(G)

In [None]:
# Freeze the sampled set into a list: the cells below index it by position.
legal_node_list=list(legal_node_set)

In [None]:
# Number of sampled nodes whose 'isp' attribute equals 1, their share of the
# sample, and the number of remaining nodes.
phishingCount = sum(1 for account in legal_node_list if G.nodes[account]['isp'] == 1)
print(phishingCount)
print(phishingCount / len(legal_node_list))
print(len(legal_node_list) - phishingCount)

In [None]:
import csv

def RandomLabel(legal_node_set):
    """Write the label file ``RandomLabel1.csv`` for the given nodes.

    The file starts with the header ``account,isp`` followed by one row per
    node in *legal_node_set*: the node itself and the ``'isp'`` attribute read
    from the module-level graph ``G``. Nothing is returned.
    """
    accounts = list(legal_node_set)
    with open('RandomLabel1.csv', 'w', newline='') as label_file:
        label_writer = csv.writer(label_file)
        label_writer.writerow(['account', 'isp'])
        for account in accounts:
            label_writer.writerow([account, G.nodes[account]['isp']])

In [None]:
# Write RandomLabel1.csv (account / isp) for the sampled nodes.
RandomLabel(legal_node_set)

 # 统计节点信息

In [None]:
import statistics

def _summarize_transfers(timestamps, amounts, partner_count):
    """Return the ten features describing one direction (out or in) of a node.

    Order of the returned list: timestamp population std-dev, timestamp range
    (max - min), amount sum, amount max, amount min, amount mean, amount
    population std-dev, number of transfers, number of distinct counterparties,
    counterparties / transfers. All ten values are 0 when there are no transfers.
    """
    if not timestamps:
        return [0] * 10
    transfer_count = len(timestamps)
    return [
        statistics.pstdev(timestamps),
        # min()/max() replace the former 99999999 / 0 sentinels, which silently
        # capped the minimum for any timestamp >= 99999999 (e.g. epoch seconds).
        max(timestamps) - min(timestamps),
        sum(amounts),
        max(amounts),
        min(amounts),
        statistics.mean(amounts),
        statistics.pstdev(amounts),
        transfer_count,
        partner_count,
        partner_count / transfer_count,
    ]


def getNodeData(legal_node_list):
    """Compute the transaction features of every node in *legal_node_list*.

    Edges are read from the module-level graph ``G``; every edge is expected to
    carry the attributes ``'timestamp'`` and ``'amount'``.

    Args:
        legal_node_list: sequence of nodes of ``G``.

    Returns:
        list: one 21-element row per node, in input order:
        ``[node, <10 outgoing features>, <10 incoming features>]``, each group
        of ten ordered as documented in ``_summarize_transfers``.
    """
    node_data = []
    for account in legal_node_list:
        # Outgoing side: every parallel edge from `account` to each successor.
        out_times, out_amounts = [], []
        successors = set()
        for successor, parallel_edges in G[account].items():
            successors.add(successor)
            for attrs in parallel_edges.values():
                out_times.append(attrs['timestamp'])
                out_amounts.append(attrs['amount'])

        # Incoming side: every parallel edge from each predecessor to `account`.
        # Timestamps go to the time list and amounts to the amount list (they
        # were previously swapped, so the in_* columns held the wrong quantity).
        in_times, in_amounts = [], []
        predecessors = list(G.predecessors(account))
        for predecessor in predecessors:
            for attrs in G[predecessor][account].values():
                in_times.append(attrs['timestamp'])
                in_amounts.append(attrs['amount'])

        node_data.append(
            [account]
            + _summarize_transfers(out_times, out_amounts, len(successors))
            # in_unique counts distinct predecessors, not the successor set.
            + _summarize_transfers(in_times, in_amounts, len(predecessors))
        )
    return node_data


In [None]:
# Build the feature rows for the sampled nodes and preview the first row.
node_data=getNodeData(legal_node_list)
print(node_data[0])

# 统计一阶朋友节点重要特征

In [None]:
import statistics

def getAllToFriendsOutDirectionFeatures(legal_node_list):
    """Aggregate the outgoing-amount statistics of each node's successors.

    For every successor of a node, the amounts of all of that successor's
    outgoing edges (read from the module-level graph ``G``) are reduced to
    their mean, population std-dev and minimum. Successors without outgoing
    edges are ignored.

    Args:
        legal_node_list: sequence of nodes of ``G``.

    Returns:
        list: one row per node, in input order:
        ``[node, sum of the means, smallest std-dev, sum of the minimums]``.
        The three values are 0 when no successor has outgoing edges.
    """
    To_Out_Fea = []
    for account in legal_node_list:
        means, stds, minimums = [], [], []
        for successor in G.successors(account):
            amounts = []
            for parallel_edges in G[successor].values():
                amounts.extend(attrs['amount'] for attrs in parallel_edges.values())
            if not amounts:
                continue
            means.append(statistics.mean(amounts))
            stds.append(statistics.pstdev(amounts))
            # min() replaces a 999999999 sentinel that capped the minimum
            # whenever every amount was larger than it.
            minimums.append(min(amounts))
        To_Out_Fea.append([
            account,
            sum(means),
            min(stds) if stds else 0,
            sum(minimums),
        ])
    return To_Out_Fea



In [None]:
# Compute the successor-based features for the sampled nodes and preview the first row.
To_Out_Fea=getAllToFriendsOutDirectionFeatures(legal_node_list)
print(To_Out_Fea[0])

In [None]:
import statistics
def getAllFromFriendsInDirectionFeatures(legal_node_list):
    """Aggregate the incoming-amount statistics of each node's predecessors.

    For every predecessor of a node, the amounts of all of that predecessor's
    incoming edges (read from the module-level graph ``G``) are reduced to
    their sum, population std-dev and minimum. Predecessors without incoming
    edges are ignored.

    Args:
        legal_node_list: sequence of nodes of ``G``.

    Returns:
        list: one row per node, in input order:
        ``[node, sum of the minimums, smallest std-dev, smallest sum,
        mean of the sums]``. The four values are 0 when no predecessor has
        incoming edges.
    """
    From_In_Fea = []
    for account in legal_node_list:
        sums, stds, minimums = [], [], []
        for predecessor in G.predecessors(account):
            amounts = []
            for source in G.predecessors(predecessor):
                amounts.extend(
                    attrs['amount'] for attrs in G[source][predecessor].values()
                )
            if not amounts:
                continue
            sums.append(sum(amounts))
            stds.append(statistics.pstdev(amounts))
            # min() replaces a 9999999 sentinel that capped the minimum
            # whenever every amount was larger than it.
            minimums.append(min(amounts))
        From_In_Fea.append([
            account,
            sum(minimums),
            min(stds) if stds else 0,
            min(sums) if sums else 0,
            statistics.mean(sums) if sums else 0,
        ])
    return From_In_Fea




In [None]:
# Compute the predecessor-based features for the sampled nodes and preview one row.
# NOTE(review): this prints row index 1 (the second row) while the other preview
# cells print row 0 — confirm this is intentional.
From_In_Fea=getAllFromFriendsInDirectionFeatures(legal_node_list)
print(From_In_Fea[1])