In [1]:
import json
import os
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
def extract_features(json_path):
    """Build one row of deposit-behaviour features per wallet from a JSON dump.

    The file at ``json_path`` must contain a JSON array of transaction objects.
    Each object is read through the keys ``userWallet``, ``action``,
    ``timestamp``, ``network`` and ``actionData`` (with ``amount`` and
    ``assetSymbol``). Only transactions whose ``action`` equals ``'deposit'``
    contribute; wallets with no deposits are skipped.

    Parameters
    ----------
    json_path : str or path-like
        Location of the transaction dump.

    Returns
    -------
    pandas.DataFrame
        One row per wallet that has at least one deposit, with columns
        ``wallet``, ``total_amount``, ``avg_amount``, ``max_amount``,
        ``unique_assets``, ``tx_count``, ``tx_per_day``, ``variance`` and
        ``network_diversity``. The columns are present even when no wallet
        qualifies, so downstream column lookups do not fail on an empty result.

    Notes
    -----
    * Amounts are parsed with ``int`` and summed as Python integers, so very
      large raw token amounts do not overflow.
    * ``timestamp`` is assumed to be in seconds: the first-to-last span is
      divided by 86 400 to express it in days (TODO confirm against the data).
    """
    feature_columns = [
        "wallet",
        "total_amount",
        "avg_amount",
        "max_amount",
        "unique_assets",
        "tx_count",
        "tx_per_day",
        "variance",
        "network_diversity",
    ]

    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding (e.g. cp1252 on Windows).
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Group every transaction under the wallet that issued it.
    wallet_data = defaultdict(list)
    for tx in data:
        wallet_data[tx["userWallet"]].append(tx)

    features = []
    for wallet, txs in wallet_data.items():
        deposits = [tx for tx in txs if tx['action'] == 'deposit']
        if not deposits:
            continue

        amounts = [int(tx['actionData']['amount']) for tx in deposits]
        asset_symbols = {tx['actionData']['assetSymbol'] for tx in deposits}
        timestamps = sorted(int(tx['timestamp']) for tx in deposits)
        networks = {tx['network'] for tx in deposits}

        tx_count = len(deposits)
        # Activity window in days, floored at one day so that single-deposit or
        # same-day wallets do not produce a division by zero / inflated rate.
        time_span = max(1, (timestamps[-1] - timestamps[0]) / (60 * 60 * 24))
        # Variance of the gaps between consecutive deposits; 0 when there is
        # only one deposit (no gap to measure).
        variance = np.var(np.diff(timestamps)) if len(timestamps) > 1 else 0

        features.append({
            "wallet": wallet,
            "total_amount": sum(amounts),
            "avg_amount": np.mean(amounts),
            "max_amount": max(amounts),
            "unique_assets": len(asset_symbols),
            "tx_count": tx_count,
            "tx_per_day": tx_count / time_span,
            "variance": variance,
            "network_diversity": len(networks)
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(features, columns=feature_columns)

In [3]:
def generate_pseudo_labels(df):
    """Return a copy of ``df`` with a heuristic integer ``score`` column added.

    Six components are each min-max scaled to [0, 1] across the rows of ``df``
    and combined with fixed weights, then multiplied by 1000 and truncated to
    an integer:

    ========================  ======
    component                 weight
    ========================  ======
    ``total_amount``          0.3
    ``tx_per_day``            0.2
    ``max_amount``            0.2
    ``1 / (unique_assets+1)`` 0.1
    ``1 / (variance+1)``      0.1
    ``network_diversity``     0.1
    ========================  ======

    The two reciprocal terms are larger for wallets with fewer distinct assets
    and with lower variance between deposit gaps.

    Because scaling is relative to the min and max found in ``df``, scores are
    only comparable within one call, and a single extreme value compresses the
    contribution of that component for every other row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns plus ``score``. The input
        frame is not modified, so re-running the calling cell is idempotent
        and earlier displays of the input stay accurate.
    """
    def _unit_scale(values):
        # Min-max scale one 2-D column to [0, 1] and flatten it to 1-D.
        return MinMaxScaler().fit_transform(values)[:, 0]

    labeled = df.copy()

    # MinMaxScaler rejects arrays with zero samples; an empty input simply
    # yields an empty (integer) score column.
    if len(labeled) == 0:
        labeled["score"] = np.zeros(0, dtype=int)
        return labeled

    inverse_assets = 1 / (labeled["unique_assets"] + 1).values.reshape(-1, 1)
    inverse_variance = 1 / (labeled["variance"] + 1).values.reshape(-1, 1)

    # (values, weight) pairs, kept in the original summation order so the
    # floating-point result is unchanged.
    weighted_components = [
        (labeled[["total_amount"]], 0.3),
        (labeled[["tx_per_day"]], 0.2),
        (labeled[["max_amount"]], 0.2),
        (inverse_assets, 0.1),
        (inverse_variance, 0.1),
        (labeled[["network_diversity"]], 0.1),
    ]

    combined = np.zeros(len(labeled))
    for values, weight in weighted_components:
        combined = combined + _unit_scale(values) * weight

    labeled["score"] = combined * 1000
    labeled["score"] = labeled["score"].astype(int)  # truncates toward zero
    return labeled

In [4]:
def train_model(df, test_size=0.2, n_estimators=100, random_state=42):
    """Fit a random-forest regressor on the pseudo scores and rank wallets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``wallet`` and ``score``. Every other column is used as a
        model feature, except ``predicted_score`` if it is already present
        (see Notes).
    test_size : float, default 0.2
        Fraction of rows held out for the RMSE report.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed used for both the train/test split and the forest.

    Returns
    -------
    pandas.DataFrame
        ``wallet`` and ``predicted_score`` for every row of ``df``, sorted by
        ``predicted_score`` in descending order.

    Notes
    -----
    * Prints the hold-out RMSE as ``Model RMSE: <value>``.
    * Side effect kept for backward compatibility: ``predicted_score`` is
      written onto ``df``. That column is excluded from the features, so
      calling this function again on the same frame does not feed earlier
      predictions back into the model.
    * Predictions are produced for all rows, including the ones the model was
      trained on.
    """
    X = df.drop(columns=["wallet", "score"])
    # A previous call leaves `predicted_score` on `df`; using it as a feature
    # would leak the target and change results on re-run.
    X = X.drop(columns=["predicted_score"], errors="ignore")
    y = df["score"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    print(f"Model RMSE: {rmse:.2f}")

    df["predicted_score"] = model.predict(X)
    return df[["wallet", "predicted_score"]].sort_values(by="predicted_score", ascending=False)

In [7]:
# Location of the transaction dump. Set the WALLET_TX_JSON environment variable
# to point at your own copy; if it is unset, the original local path is used.
json_path = os.environ.get("WALLET_TX_JSON", r"D:\Downloads\user-wallet-transactions.json")

In [8]:
# Build the per-wallet deposit feature table from the transaction dump.
df_feat = extract_features(json_path)
# Bare last expression: the notebook renders the frame as the cell output.
df_feat

Unnamed: 0,wallet,total_amount,avg_amount,max_amount,unique_assets,tx_count,tx_per_day,variance,network_diversity
0,0x00000000001accfa9cef68cf5371a23025b6d4b6,2000000000,2.000000e+09,2000000000,1,1,1.000000,0.000000e+00,1
1,0x000000000051d07a4fb3bd10121a343d85818da6,145000000000000000000,1.450000e+20,145000000000000000000,1,1,1.000000,0.000000e+00,1
2,0x000000000096026fb41fc39f9875d164bd82e2dc,5000000000000000,2.500000e+15,4000000000000000,1,2,0.264057,0.000000e+00,1
3,0x0000000002032370b971dabd36d72f3e5a7bf1ee,94593658147077093369063,3.783746e+20,16000000000000000000000,6,250,2.024664,3.425642e+09,1
4,0x000000000a38444e0a6e37d3b630d7e855a7cb13,10609617267610298671814,1.515660e+21,6640000000000000000000,3,7,0.191402,5.485174e+11,1
...,...,...,...,...,...,...,...,...,...
3323,0x06185db89f5c8ef5e5a0abc95c5cb69b54c33f30,98,9.800000e+01,98,1,1,1.000000,0.000000e+00,1
3324,0x06189341e523a52ae10c4bd4a7774371fac1b249,9664407909841847976,2.416102e+18,6064407909831906725,3,4,4.000000,1.113491e+08,1
3325,0x0618c450370822c5cb25ec143a3008230d8e2c12,14105464317308176032,1.410546e+19,14105464317308176032,1,1,1.000000,0.000000e+00,1
3326,0x0618e615c8c9b5efc8c8eadc68be7182bdc455e2,1725242181206597020998,1.327109e+20,305000000000000000000,6,13,0.518109,3.941515e+10,1


In [9]:
# Add the heuristic `score` column computed by generate_pseudo_labels.
df_labeled = generate_pseudo_labels(df_feat)
# Bare last expression: the notebook renders the frame as the cell output.
df_labeled

Unnamed: 0,wallet,total_amount,avg_amount,max_amount,unique_assets,tx_count,tx_per_day,variance,network_diversity,score
0,0x00000000001accfa9cef68cf5371a23025b6d4b6,2000000000,2.000000e+09,2000000000,1,1,1.000000,0.000000e+00,1,208
1,0x000000000051d07a4fb3bd10121a343d85818da6,145000000000000000000,1.450000e+20,145000000000000000000,1,1,1.000000,0.000000e+00,1,208
2,0x000000000096026fb41fc39f9875d164bd82e2dc,5000000000000000,2.500000e+15,4000000000000000,1,2,0.264057,0.000000e+00,1,202
3,0x0000000002032370b971dabd36d72f3e5a7bf1ee,94593658147077093369063,3.783746e+20,16000000000000000000000,6,250,2.024664,3.425642e+09,1,25
4,0x000000000a38444e0a6e37d3b630d7e855a7cb13,10609617267610298671814,1.515660e+21,6640000000000000000000,3,7,0.191402,5.485174e+11,1,37
...,...,...,...,...,...,...,...,...,...,...
3323,0x06185db89f5c8ef5e5a0abc95c5cb69b54c33f30,98,9.800000e+01,98,1,1,1.000000,0.000000e+00,1,208
3324,0x06189341e523a52ae10c4bd4a7774371fac1b249,9664407909841847976,2.416102e+18,6064407909831906725,3,4,4.000000,1.113491e+08,1,69
3325,0x0618c450370822c5cb25ec143a3008230d8e2c12,14105464317308176032,1.410546e+19,14105464317308176032,1,1,1.000000,0.000000e+00,1,208
3326,0x0618e615c8c9b5efc8c8eadc68be7182bdc455e2,1725242181206597020998,1.327109e+20,305000000000000000000,6,13,0.518109,3.941515e+10,1,12


In [10]:
# Fit the regressor on the pseudo-labelled features. train_model prints the
# hold-out RMSE and returns the wallets ranked by predicted score, highest first.
df_scores = train_model(df_labeled)
print(df_scores)

Model RMSE: 11.34
                                          wallet  predicted_score
129   0x0034baeeb160a5f1032b6d124d3e87cc94d74e62           380.96
2584  0x04b5762ee57e2fa1cc1dd2e13d284d0284f2d41e           235.88
53    0x000f908720f9ce164d90f4fc89459fab4442907f           217.69
2301  0x0423422bf2e02ac5c5bff7bb34463b9dc372946c           216.00
3160  0x05c8f94d48672c9304d0774e164bbfb03beb78ae           216.00
...                                          ...              ...
627   0x010d23b534006f4e4f174a7530d7d9e3c131732a             3.63
2586  0x04b5c5d08ddef275535c5a95a62f803fc1cd3dba             3.36
2747  0x04fdac80b3ca0b9066bd275f176dc807e3908a8e             3.19
2502  0x048aaa70a3e351c9e3b487ba78fa03efbf062067             2.33
2007  0x038ca2dcaf884d02dd1deae89bf23a7831fb1179             1.74

[3328 rows x 2 columns]


