### Validating transactions

We remove fraudulent blocks from the blockchain data so that only valid inputs and outputs remain.

In [1]:
import pandas as pd

# Load the transaction table; the first CSV column is used as the index
# and is labelled "tr_id".
transactions = pd.read_csv(
    "transactions.csv",
    index_col=0,
    names=["block_id", "is_coinbase"],
)
transactions.index.name = "tr_id"

In [2]:
# Blocks that failed validation, with the reason each one is rejected.
issues = [
    52534,  # two coinbases in the same block, has to be dropped
    11181,  # mints 10 satoshis that are not covered by fees
    12042,  # double-spend attack on output 7998, spent at input 521
    15567,  # spends an output that does not exist (yet)
    30223,  # spends the same output (21928) twice in the same transaction
    56565,  # tries to double spend output 65403
    72902,  # outputs exceed inputs by 10 satoshis
    75047,  # has a negative 50 BTC output
    79885,  # spends an output from a block with two coinbases
    88755,  # spends the very output it creates
    96607,  # spends an output that has not been created yet
]

print("Will remove {} blocks".format(len(issues)))

# Compute the mask once and reuse it for counting, collecting ids and filtering.
problematic_blocks = transactions.block_id.isin(issues)
print("Removing {} transactions".format(int(problematic_blocks.sum())))
tx_to_remove = transactions[problematic_blocks].index.values
print(tx_to_remove)

# Keep only the transactions of the remaining blocks. An explicit copy is taken
# because new columns are assigned to `transactions` later in the notebook, and
# assigning to a filtered slice triggers pandas' SettingWithCopyWarning.
transactions = transactions[~problematic_blocks].copy()

# Load inputs and outputs so they can later be attached to their transactions.
inputs = pd.read_csv("inputs.csv", index_col=0, names=["tr_id", "output_id"])
outputs = pd.read_csv("outputs.csv", index_col=0, names=["tr_id", "pk_id", "value"])

# Drop the inputs and outputs that belong to the removed transactions.
inputs = inputs[~inputs.tr_id.isin(tx_to_remove)]
outputs = outputs[~outputs.tr_id.isin(tx_to_remove)]

# Sanity check: no remaining block may contain more than one coinbase.
coinbases_per_block = transactions.loc[transactions.is_coinbase == 1, "block_id"].value_counts()
two_coinbase = int((coinbases_per_block > 1).sum())
assert not two_coinbase, "{} block(s) still contain more than one coinbase".format(two_coinbase)

Will remove 11 blocks
Removing 27 transactions
[ 11281  11282  12151  12152  15697  15698  30445  30446  56851  56852
  56853  61841  61842  61843  61844  61845 100928 100929 105279 105280
 105281 114915 114916 137236 137237 204750 204751]


In [3]:
# Attach to every transaction the list of ids of its inputs and of its outputs.

# Every transaction starts with an empty list, so a transaction without inputs
# (or outputs) ends up with [] rather than a missing value.
input_summary = {tr_id: [] for tr_id in transactions.index}
output_summary = {tr_id: [] for tr_id in transactions.index}

# The loop variables are deliberately not called `id`: a top-level loop variable
# of that name would replace the builtin `id` for the rest of the kernel session.
for input_id, tr_id in inputs.tr_id.items():
    input_summary[tr_id].append(input_id)
for output_id, tr_id in outputs.tr_id.items():
    output_summary[tr_id].append(output_id)

# assign() builds a new frame instead of writing columns into `transactions`
# in place, which avoids SettingWithCopyWarning when it is a filtered slice.
transactions = transactions.assign(
    inputs=pd.Series(input_summary),
    outputs=pd.Series(output_summary),
)

In [4]:
def handle_coinbase(transaction):
    """Register the outputs of a coinbase transaction as unspent.

    Reads the module-level ``outputs`` frame and ``coinbase_value``; updates the
    module-level ``utxo`` mapping (output id -> ``[pk_id, value]``) and the
    ``to_check`` list.

    Parameters
    ----------
    transaction
        A row of ``transactions`` exposing ``block_id``, ``inputs`` and
        ``outputs`` (the last two being lists of ids).

    Raises
    ------
    ValueError
        If the transaction has inputs, or has no outputs.
    """
    # Basic shape checks: a coinbase creates coins, so it spends nothing and
    # must create at least one output.
    if transaction.inputs:
        raise ValueError("Coinbase transaction has inputs:\n{}".format(transaction))
    if not transaction.outputs:
        raise ValueError("Coinbase transaction has no outputs:\n{}".format(transaction))

    coinbase = outputs.loc[transaction.outputs]

    # A total different from the fixed reward is not rejected here: the block is
    # only recorded in `to_check`, which handle_transaction consults when the
    # inputs and outputs of a transaction in that block do not balance.
    # Summing works for one or many outputs and avoids calling int() on a
    # single-element Series, which pandas has deprecated.
    if coinbase.value.sum() != coinbase_value:
        to_check.append(transaction.block_id)

    # Every output of the coinbase becomes spendable.
    for output_id, pk_id, value in zip(coinbase.index, coinbase.pk_id, coinbase.value):
        utxo[output_id] = [pk_id, value]

        
def handle_transaction(transaction):
    """Validate a non-coinbase transaction and apply it to the UTXO set.

    Reads the module-level ``inputs`` and ``outputs`` frames and the ``to_check``
    list; updates the module-level ``utxo`` mapping (output id ->
    ``[pk_id, value]``). All checks run before ``utxo`` is modified, so a
    rejected transaction leaves it untouched.

    Parameters
    ----------
    transaction
        A row of ``transactions`` exposing ``block_id``, ``inputs`` and
        ``outputs`` (the last two being lists of ids).

    Raises
    ------
    ValueError
        If the transaction has no inputs or no outputs, spends the same output
        twice, spends an output that is not currently unspent, creates more
        value than it spends, or leaves a surplus in a block that is not listed
        in ``to_check``.
    """
    if not transaction.inputs:
        raise ValueError("Transaction has no inputs:\n{}".format(transaction))
    if not transaction.outputs:
        raise ValueError("Transaction has no outputs:\n{}".format(transaction))

    all_outputs = outputs.loc[transaction.outputs]
    value_output = all_outputs.value.sum()

    spent_rows = inputs.loc[transaction.inputs]
    all_inputs = list(spent_rows.output_id)

    # The same output referenced twice by one transaction is a double spend.
    if len(set(all_inputs)) != len(all_inputs):
        raise ValueError(
            "Transaction spends the same output more than once:\n{}\n{}".format(spent_rows, transaction)
        )

    # Every referenced output must currently be unspent.
    missing = [output_id for output_id in all_inputs if output_id not in utxo]
    if missing:
        raise ValueError(
            "Transaction spends outputs that are not unspent {}:\n{}\n{}".format(
                missing, spent_rows, transaction
            )
        )

    value_input = sum(utxo[output_id][1] for output_id in all_inputs)
    diff = value_input - value_output

    # Spending less than is created is never acceptable.
    if diff < 0:
        raise ValueError(
            "Transaction outputs exceed inputs by {}:\n{}".format(-diff, transaction)
        )
    # A surplus is only tolerated in blocks that handle_coinbase flagged in
    # `to_check` (coinbase total different from the fixed reward).
    if diff > 0 and transaction.block_id not in to_check:
        raise ValueError(
            "Transaction leaves {} unaccounted for:\n{}".format(diff, transaction)
        )

    # The transaction is valid: consume its inputs, then register its outputs.
    for output_id in all_inputs:
        del utxo[output_id]

    for output_id, pk_id, value in zip(all_outputs.index, all_outputs.pk_id, all_outputs.value):
        utxo[output_id] = [pk_id, value]

In [5]:
# State shared with handle_coinbase / handle_transaction.
utxo = dict()  # unspent output id -> [owner, value]
coinbase_value = 50 * 10**8  # presumably 50 BTC expressed in satoshis
# Blocks whose coinbase differs from the fixed reward; fees are a block-level
# feature, so they are checked externally.
to_check = list()

# Replay every transaction in table order, dispatching on the coinbase flag.
for position, is_coinbase_tx in enumerate(transactions.is_coinbase):
    # Reported before row `position` is handled, hence the count is `position`.
    if (position + 1) % 10**4 == 0:
        print("Handled {} transactions".format(position))
    handler = handle_coinbase if is_coinbase_tx else handle_transaction
    handler(transactions.iloc[position])

Handled 9999 transactions
Handled 19999 transactions
Handled 29999 transactions
Handled 39999 transactions
Handled 49999 transactions
Handled 59999 transactions
Handled 69999 transactions
Handled 79999 transactions
Handled 89999 transactions
Handled 99999 transactions
Handled 109999 transactions
Handled 119999 transactions
Handled 129999 transactions
Handled 139999 transactions
Handled 149999 transactions
Handled 159999 transactions
Handled 169999 transactions
Handled 179999 transactions
Handled 189999 transactions
Handled 199999 transactions
Handled 209999 transactions


In [6]:
# Number of entries left in the UTXO mapping once every transaction has been replayed.
len(utxo)

71887

In [7]:
# Largest value held by a single unspent output, scaled down by 10**8.
val = [amount for _owner, amount in utxo.values()]
max(val) / 10**8

90000.0

In [8]:
# Cluster public keys that are presumably controlled by the same entity.
# Two rules are applied per transaction:
#   * all public keys spent together as inputs of one transaction are merged;
#   * when a transaction has inputs and exactly one output, the key of that
#     output is merged with the input keys.
#
# Clusters are tracked with a union-find forest over integer labels rather than
# rewriting the whole label Series on every merge (one full pass per merge).
# A merged cluster always keeps the label of the second key's cluster, so the
# final `pk_ids` and the progress counts match what repeated
# `Series.replace(old_label, new_label)` produces.

unique_pks = outputs.pk_id.unique()
# Initial label of a key: its position in order of first appearance.
label_of_pk = {pk: label for label, pk in enumerate(unique_pks.tolist())}
parent = list(range(len(label_of_pk)))  # parent[label] == label marks a representative
n_clusters = len(label_of_pk)


def _find(label):
    """Return the representative label of the cluster containing `label`."""
    root = label
    while parent[root] != root:
        root = parent[root]
    # Path compression: point every label on the walked path straight at the root.
    while parent[label] != root:
        parent[label], label = root, parent[label]
    return root


def _merge(pk_from, pk_into):
    """Fold the cluster of `pk_from` into the cluster of `pk_into`.

    Returns True when two distinct clusters were joined, False when both keys
    already belonged to the same cluster.
    """
    root_from = _find(label_of_pk[pk_from])
    root_into = _find(label_of_pk[pk_into])
    if root_from == root_into:
        return False
    parent[root_from] = root_into
    return True


for tr_id, (input_, output_) in transactions[["inputs", "outputs"]].iterrows():
    # Progress report, keyed on the transaction id.
    if not (tr_id + 1) % 10000:
        print(tr_id + 1)
        print("Number distinct values: ", n_clusters)

    single_output_spend = len(output_) == 1 and len(input_) > 0
    if not (len(input_) > 1 or single_output_spend):
        continue

    # Keys that own the outputs being spent, and keys receiving the new outputs.
    pk_input = list(outputs.loc[inputs.loc[input_].output_id].pk_id)
    pk_output = list(outputs.loc[output_].pk_id)

    if len(input_) > 1:
        for pk in pk_input[1:]:
            if _merge(pk_input[0], pk):
                n_clusters -= 1

    if single_output_spend:
        # All input keys already share a cluster when there are several, so a
        # single merge with the first input key is enough.
        if _merge(pk_input[0], pk_output[0]):
            n_clusters -= 1

# Final mapping: public key -> label of its cluster.
pk_ids = pd.Series([_find(label) for label in range(len(parent))], index=unique_pks)

10000
Number distinct values:  174120
20000
Number distinct values:  173708
30000
Number distinct values:  171941
40000
Number distinct values:  169877
50000
Number distinct values:  166095
60000
Number distinct values:  161648
70000
Number distinct values:  156463
80000
Number distinct values:  153179
90000
Number distinct values:  150071
100000
Number distinct values:  146941
110000
Number distinct values:  141077
120000
Number distinct values:  134572
130000
Number distinct values:  126761
140000
Number distinct values:  120950
150000
Number distinct values:  118104
160000
Number distinct values:  117374
170000
Number distinct values:  117172
180000
Number distinct values:  117007
190000
Number distinct values:  116284
200000
Number distinct values:  115126
210000
Number distinct values:  109721


In [10]:
# Total value of the unspent outputs held by each cluster of public keys.
# An output counts as unspent when no input references its id.
unspent = outputs[~outputs.index.isin(inputs.output_id)]
# Cluster label of the key owning each unspent output, in row order.
owner_cluster = pk_ids.loc[unspent.pk_id].values
# sort=False keeps the clusters in order of first appearance, which is the
# insertion order a row-by-row accumulation into a dict would produce.
possessions_check = unspent.value.groupby(owner_cluster, sort=False).sum().to_dict()

In [11]:
# Largest total held by a single cluster, divided by 10**8 (presumably satoshis -> BTC).
max(possessions_check.values())/10**8

998547.75176268001

In [12]:
# Label of the cluster with the largest total in `possessions_check`.
entity = max(possessions_check, key=possessions_check.get)
entity

174664

In [13]:
# Public keys whose cluster label equals `entity`.
controlled = [pk for pk, cluster in pk_ids.items() if cluster == entity]

In [15]:
# Smallest public-key id among the keys in `controlled`.
min(controlled)

172

In [69]:
# For every output paid to a controlled key, look at the outputs its transaction
# spends; when none of them belongs to a controlled key, record the transaction
# id together with the total value it spends.
senders = []
paying_txs = outputs[outputs.pk_id.isin(controlled)].tr_id
for seen, tx in enumerate(paying_txs, start=1):
    if seen % 10**4 == 0:
        print(seen)
    spent_output_ids = inputs.loc[transactions.loc[tx].inputs].output_id
    funding = outputs.loc[spent_output_ids]
    if not funding.pk_id.isin(controlled).any():
        senders.append([tx, funding.value.sum()])

10000
20000
30000
40000
50000
60000
70000
80000


In [70]:
# Turn the [tr_id, amount] pairs into a Series of amounts indexed by tr_id.
# NOTE: `senders` is rebound from a list to a Series here, so this cell cannot
# be re-run without first re-running the cell that builds the list.
senders = pd.Series(
    [amount for _tx, amount in senders],
    index=[tx for tx, _amount in senders],
)

In [72]:
# Largest amount in `senders`, divided by 10**8.
print(senders.max()/10**8)
# idxmax() returns the index label (the tr_id) of that amount. argmax() returns
# the integer position in current pandas, which cannot be used with
# `transactions.loc[...]`.
print(senders.idxmax())

49980.0
98122


In [73]:
# 98122 is the transaction id printed by the previous cell.
transactions.loc[98122]

block_id                  71562
is_coinbase                   0
inputs                  [56287]
outputs        [114341, 114342]
Name: 98122, dtype: object