In [15]:
import os
import pandas as pd
from google.cloud import bigquery
from IPython.display import clear_output
from gensim.models import Word2Vec, tfidfmodel
from collections import defaultdict
import numpy as np
import logging

In [16]:
# Path of the Google Cloud credentials file (presumably a service-account key).
# It is relative, so it is resolved against the kernel's working directory.
GOOGLE_CREDENTIALS_FILE = 'upwork_proj.json'
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDENTIALS_FILE


In [17]:
# BigQuery client; presumably authenticates with the credentials file exported
# through GOOGLE_APPLICATION_CREDENTIALS in the previous cell -- TODO confirm.
client = bigquery.Client()


# Fully qualified dataset and the two event tables read by the query below.
ETHEREUM_COMMON = 'blockchain-etl.ethereum_common'
TRANSFERSINGLE = f'{ETHEREUM_COMMON}.All_event_TransferSingle'
TRANSFERBATCH = f'{ETHEREUM_COMMON}.All_event_TransferBatch'
# Contract whose transfer events are extracted.
# NOTE(review): the f-prefix is unnecessary here, the literal has no placeholders.
PARALLEL_ALPHA_CONTRACT = f'0x76BE3b62873462d2142405439777e971754E8E77'

# One row per (receiving address, item id, block timestamp) for the contract above.
#   * First SELECT: single transfers, one item id per event row.
#   * Second SELECT: batch transfers; the comma-separated `ids` column is split and
#     unnested so that every id in the batch becomes its own row.
# Both sides of the contract filter are lower-cased, so the match is case-insensitive.
# NOTE(review): `like` is used with a pattern that has no wildcards, so it acts as an
# equality test; `=` would state the intent more directly.
# NOTE(review): there is no ORDER BY, so the row order of `df` should not be relied on.
df = client.query(f"""
SELECT ts.to as address, ts.id as item, ts.block_timestamp as timestamp
FROM {TRANSFERSINGLE} as ts
WHERE LOWER(contract_address) like LOWER('{PARALLEL_ALPHA_CONTRACT}')
UNION ALL
SELECT tb.to as address, item, tb.block_timestamp as timestamp
FROM {TRANSFERBATCH} as tb
CROSS JOIN UNNEST(SPLIT(tb.ids, ',')) as item
WHERE LOWER(contract_address) like LOWER('{PARALLEL_ALPHA_CONTRACT}')
""").to_dataframe()


In [18]:
# Address that is excluded from the per-address sequences (the all-zero address;
# presumably transfers *to* it are burns rather than real holders -- TODO confirm).
ZERO_ADDRESS = '0x0000000000000000000000000000000000000000'

# One chronologically ordered list of received items per address.
# A stable sort keeps rows that share a timestamp in their `df` order, so re-running
# the cell on the same frame always produces the same sequences. groupby() already
# leaves `address` as the index, so no reset_index()/set_index() round trip is needed.
seqs = (
    df.sort_values('timestamp', kind='mergesort')
    .groupby('address')['item']
    .apply(list)
    .to_frame(name='items')
)

# errors='ignore': do not fail when the zero address never received anything.
seqs = seqs.drop(ZERO_ADDRESS, errors='ignore')

# X is every item but the last, Y is the last item. Slicing builds new lists, so
# `items` is left untouched and nothing is mutated inside the mapped function.
seqs['X'] = seqs['items'].map(lambda items: items[:-1])
seqs['Y'] = seqs['items'].map(lambda items: items[-1])
seqs['length'] = seqs['items'].map(len)

print(f'{len(seqs)} sequences from {len(seqs.index.unique())} addresses with an average length of {seqs["items"].apply(len).mean():.2f}.')
# A sequence needs at least two items to provide a non-empty X together with its Y.
print(f'{len(seqs)} sequences, avg length {seqs["items"].apply(len).mean():.2f}, avg X {seqs["X"].apply(len).mean():.2f}.  {len(seqs[seqs["length"]>1])} valid training samples.')

65551 sequences from 65551 addresses with an average length of 11.85.
65551 sequences, avg length 11.85, avg X 10.85.  29396 valid training samples.


In [19]:
import random
import numpy as np

def read_data(path):
    '''Read the text file at ``path`` and return its lines as a list of strings.

    The whole file is read at once and split on newline characters, so a file
    that ends with a newline yields a trailing empty string.
    '''
    with open(path, 'r') as handle:
        contents = handle.read()
    return contents.split('\n')

def filter_sequence(seqs, min_length=3):
    '''Keep the sequences that contain at least ``min_length`` items.

    Each sequence is a string of items separated by single spaces. Empty tokens
    (produced by a trailing, leading or doubled space) are never counted, which
    matches how ``pad_list`` treats them, and a sequence without any empty token
    is handled normally instead of raising ``ValueError``.

    Args:
        seqs: iterable of space-separated sequence strings.
        min_length: minimum number of non-empty items a sequence must have.

    Returns:
        A list with the qualifying sequence strings, unchanged and in input order.
    '''
    filtered_seqs = []
    for val in seqs:
        item_count = sum(1 for token in val.split(" ") if token != "")
        if item_count >= min_length:
            filtered_seqs.append(val)
    return filtered_seqs

def train_test_split(data, train_ratio=0.25, seed=None):
    '''Shuffle ``data`` and split it into a train part and a test part.

    The shuffle is done on a copy, so the caller's sequence keeps its order.

    Args:
        data: sequence of samples.
        train_ratio: fraction of the samples placed in the train part; the train
            size is ``int(len(data) * train_ratio)``.
        seed: optional seed for a reproducible shuffle. When ``None`` the global
            ``random`` state is used.

    Returns:
        A ``(train, test)`` tuple of lists.
    '''
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    train_range = int(len(shuffled) * train_ratio)
    return shuffled[:train_range], shuffled[train_range:]

def merge_seq_arr(arr):
    '''Join the string items of ``arr`` into one sequence string.

    Every item is followed by a single space, so a non-empty result always ends
    with a trailing space; an empty ``arr`` gives an empty string.
    '''
    return "".join(item + " " for item in arr)
    
def split_fixed_length_sequence(data, max_length=8):
    '''Split sequences longer than ``max_length`` to increase the number of data points.

    A sequence with more than ``max_length`` elements is cut into consecutive
    slices of at most ``max_length`` elements (the last slice may be shorter);
    shorter sequences are passed through unchanged.

    Args:
        data: iterable of sliceable sequences. Lengths are measured with ``len``,
            so item lists are split per item while plain strings would be split
            per character.
        max_length: maximum number of elements per returned sequence.

    Returns:
        A list of sequences, each with at most ``max_length`` elements.

    Raises:
        ValueError: if ``max_length`` is smaller than 1.
    '''
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    pieces = []
    for seq in data:
        if len(seq) > max_length:
            pieces.extend(
                seq[start:start + max_length]
                for start in range(0, len(seq), max_length)
            )
        else:
            pieces.append(seq)
    return pieces

        
def chunks(lst, n=5):
    """Cut ``lst`` into consecutive pieces of ``n`` items and merge each piece.

    Every piece is turned into a space-separated string with ``merge_seq_arr``;
    the final piece holds the remaining items and may be shorter than ``n``.
    Returns the list of merged strings.
    """
    starts = range(0, len(lst), n)
    return [merge_seq_arr(lst[start:start + n]) for start in starts]

def extend_data_points(data):
    """Break every sequence in ``data`` into chunks and return them as one flat list.

    The chunk size is whatever ``chunks`` uses by default.
    """
    return [piece for sequence in data for piece in chunks(sequence)]

def add_padding_left(data, length):
    """Tokenise each space-separated sequence and left-pad it to ``length`` tokens.

    Returns one token list per input sequence, built with ``pad_list``.
    """
    return [pad_list(seq.split(" "), length) for seq in data]
    
def pad_list(s, n):
    """Drop empty tokens from ``s`` and left-pad the rest with 'PAD' up to ``n`` tokens.

    A list that already has ``n`` or more tokens is returned without padding
    and without truncation.
    """
    tokens = list(filter(lambda token: token != "", s))
    missing = n - len(tokens)
    # A negative count multiplies to an empty list, so long inputs get no padding.
    return ['PAD'] * missing + tokens
    


In [20]:
from IPython.display import clear_output
from gensim.models import Word2Vec
from collections import defaultdict
import numpy as np
import logging

class W2Vmodel:
    """Wrapper around a gensim ``Word2Vec`` model trained on item sequences.

    Training sequences are lower-cased and, when ``pad`` is true, prefixed with
    ``PADTOK``. The most frequent first items of the training sequences are kept
    as the default prediction for empty input.
    """

    # Token prepended to every preprocessed sequence when padding is enabled.
    PADTOK = 'PAD'

    def __init__(self, train_data, vector_size=10, window=1, sg=1, min_count=1, pad=True):
        """Train the model.

        Args:
            train_data: frame-like object whose ``'items'`` column holds non-empty
                lists of string tokens.
            vector_size, window, sg, min_count: forwarded to ``Word2Vec``.
            pad: whether ``preprocess`` prepends ``PADTOK`` to each sequence.
        """
        self.pad = pad
        # Quiet gensim before training; set afterwards, the level cannot affect
        # the messages emitted while the model is being trained.
        logging.getLogger("gensim").setLevel(logging.WARNING)
        # Lower-cased like the vocabulary, so fallback predictions use the same
        # token form as predictions coming from the model. Up to 100 are kept.
        first_items = train_data['items'].map(lambda l: l[0].lower())
        self.top_first_item = list(first_items.value_counts().index)[:100]
        self.model = Word2Vec(
            sentences=[self.preprocess(seq) for seq in train_data['items']],
            min_count=min_count,
            vector_size=vector_size,
            window=window,
            sg=sg,
        )
        clear_output()

    def preprocess(self, orig_seq):
        """Return the lower-cased tokens, prefixed with ``PADTOK`` when padding is on."""
        s = [a.lower() for a in orig_seq]
        if self.pad:
            return [self.PADTOK] + s
        else:
            return s

    def embed(self, seq):
        """Look up ``seq`` in the trained vectors as-is (no lower-casing, no padding)."""
        return self.model.wv[seq]

    def predict_next(self, seq):
        """Return ``(token, score)`` candidates for the item following ``seq``.

        Empty input, or input for which the model cannot produce a prediction,
        yields the most frequent first items, each with score 0.
        """
        fallback = [(v, 0) for v in self.top_first_item]
        if len(seq) == 0:
            return fallback
        prediction = self.model.predict_output_word(self.preprocess(seq), topn=10)
        # gensim returns None when every context token is out of vocabulary.
        return fallback if prediction is None else prediction

    def predict_nearest(self, seq):
        """Return the tokens nearest to the mean embedding of ``seq``.

        Empty input yields the most frequent first items, each with score 0.
        """
        prediction = [(v, 0) for v in self.top_first_item]
        if len(seq) > 0:
            emb = self.toks_to_embedding(seq)
            prediction = self.nearest_token(emb)
        return prediction

    def toks_to_embedding(self, toks):
        """Return the mean embedding of the preprocessed tokens."""
        # TODO: Consider ignoring PAD here
        token_embs = [self.token_embedding(t) for t in self.preprocess(toks)]
        return np.mean(token_embs, axis=0)

    def nearest_token(self, query_emb):
        """Return the model's most similar tokens for the vector ``query_emb``."""
        return self.model.wv.most_similar(query_emb)

    def token_embedding(self, token_id):
        """Return the embedding vector stored for ``token_id``."""
        return self.model.wv[token_id]



In [21]:
# Train the embedding model on every address that has at least one item.
included_data = seqs.loc[seqs['length'] > 0]

word2vec_model = W2Vmodel(
    included_data,
    vector_size=20,
    window=1,
    sg=1,
    min_count=1,
    pad=True,
)

# Fixed-size training windows: chunk the longer sequences, keep only the chunks
# that hold at least 5 items, then tokenise them into lists of 5 tokens.
# NOTE(review): chunks() yields at most 5 items per chunk, so after the min_length=5
# filter the padding step is not expected to add any 'PAD' token -- confirm intent.
filter_data = filter_sequence(
    extend_data_points(seqs.loc[seqs['length'] > 2, 'items']),
    min_length=5,
)
padded_filter_data = add_padding_left(filter_data, 5)


22279


In [23]:
# Split every 5-token window into a 4-token context (input) and the token that
# follows it (target). The raw tokens and their embeddings are both kept.
X, Y = [], []
X_org, Y_org = [], []

for seq in padded_filter_data:
    # seq[4:5] keeps the target as a one-element list, so Y has shape (n, 1, dim).
    x, y = seq[0:4], seq[4:5]
    X_org.append(x)
    Y_org.append(y)
    X.append(word2vec_model.embed(x))
    Y.append(word2vec_model.embed(y))

X, Y = np.array(X), np.array(Y)
X_org, Y_org = np.array(X_org), np.array(Y_org)

print(X.shape, Y.shape)

# The saves below fail when the target directory is missing, so create it first.
os.makedirs('data', exist_ok=True)
# np.save appends '.npy' to names that lack it, so these land in e.g. data/X.data.npy.
np.save('data/X.data', X)
np.save('data/y.data', Y)
np.save('data/X_org.data', X_org)
np.save('data/y_org.data', Y_org)
word2vec_model.model.save("data/word2vec.model")


(135597, 4, 20) (135597, 1, 20)
