In [1]:
import numpy as np
import pandas as pd

In [2]:
transactions_0_4999 = pd.read_csv('transactions_0_4999.csv')

In [3]:
transactions_5000_9999 = pd.read_csv('transactions_5000_9999.csv')

In [4]:
transactions_df = transactions_0_4999.append([transactions_5000_9999])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [5]:
transactions_df[['amount_ethereum', 'amount_dollars']] = transactions_df['Amount'].str.split(' ', expand = True)

In [6]:
transactions_df

Unnamed: 0.1,Amount,From,To,Txn,Type,Unnamed: 0,punk_id,amount_ethereum,amount_dollars
0,250Ξ ($1.03M),0x983ace,,"May 10, 2021",Bid Withdrawn,0,0.0,250Ξ,($1.03M)
1,"250Ξ ($537,615)",0x983ace,,"Apr 11, 2021",Bid,1,0.0,250Ξ,"($537,615)"
2,"100Ξ ($160,973)",0xd7510a,,"Mar 03, 2021",Bid Withdrawn,2,0.0,100Ξ,"($160,973)"
3,"100Ξ ($188,897)",0xd7510a,,"Feb 20, 2021",Bid,3,0.0,100Ξ,"($188,897)"
4,"69Ξ ($25,532)",natealex,,"Sep 16, 2020",Bid Withdrawn,4,0.0,69Ξ,"($25,532)"
...,...,...,...,...,...,...,...,...,...
64595,,,0xba7ccc,"Jun 23, 2017",Claimed,64595,9984.0,,
64596,,0xba7ccc,0xcffc33,"May 05, 2021",Transfer,64596,9985.0,,
64597,0.10Ξ ($9),0x95,,"Dec 11, 2018",Bid Withdrawn,64597,9985.0,0.10Ξ,($9)
64598,0.10Ξ ($12),0x95,,"Nov 28, 2018",Bid,64598,9985.0,0.10Ξ,($12)


### Exploring values

In [7]:
transactions_df.Type.value_counts()

Offered            44368
Bid                21727
Sold               14163
Bid Withdrawn      12598
Transfer           10901
Claimed             9705
Offer Withdrawn     6642
(Wrap)              2294
(Unwrap)            2070
Name: Type, dtype: int64

In [8]:
transactions_df.isna().sum()

Amount             31612
From               60715
To                 84886
Txn                    0
Type                   0
Unnamed: 0             0
punk_id                0
amount_ethereum    31612
amount_dollars     31612
dtype: int64

##### Dropping NA's

In [9]:
transactions_df = transactions_df[transactions_df['Amount'].notna()]

In [10]:
len(transactions_df)

92856

In [11]:
transactions_df.Type.value_counts()

Offered          44368
Bid              21727
Sold             14163
Bid Withdrawn    12598
Name: Type, dtype: int64

In [12]:
transactions_df.drop(columns=['Unnamed: 0'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [13]:
values_to_replace_dollars = ['$', '(', ')', ',', '<']
values_to_replace_ethereum = ['Ξ', ',', '<']

for x in values_to_replace_dollars:
    transactions_df['amount_dollars'] = transactions_df['amount_dollars'].str.replace(x,'')
    
for x in values_to_replace_ethereum:
    transactions_df['amount_ethereum'] = transactions_df['amount_ethereum'].str.replace(x,'')
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


##### Total rows with conversion Letter

In [14]:
transactions_with_letter = transactions_df[transactions_df['amount_dollars'].str.contains('[A-Za-z]')]
len(transactions_with_letter)

560

In [15]:
print('transactions with recorded conversion in dollars')
len(transactions_with_letter[transactions_with_letter['amount_dollars'].str.contains('[MTBK]')])


transactions with recorded conversion in dollars


529

In [16]:
print('transactions with recorded conversion in ethereum')
len(transactions_with_letter[transactions_with_letter['amount_ethereum'].str.contains('[MTBK]')])

transactions with recorded conversion in ethereum


230

##### Transactions with recorded conversion (K, M, B, T)

In [17]:
not_included_magnitudes = transactions_with_letter[transactions_with_letter['amount_dollars'].str.contains('[ZYP]')]
len(not_included_magnitudes)

31

In [18]:
not_included_magnitudes = transactions_with_letter[transactions_with_letter['amount_ethereum'].str.contains('[ZYP]')]
len(not_included_magnitudes)

30

##### Removing rows with really large amounts

In [19]:
transactions_df = transactions_df[~transactions_df['amount_dollars'].str.contains('[ZYP]')]

In [20]:
transactions_df = transactions_df[~transactions_df['amount_ethereum'].str.contains('[ZYP]')]

In [21]:
len(transactions_df)

92823

### Exploring values

In [22]:
transactions_df.Type.value_counts()

Offered          44335
Bid              21727
Sold             14163
Bid Withdrawn    12598
Name: Type, dtype: int64

### Split into 3 tables

### Converting Billions, Millions and Thousands

In [23]:
multipliers = {'K':1000, 'M':1000000, 'B':1000000000, 'T':1000000000000}

def string_to_int(string):
    if string[-1].isdigit(): # check if no suffix
        return float(string)
    mult = multipliers[string[-1]] # look up suffix to get multiplier
     # convert number to float, multiply by multiplier, then make int
    return int(float(string[:-1]) * mult)

testvals = ["150M", "360M", "2.6B", "3.7B"]

print(list(map(string_to_int, testvals)))

[150000000, 360000000, 2600000000, 3700000000]


In [24]:
transactions_df['amount_dollars'] = transactions_df['amount_dollars'].map(string_to_int)

In [25]:
transactions_df['amount_ethereum'] = transactions_df['amount_ethereum'].map(string_to_int)

In [26]:
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92823 entries, 0 to 64598
Data columns (total 8 columns):
Amount             92823 non-null object
From               48488 non-null object
To                 14592 non-null object
Txn                92823 non-null object
Type               92823 non-null object
punk_id            92823 non-null float64
amount_ethereum    92823 non-null float64
amount_dollars     92823 non-null float64
dtypes: float64(3), object(5)
memory usage: 6.4+ MB


In [28]:
len(transactions_df)

92823

In [29]:
transactions_df.amount_ethereum = transactions_df.amount_ethereum.astype('float')
transactions_df.amount_dollars = transactions_df.amount_dollars.astype('float')
transactions_df.transaction_date = transactions_df.Txn.astype('datetime64')

  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
transactions_df.to_csv('transactions.csv')

### Splitting into 3 tables

##### Sales Table

In [32]:
sales = transactions_df[(transactions_df['Type']=='Sold')]
len(sales)

14163

In [33]:
sales

Unnamed: 0,Amount,From,To,Txn,Type,punk_id,amount_ethereum,amount_dollars
16,"25Ξ ($2,822)",0xf5099e,0xe08c32,"Nov 30, 2018",Sold,0.0,25.00,2822.0
24,1.60Ξ ($386),0x00d7c9,0xa0a59c,"Jul 07, 2017",Sold,0.0,1.60,386.0
31,0.98Ξ ($320),0xc352b5,0x00d7c9,"Jun 23, 2017",Sold,0.0,0.98,320.0
36,"60Ξ ($36,305)",EliteCat…,0xcf6165,"Nov 30, 2020",Sold,1.0,60.00,36305.0
62,"31Ξ ($5,155)",0xf5099e,GoWest23,"Apr 06, 2019",Sold,1.0,31.00,5155.0
...,...,...,...,...,...,...,...,...
64562,"6.20Ξ ($7,744)",niwin,0xb166dd,"Jan 27, 2021",Sold,9979.0,6.20,7744.0
64565,"5Ξ ($1,824)",pastelcr…,niwin,"Sep 28, 2020",Sold,9979.0,5.00,1824.0
64569,1Ξ ($214),shilpixe…,pastelcr…,"May 20, 2020",Sold,9979.0,1.00,214.0
64576,"29Ξ ($68,110)",Kenney,0xaa614b,"Jul 04, 2021",Sold,9980.0,29.00,68110.0


In [34]:
transactions_df.to_csv('sales.csv')

##### Bids

In [36]:
bids = transactions_df[(transactions_df['Type']=='Bid') | (transactions_df['Type']=='Bid Withdrawn')]
len(bids)

34325

In [37]:
bids

Unnamed: 0,Amount,From,To,Txn,Type,punk_id,amount_ethereum,amount_dollars
0,250Ξ ($1.03M),0x983ace,,"May 10, 2021",Bid Withdrawn,0.0,250.0,1030000.0
1,"250Ξ ($537,615)",0x983ace,,"Apr 11, 2021",Bid,0.0,250.0,537615.0
2,"100Ξ ($160,973)",0xd7510a,,"Mar 03, 2021",Bid Withdrawn,0.0,100.0,160973.0
3,"100Ξ ($188,897)",0xd7510a,,"Feb 20, 2021",Bid,0.0,100.0,188897.0
4,"69Ξ ($25,532)",natealex,,"Sep 16, 2020",Bid Withdrawn,0.0,69.0,25532.0
...,...,...,...,...,...,...,...,...
64579,0.10Ξ ($64),Kenney,,"Apr 23, 2018",Bid,9980.0,0.1,64.0
64593,0.10Ξ ($21),0x6611fe,,"Jul 10, 2017",Bid Withdrawn,9984.0,0.1,21.0
64594,0.10Ξ ($28),0x6611fe,,"Jul 02, 2017",Bid,9984.0,0.1,28.0
64597,0.10Ξ ($9),0x95,,"Dec 11, 2018",Bid Withdrawn,9985.0,0.1,9.0


In [38]:
transactions_df.to_csv('bids.csv')

##### Offers

In [39]:
offers = transactions_df[(transactions_df['Type']=='Offered')]
len(offers)

44335

In [40]:
offers

Unnamed: 0,Amount,From,To,Txn,Type,punk_id,amount_ethereum,amount_dollars
18,"100Ξ ($18,303)",,,"Sep 12, 2018",Offered,0.0,100.00,18303.0
19,"200Ξ ($138,566)",,,"Dec 16, 2017",Offered,0.0,200.00,138566.0
22,"400Ξ ($96,376)",,,"Jul 07, 2017",Offered,0.0,400.00,96376.0
38,"79.50Ξ ($46,842)",,,"Nov 25, 2020",Offered,1.0,79.50,46842.0
41,"119.10Ξ ($65,980)",,,"Nov 22, 2020",Offered,1.0,119.10,65980.0
...,...,...,...,...,...,...,...,...
64571,1Ξ ($187),,,"Apr 25, 2020",Offered,9979.0,1.00,187.0
64573,3.14Ξ ($797),,,"Jun 26, 2017",Offered,9979.0,3.14,797.0
64575,"32Ξ ($58,248)",,,"Jul 19, 2021",Offered,9980.0,32.00,58248.0
64586,"99.99Ξ ($152,787)",,,"Mar 05, 2021",Offered,9982.0,99.99,152787.0


In [41]:
transactions_df.to_csv('offers.csv')