In [1]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import re

%matplotlib notebook

In [31]:
# Load all data from folder
import os

# Location of files
folder = 'price/'
tail = '.csv'
filenames = os.listdir(folder)
    
# Get the supplied coin name
coins = list(map(lambda x: re.sub(tail, '', x).upper(), filenames))

# Load data
df_list = []
for filename in filenames:
    df_list.append(pd.read_csv(folder + filename, delimiter='\t', index_col='Date'))
    # input(filename + ' loaded.\nEnter to proceed')    # Debug for file read errors.


# Clear up each list
for index, df in enumerate(df_list):
    # Drop all columns except Date and Market Cap 
    df = df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1)
    # Convert Date column datatype to Date
    df.index = pd.to_datetime(df.index)    
    
    # Convert Market Cap column datatype to numeric
    df['Market Cap'] = df['Market Cap'].str.replace(',', '')    # Remove commas
    df = df[df['Market Cap'].str.contains('-') == False]        # Delete missing data
    df['Market Cap'] = pd.to_numeric(df['Market Cap'])
    
    # Replace previous dataframe
    df_list[index] = df 
    
del os

<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, Dec 09, 2017 to Jul 23, 2017
Data columns (total 6 columns):
Open          140 non-null float64
High          140 non-null float64
Low           140 non-null float64
Close         140 non-null float64
Volume        140 non-null object
Market Cap    140 non-null object
dtypes: float64(4), object(2)
memory usage: 7.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 1687 entries, Dec 09, 2017 to Apr 28, 2013
Data columns (total 6 columns):
Open          1687 non-null float64
High          1687 non-null float64
Low           1687 non-null float64
Close         1687 non-null float64
Volume        1687 non-null object
Market Cap    1687 non-null object
dtypes: float64(4), object(2)
memory usage: 92.3+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 1397 entries, Dec 11, 2017 to Feb 14, 2014
Data columns (total 6 columns):
Open          1397 non-null float64
High          1397 non-null float64
Low           1397 non-null float64
Close 

In [29]:
# Print df info
for index, df in enumerate(df_list):
    print('[' + str(index) + '] Ticker: ' + coins[index])
    df.info()
    print('--'*25)
    print(df.head())
    print('><'*25)

[0] Ticker: BCH
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 130 entries, 2017-12-09 to 2017-08-02
Data columns (total 1 columns):
Market Cap    130 non-null int64
dtypes: int64(1)
memory usage: 2.0 KB
--------------------------------------------------
             Market Cap
Date                   
2017-12-09  24671600000
2017-12-08  22386700000
2017-12-07  24095900000
2017-12-06  25234400000
2017-12-05  26606900000
><><><><><><><><><><><><><><><><><><><><><><><><><
[1] Ticker: BTC
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1687 entries, 2017-12-09 to 2013-04-28
Data columns (total 1 columns):
Market Cap    1687 non-null int64
dtypes: int64(1)
memory usage: 26.4 KB
--------------------------------------------------
              Market Cap
Date                    
2017-12-09  276415000000
2017-12-08  297787000000
2017-12-07  238600000000
2017-12-06  199390000000
2017-12-05  195389000000
><><><><><><><><><><><><><><><><><><><><><><><><><
[2] Ticker: DASH
<class 'panda

In [25]:
len_list = [len(df) for df in df_list]
coin_index = len_list.index(max(len_list))    # Find index of coin with oldest data

# Create new data frame with coins as columns
cc_df = pd.DataFrame([], columns=coins, index=df_list[coin_index].index)
for i in range(0, len(len_list)):
    cc_df[coins[i]] = df_list[i]['Market Cap']

In [27]:
cc_df

Unnamed: 0_level_0,BCH,BTC,ETH,IOTA,LTC,XEM,XMR,XRP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-12-09,2.467160e+10,276415000000,4.400560e+10,1.402960e+10,6910360000,6.157700e+09,4.283550e+09,9.768540e+09
2017-12-08,2.238670e+10,297787000000,4.184580e+10,1.151280e+10,5327910000,2.115340e+09,4.346000e+09,8.663460e+09
2017-12-07,2.409590e+10,238600000000,4.100760e+10,1.241620e+10,5415710000,2.442770e+09,4.291850e+09,9.011630e+09
2017-12-06,2.523440e+10,199390000000,4.448310e+10,1.059360e+10,5548860000,2.697300e+09,4.140770e+09,9.507190e+09
2017-12-05,2.660690e+10,195389000000,4.521280e+10,7.778170e+09,5644390000,2.573250e+09,3.258360e+09,9.815990e+09
2017-12-04,2.626730e+10,189172000000,4.479570e+10,5.265070e+09,5486740000,2.494140e+09,3.079850e+09,9.768480e+09
2017-12-03,2.416360e+10,185258000000,4.456050e+10,4.004200e+09,5420380000,2.266360e+09,3.137680e+09,9.869310e+09
2017-12-02,2.461050e+10,183490000000,4.485330e+10,3.948800e+09,5351330000,2.159190e+09,2.943520e+09,9.874190e+09
2017-12-01,2.325710e+10,170436000000,4.276500e+10,3.639440e+09,4758600000,2.029370e+09,2.771380e+09,9.669290e+09
2017-11-30,2.323040e+10,165537000000,4.141190e+10,3.540700e+09,4662340000,1.965590e+09,2.627490e+09,9.517940e+09


In [4]:
# Group by group_period
group_period = '7D'
months = df_list[coin_index].resample(group_period).mean()
months_df = [df.resample(group_period).mean() for df in df_list]

In [38]:
# Display 
import sys
for i in range(3,30):
    out_str = 'Parsing' + '.'*(3%i)
    sys.stdout.write(out_str)
    sys.stdout.flush()

ParsingParsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...Parsing...

In [5]:
# Create new df with coins as columns
len_list = [len(df) for df in df_list]
coin_index = len_list.index(max(len_list))

cc_df = pd.DataFrame([], columns=coins, index=months_df[coin_index].index)

for index, df in enumerate(months_df):
    cc_df[coins[index]] = df['Market Cap']

In [6]:
# Load total market cap data
market_cap_df = pd.read_csv('data/total_market_cap.csv', index_col='Date')
market_cap_df.index = pd.to_datetime(market_cap_df.index)
cap_resampled_df = market_cap_df.resample(group_period).mean()

# Add total market cap to cc_df as Total
cc_df['Total'] = cap_resampled_df['Total Market Cap']

In [7]:
# Reverse data, newest first
cc_df = cc_df[::-1]

In [None]:
# Plot data for Dec 2017, 0th row      ## TODO SEVEN DAYS
from matplotlib.dates import *

row_data = cc_df.iloc[0][coins]
row_date = row_data.name.strftime('%b-%Y')

ax = sns.barplot(coins, row_data)
#plt.title('Market Cap')
#plt.show()

In [8]:
cc_df

Unnamed: 0_level_0,BCH,BTC,ETH,IOTA,LTC,XEM,XMR,XRP,Total
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-12-03,,2.260016e+11,,,5.679193e+09,,,9.486371e+09,333819550755
2017-11-26,,1.649920e+11,,,4.908079e+09,,,9.962297e+09,283044222782
2017-11-19,,1.350674e+11,,,3.901186e+09,,,9.137703e+09,233808481293
2017-11-12,,1.151036e+11,,,3.441806e+09,,,8.184229e+09,200964455665
2017-11-05,,1.195684e+11,,,3.163411e+09,,,8.053820e+09,199892277706
2017-10-29,,1.082057e+11,,,2.972110e+09,,,7.794223e+09,170353228043
2017-10-22,,9.734221e+10,,,3.008546e+09,,,7.859583e+09,171118712167
2017-10-15,,9.540121e+10,,,3.320006e+09,,,9.092029e+09,174076873613
2017-10-08,,8.200486e+10,,,2.874123e+09,,,9.950659e+09,150083109278
2017-10-01,,7.207339e+10,,,2.816950e+09,,,8.210754e+09,146923833358


In [None]:
from matplotlib.dates import DateFormatter

ax = cc_df.plot(x=cc_df.index, y=coins, kind='bar', figsize=(8,4))
plt.ylabel('Market Cap')
plt.xlabel('Date')

drange = pd.date_range(min(cc_df.index), max(cc_df.index) + pd.offsets.MonthEnd(), freq='M')
ax.get_xaxis().set_ticks(drange)
ax.get_xaxis().set_major_formatter(DateFormatter('%Y-%b'))
ax.xaxis_date()
ax.autoscale()
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

In [None]:
# Plot Benford's Law
def benfords_prob(d):
    return np.log10(1+1/d)

benfords_10 = [benfords_prob(i) for i in range(1, 11)]

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
sns.barplot([x for x in range(1,11)], benfords_10, palette='Blues_r')

# Add percentage text to top of bar
for r in ax.patches:
    height = r.get_height()
    ax.text(r.get_x()+r.get_width()/2.,    # x-pos of text
            height + 0.01,                 # y-pos of text
            '{:3.1f}%'.format(height*100), # text string
            ha="center") 
    
# Change y limits and display as percentages
ax.set_ylim(0, 0.35)  
vals = ax.get_yticks()
ax.set_yticklabels(['{:2.0f}%'.format(x*100) for x in vals])
ax.autoscale_view()

plt.title('Benford Distribution')
plt.show()