In [1]:
## Import data prep dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler

## Import modeling dependencies
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

### Read in our data

In [2]:
## Create the relative file path to the raw dataset (Data/crypto_data.csv)
filePath = os.path.join("Data", "crypto_data.csv")

## Load the CSV into a DataFrame, then list the distinct values of IsTrading
startingDF = pd.read_csv(filepath_or_buffer=filePath)
startingDF.loc[:, "IsTrading"].unique()

array([ True, False])

### Preparing Data

In [3]:
## Discard all cryptocurrencies that are not being traded.
## IsTrading is read in as a real boolean column (its unique values are True/False,
## not strings), so the column itself serves as the row mask -- no `== True` needed.
isTraded = startingDF[startingDF["IsTrading"]]
isTraded.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [4]:
## Remove the IsTrading column and preview the result
onlyTraded = isTraded.drop(columns=["IsTrading"])
onlyTraded.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [5]:
## Drop every row that contains at least one null value.
## Per-column non-null counts are printed before and after so the effect is visible.
droppedNaNs = onlyTraded.dropna(axis=0, how="any")
print(onlyTraded.count())
print("\nTotals with nulls dropped", droppedNaNs.count(), sep="\n")

Unnamed: 0         1144
CoinName           1144
Algorithm          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

Totals with nulls dropped
Unnamed: 0         685
CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64


In [6]:
## Keep only cryptocurrencies that have been mined (TotalCoinsMined strictly above 0).
## Per-column counts are printed before and after the filter for comparison.
minedCoins = droppedNaNs.loc[droppedNaNs["TotalCoinsMined"] > 0]
print(droppedNaNs.count())
print("\nTotals with mined coins > 0", minedCoins.count(), sep="\n")

Unnamed: 0         685
CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64

Totals with mined coins > 0
Unnamed: 0         532
CoinName           532
Algorithm          532
ProofType          532
TotalCoinsMined    532
TotalCoinSupply    532
dtype: int64


##### In order for your dataset to be comprehensible to a machine learning algorithm, 
##### its data should be numeric. Since the coin names do not contribute to the analysis of the data, 
##### delete the `CoinName` column from the original dataframe.

In [7]:
## The instructions require dropping CoinName, so it goes.
## (Open question: how will we identify the coins afterwards?)
## The "Unnamed: 0" column holds ticker names, which are not numeric either,
## so both columns are removed in a single drop.
noNames = minedCoins.drop(columns=["CoinName", "Unnamed: 0"])
noNames.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
5,X13,PoW/PoS,29279420000.0,314159265359
7,SHA-256,PoW,17927180.0,21000000
8,Ethash,PoW,107684200.0,0


In [20]:
## TotalCoinSupply has to be a real number before the dummies are built,
## so cast it to float. The dtypes are printed before and after the cast
## to confirm the change.
print(noNames.dtypes)
noNames["TotalCoinSupply"] = noNames["TotalCoinSupply"].astype(float)
print("\nColumns with corrected data types:", noNames.dtypes, sep="\n")

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

Columns with corrected data types:
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object


#### Make dummies.

In [23]:
## One-hot encode the two categorical text columns and check the resulting columns.
## The columns are named explicitly instead of relying on get_dummies' default of
## encoding every object-dtype column: if TotalCoinSupply were still stored as text
## (e.g. the conversion cell was skipped or run out of order), the default would
## silently expand it into one dummy column per distinct supply value.
dummyCoins = pd.get_dummies(noNames, columns=["Algorithm", "ProofType"])
print(dummyCoins.columns)
dummyCoins.head()

Index(['TotalCoinsMined', 'TotalCoinSupply',
       'Algorithm_1GB AES Pattern Search', 'Algorithm_536',
       'Algorithm_Argon2d', 'Algorithm_BLAKE256', 'Algorithm_Blake',
       'Algorithm_Blake2S', 'Algorithm_Blake2b', 'Algorithm_C11',
       'Algorithm_Cloverhash', 'Algorithm_Counterparty',
       'Algorithm_CryptoNight', 'Algorithm_CryptoNight Heavy',
       'Algorithm_CryptoNight-V7', 'Algorithm_Cryptonight-GPU',
       'Algorithm_DPoS', 'Algorithm_Dagger', 'Algorithm_Dagger-Hashimoto',
       'Algorithm_ECC 256K1', 'Algorithm_Equihash',
       'Algorithm_Equihash+Scrypt', 'Algorithm_Ethash', 'Algorithm_Exosis',
       'Algorithm_Green Protocol', 'Algorithm_Groestl', 'Algorithm_HMQ1725',
       'Algorithm_HybridScryptHash256', 'Algorithm_IMesh',
       'Algorithm_Jump Consistent Hash', 'Algorithm_Keccak',
       'Algorithm_Leased POS', 'Algorithm_Lyra2RE', 'Algorithm_Lyra2REv2',
       'Algorithm_Lyra2Z', 'Algorithm_M7 POW', 'Algorithm_Multiple',
       'Algorithm_NIST5', 'Algor

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Now we scale the data

In [24]:
## Instantiate the scaler
scaler = StandardScaler()

## Fit the scaler on the dummy-encoded frame, then transform that same frame
scaler.fit(dummyCoins)
scaledData = scaler.transform(dummyCoins)

## Display the scaled array
scaledData

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

(532, 98)