Loading Data

In [2]:
import pandas as pd
import numpy as np
import plotly_express as px
import requests
from datasist.structdata import detect_outliers
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import time

In [None]:
df = pd.read_csv('./crypto_tradinds.csv', low_memory=False)
df

In [None]:
msk = df['crypto_name'] == 'Bitcoin'
df = df[msk]

Exploring Data

In [None]:
df.info()

Fixing the Dtype of trade_date column

In [157]:
df['trade_date'] = pd.to_datetime(df['trade_date'])

In [None]:
def GDP(date):
    url = f"https://api.stlouisfed.org/fred/series/observations?series_id=GDP&observation_start={date.strftime('%Y-%m-%d')}&observation_end={date.strftime('%Y-%m-%d')}&api_key=ac97eb558e2eb7f3d76998aef67e159f"
    time.sleep(1)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return float(response.text[response.text.find("value=") + 7:response.text.find("/>\n</observations>") - 1])
        else:
            print('Failed to get data. Status code:', response.status_code)
            return np.nan
    except Exception as e:
        print('An error Occured:', e)
        return np.nan

In [None]:
def Unemployment_Rate(date):
    url = f"https://api.stlouisfed.org/fred/series/observations?series_id=UNRATE&observation_start={date.strftime('%Y-%m-%d')}&observation_end={date.strftime('%Y-%m-%d')}&api_key=ac97eb558e2eb7f3d76998aef67e159f"
    time.sleep(1)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return float(response.text[response.text.find("value=") + 7:response.text.find("/>\n</observations>") - 1])
        else:
            print('Failed to get data. Status code:', response.status_code)
            return np.nan
    except Exception as e:
        print('An error Occured:', e)
        return np.nan

In [None]:
def CPI(date):
    url = f"https://api.stlouisfed.org/fred/series/observations?series_id=CPIAUCSL&observation_start={date.strftime('%Y-%m-%d')}&observation_end={date.strftime('%Y-%m-%d')}&api_key=ac97eb558e2eb7f3d76998aef67e159f"
    time.sleep(1)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return float(response.text[response.text.find("value=") + 7:response.text.find("/>\n</observations>") - 1])
        else:
            print('Failed to get data. Status code:', response.status_code)
            return np.nan
    except Exception as e:
        print('An error Occured:', e)
        return np.nan

In [None]:
def PPI(date):
    url = f"https://api.stlouisfed.org/fred/series/observations?series_id=PPIACO&observation_start={date.strftime('%Y-%m-%d')}&observation_end={date.strftime('%Y-%m-%d')}&api_key=ac97eb558e2eb7f3d76998aef67e159f"
    time.sleep(1)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return float(response.text[response.text.find("value=") + 7:response.text.find("/>\n</observations>") - 1])
        else:
            print('Failed to get data. Status code:', response.status_code)
            return np.nan
    except Exception as e:
        print('An error Occured:', e)
        return np.nan

In [None]:
def CCI(date):
    url = f"https://api.stlouisfed.org/fred/series/observations?series_id=UMCSENT&observation_start={date.strftime('%Y-%m-%d')}&observation_end={date.strftime('%Y-%m-%d')}&api_key=ac97eb558e2eb7f3d76998aef67e159f"
    time.sleep(1)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return float(response.text[response.text.find("value=") + 7:response.text.find("/>\n</observations>") - 1])
        else:
            print('Failed to get data. Status code:', response.status_code)
            return np.nan
    except Exception as e:
        print('An error Occured:', e)
        return np.nan

In [None]:
df['Unemployment_Rate'] = df['trade_date'].apply(Unemployment_Rate)

In [None]:
df['CPI'] = df['trade_date'].apply(CPI)

In [None]:
df['CCI'] = df['trade_date'].apply(CCI)

In [None]:
df['PPI'] = df['trade_date'].apply(PPI)

In [None]:
df['GDP'] = df['trade_date'].apply(GDP)

Saving Updated Data

In [None]:
df.to_csv('Updated.csv', index=False)

Loading Data

In [5]:
df = pd.read_csv('Updated.csv')

Exploring Data

In [3]:
df.head(5)

Unnamed: 0,trade_date,volume,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,BTC_price_change_1_day,crypto_name,crypto_type,...,site_url,github_url,minable,platform_name,industry_name,Unemployment_Rate,CPI,CCI,PPI,GDP
0,2016-01-01,36278900.0,434.33,1.0,6529300000.0,0.0,0.0,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
1,2016-01-02,30096600.0,433.44,1.0,6517390000.0,-0.001824,-0.002049,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
2,2016-01-03,39633800.0,430.01,1.0,6467430000.0,-0.007666,-0.007913,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
3,2016-01-04,38477500.0,433.09,1.0,6515713000.0,0.007466,0.007163,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
4,2016-01-05,34522600.0,431.96,1.0,6500393000.0,-0.002351,-0.002609,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746 entries, 0 to 1745
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   trade_date                   1746 non-null   object 
 1   volume                       1746 non-null   float64
 2   price_usd                    1746 non-null   float64
 3   price_btc                    1746 non-null   float64
 4   market_cap                   1746 non-null   float64
 5   capitalization_change_1_day  1746 non-null   float64
 6   USD_price_change_1_day       1746 non-null   float64
 7   BTC_price_change_1_day       1746 non-null   float64
 8   crypto_name                  1746 non-null   object 
 9   crypto_type                  1746 non-null   float64
 10  ticker                       1746 non-null   object 
 11  max_supply                   1746 non-null   float64
 12  site_url                     1746 non-null   object 
 13  github_url        

In [25]:
df.describe()

Unnamed: 0,volume,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,BTC_price_change_1_day,crypto_type,max_supply,minable,Unemployment_Rate,CPI,CCI,PPI,GDP
count,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1743.0,1717.0
mean,10411890000.0,5631.871529,0.999427,98822150000.0,0.002966,0.002847,0.0,0.0,21000000.0,1.0,4.916667,249.671468,93.409794,194.840275,20301.669049
std,12507780000.0,4055.940993,0.023932,72384760000.0,0.039423,0.039418,0.0,0.0,0.0,0.0,2.252823,6.820132,7.433065,6.39352,1092.713888
min,28514000.0,364.33,0.0,5496598000.0,-0.371631,-0.371695,0.0,0.0,21000000.0,1.0,3.5,237.336,71.8,181.3,18525.933
25%,277701200.0,1126.8975,1.0,18319710000.0,-0.011063,-0.011156,0.0,0.0,21000000.0,1.0,3.8,244.004,91.2,190.7,19280.084
50%,5004655000.0,6252.71,1.0,108355600000.0,0.002218,0.002065,0.0,0.0,21000000.0,1.0,4.3,250.792,96.2,195.5,20328.553
75%,17266850000.0,8833.975,1.0,157790800000.0,0.017371,0.017315,0.0,0.0,21000000.0,1.0,4.8,255.848,98.3,199.3,21384.775
max,74156770000.0,19497.4,1.0,326502500000.0,0.252614,0.252472,0.0,0.0,21000000.0,1.0,14.8,260.895,101.4,204.6,22024.502


Fixing the Dtype of trade_date column

In [6]:
df['trade_date'] = pd.to_datetime(df['trade_date'])

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746 entries, 0 to 1745
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   trade_date                   1746 non-null   datetime64[ns]
 1   volume                       1746 non-null   float64       
 2   price_usd                    1746 non-null   float64       
 3   price_btc                    1746 non-null   float64       
 4   market_cap                   1746 non-null   float64       
 5   capitalization_change_1_day  1746 non-null   float64       
 6   USD_price_change_1_day       1746 non-null   float64       
 7   BTC_price_change_1_day       1746 non-null   float64       
 8   crypto_name                  1746 non-null   object        
 9   crypto_type                  1746 non-null   float64       
 10  ticker                       1746 non-null   object        
 11  max_supply                   1746 non-null 

Dealling with NULL Values

In [7]:
df.sort_values(by='trade_date')

Unnamed: 0,trade_date,volume,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,BTC_price_change_1_day,crypto_name,crypto_type,...,site_url,github_url,minable,platform_name,industry_name,Unemployment_Rate,CPI,CCI,PPI,GDP
0,2016-01-01,3.627890e+07,434.33,1.0,6.529300e+09,0.000000,0.000000,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
1,2016-01-02,3.009660e+07,433.44,1.0,6.517390e+09,-0.001824,-0.002049,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
2,2016-01-03,3.963380e+07,430.01,1.0,6.467430e+09,-0.007666,-0.007913,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
3,2016-01-04,3.847750e+07,433.09,1.0,6.515713e+09,0.007466,0.007163,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
4,2016-01-05,3.452260e+07,431.96,1.0,6.500393e+09,-0.002351,-0.002609,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1741,2020-10-29,5.649950e+10,13437.88,1.0,2.489953e+11,0.012588,0.012553,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502
1742,2020-10-30,3.058149e+10,13546.52,1.0,2.510182e+11,0.008124,0.008085,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502
1743,2020-10-31,3.030646e+10,13780.99,1.0,2.553721e+11,0.017345,0.017309,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502
1744,2020-11-01,2.445386e+10,13737.11,1.0,2.545698e+11,-0.003142,-0.003184,0.0,Bitcoin,0.0,...,https://bitcoin.org/,https://github.com/bitcoin/,1.0,XRP,Proof of Work (PoW),6.7,260.895,76.9,198.3,22024.502


Calculating NULL Percentage

In [8]:
df.isnull().mean()*100

trade_date                     0.000000
volume                         0.000000
price_usd                      0.000000
price_btc                      0.000000
market_cap                     0.000000
capitalization_change_1_day    0.000000
USD_price_change_1_day         0.000000
BTC_price_change_1_day         0.000000
crypto_name                    0.000000
crypto_type                    0.000000
ticker                         0.000000
max_supply                     0.000000
site_url                       0.000000
github_url                     0.000000
minable                        0.000000
platform_name                  0.000000
industry_name                  0.000000
Unemployment_Rate              0.000000
CPI                            0.000000
CCI                            0.000000
PPI                            0.171821
GDP                            1.660939
dtype: float64

Using KNN imputer to handel Missing Values

In [9]:
knn_imputer = KNNImputer(n_neighbors = 5)
df[['PPI', 'GDP']] = knn_imputer.fit_transform(df[['PPI', 'GDP']])

In [6]:
df.isnull().mean()*100

trade_date                     0.0
volume                         0.0
price_usd                      0.0
price_btc                      0.0
market_cap                     0.0
capitalization_change_1_day    0.0
USD_price_change_1_day         0.0
BTC_price_change_1_day         0.0
crypto_name                    0.0
crypto_type                    0.0
ticker                         0.0
max_supply                     0.0
site_url                       0.0
github_url                     0.0
minable                        0.0
platform_name                  0.0
industry_name                  0.0
Unemployment_Rate              0.0
CPI                            0.0
CCI                            0.0
PPI                            0.0
GDP                            0.0
dtype: float64

In [148]:
df.describe()

Unnamed: 0,volume,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,BTC_price_change_1_day,crypto_type,max_supply,minable,Unemployment_Rate,CPI,CCI,PPI,GDP
count,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0
mean,10411890000.0,5631.871529,0.999427,98822150000.0,0.002966,0.002847,0.0,0.0,21000000.0,1.0,4.916667,249.671468,93.409794,194.843471,20313.444935
std,12507780000.0,4055.940993,0.023932,72384760000.0,0.039423,0.039418,0.0,0.0,0.0,0.0,2.252823,6.820132,7.433065,6.388487,1094.833334
min,28514000.0,364.33,0.0,5496598000.0,-0.371631,-0.371695,0.0,0.0,21000000.0,1.0,3.5,237.336,71.8,181.3,18525.933
25%,277701200.0,1126.8975,1.0,18319710000.0,-0.011063,-0.011156,0.0,0.0,21000000.0,1.0,3.8,244.004,91.2,190.7,19280.084
50%,5004655000.0,6252.71,1.0,108355600000.0,0.002218,0.002065,0.0,0.0,21000000.0,1.0,4.3,250.792,96.2,195.5,20328.553
75%,17266850000.0,8833.975,1.0,157790800000.0,0.017371,0.017315,0.0,0.0,21000000.0,1.0,4.8,255.848,98.3,199.3,21384.775
max,74156770000.0,19497.4,1.0,326502500000.0,0.252614,0.252472,0.0,0.0,21000000.0,1.0,14.8,260.895,101.4,204.6,22024.502


Drop insignificant columns

In [10]:
df.drop(['BTC_price_change_1_day', 'crypto_type', 'minable', 'site_url', 'github_url'], axis=1, inplace=True)

In [11]:
df.drop_duplicates()

Unnamed: 0,trade_date,volume,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,crypto_name,ticker,max_supply,platform_name,industry_name,Unemployment_Rate,CPI,CCI,PPI,GDP
0,2016-01-01,3.627890e+07,434.33,1.0,6.529300e+09,0.000000,0.000000,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
1,2016-01-02,3.009660e+07,433.44,1.0,6.517390e+09,-0.001824,-0.002049,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
2,2016-01-03,3.963380e+07,430.01,1.0,6.467430e+09,-0.007666,-0.007913,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
3,2016-01-04,3.847750e+07,433.09,1.0,6.515713e+09,0.007466,0.007163,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
4,2016-01-05,3.452260e+07,431.96,1.0,6.500393e+09,-0.002351,-0.002609,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1741,2020-10-29,5.649950e+10,13437.88,1.0,2.489953e+11,0.012588,0.012553,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502
1742,2020-10-30,3.058149e+10,13546.52,1.0,2.510182e+11,0.008124,0.008085,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502
1743,2020-10-31,3.030646e+10,13780.99,1.0,2.553721e+11,0.017345,0.017309,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502
1744,2020-11-01,2.445386e+10,13737.11,1.0,2.545698e+11,-0.003142,-0.003184,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.7,260.895,76.9,198.3,22024.502


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746 entries, 0 to 1745
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   trade_date                   1746 non-null   datetime64[ns]
 1   volume                       1746 non-null   float64       
 2   price_usd                    1746 non-null   float64       
 3   price_btc                    1746 non-null   float64       
 4   market_cap                   1746 non-null   float64       
 5   capitalization_change_1_day  1746 non-null   float64       
 6   USD_price_change_1_day       1746 non-null   float64       
 7   crypto_name                  1746 non-null   object        
 8   ticker                       1746 non-null   object        
 9   max_supply                   1746 non-null   float64       
 10  platform_name                1746 non-null   object        
 11  industry_name                1746 non-null 

In [152]:
df['crypto_name'].unique()

array(['Bitcoin'], dtype=object)

In [153]:
df['ticker'].unique()

array(['BTC'], dtype=object)

In [154]:
df['platform_name'].unique()

array(['XRP'], dtype=object)

In [155]:
df['industry_name'].unique()

array(['Proof of Work (PoW)'], dtype=object)

Extracting Significant Date Features

In [12]:
df['year'] = [i.year for i in df['trade_date']]
df['month_name'] = [i.month_name() for i in df['trade_date']]
df['day_name'] = [i.day_name() for i in df['trade_date']]

In [13]:
df.reset_index(drop=True)

Unnamed: 0,trade_date,volume,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,crypto_name,ticker,max_supply,platform_name,industry_name,Unemployment_Rate,CPI,CCI,PPI,GDP,year,month_name,day_name
0,2016-01-01,3.627890e+07,434.33,1.0,6.529300e+09,0.000000,0.000000,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933,2016,January,Friday
1,2016-01-02,3.009660e+07,433.44,1.0,6.517390e+09,-0.001824,-0.002049,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933,2016,January,Saturday
2,2016-01-03,3.963380e+07,430.01,1.0,6.467430e+09,-0.007666,-0.007913,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933,2016,January,Sunday
3,2016-01-04,3.847750e+07,433.09,1.0,6.515713e+09,0.007466,0.007163,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933,2016,January,Monday
4,2016-01-05,3.452260e+07,431.96,1.0,6.500393e+09,-0.002351,-0.002609,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.8,237.652,92.0,182.6,18525.933,2016,January,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1741,2020-10-29,5.649950e+10,13437.88,1.0,2.489953e+11,0.012588,0.012553,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502,2020,October,Thursday
1742,2020-10-30,3.058149e+10,13546.52,1.0,2.510182e+11,0.008124,0.008085,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502,2020,October,Friday
1743,2020-10-31,3.030646e+10,13780.99,1.0,2.553721e+11,0.017345,0.017309,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.8,260.249,81.8,196.5,22024.502,2020,October,Saturday
1744,2020-11-01,2.445386e+10,13737.11,1.0,2.545698e+11,-0.003142,-0.003184,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),6.7,260.895,76.9,198.3,22024.502,2020,November,Sunday


In [14]:
df.rename(columns={'volume':'volume_USD'}, inplace=True)

In [161]:
df.describe()

Unnamed: 0,volume_USD,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,max_supply,Unemployment_Rate,CPI,CCI,PPI,GDP,year,month
count,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0
mean,10411890000.0,5631.871529,0.999427,98822150000.0,0.002966,0.002847,21000000.0,4.916667,249.671468,93.409794,194.843471,20313.444935,2017.914089,6.328179
std,12507780000.0,4055.940993,0.023932,72384760000.0,0.039423,0.039418,0.0,2.252823,6.820132,7.433065,6.388487,1094.833334,1.387028,3.385368
min,28514000.0,364.33,0.0,5496598000.0,-0.371631,-0.371695,21000000.0,3.5,237.336,71.8,181.3,18525.933,2016.0,1.0
25%,277701200.0,1126.8975,1.0,18319710000.0,-0.011063,-0.011156,21000000.0,3.8,244.004,91.2,190.7,19280.084,2017.0,3.0
50%,5004655000.0,6252.71,1.0,108355600000.0,0.002218,0.002065,21000000.0,4.3,250.792,96.2,195.5,20328.553,2018.0,6.0
75%,17266850000.0,8833.975,1.0,157790800000.0,0.017371,0.017315,21000000.0,4.8,255.848,98.3,199.3,21384.775,2019.0,9.0
max,74156770000.0,19497.4,1.0,326502500000.0,0.252614,0.252472,21000000.0,14.8,260.895,101.4,204.6,22024.502,2020.0,12.0


Checking Outliers

In [162]:
px.box(data_frame=df, x='volume_USD')

In [163]:
px.box(data_frame=df, x='market_cap')

In [164]:
px.box(data_frame=df, x='Unemployment_Rate')

In [165]:
px.box(data_frame=df, x='CCI')

In [166]:
px.box(data_frame=df, x='GDP')

Locating Outliers In Volume_USD

In [22]:
outliers_volume = detect_outliers(df,0,['volume_USD'])
outliers_volume

[1271,
 1384,
 1462,
 1490,
 1491,
 1492,
 1493,
 1494,
 1495,
 1496,
 1497,
 1498,
 1502,
 1504,
 1505,
 1506,
 1509,
 1516,
 1519,
 1520,
 1523,
 1526,
 1527,
 1530,
 1531,
 1532,
 1540,
 1544,
 1545,
 1548,
 1554,
 1561,
 1567,
 1568,
 1569,
 1571,
 1572,
 1573,
 1574,
 1575,
 1576,
 1577,
 1578,
 1579,
 1581,
 1582,
 1583,
 1694,
 1699,
 1733,
 1741]

Locating Outliers in Unemployment_Rate

In [23]:
outliers_UR = detect_outliers(df,0,['Unemployment_Rate'])
outliers_UR

[1539,
 1540,
 1541,
 1542,
 1543,
 1544,
 1545,
 1546,
 1547,
 1548,
 1549,
 1550,
 1551,
 1552,
 1553,
 1554,
 1555,
 1556,
 1557,
 1558,
 1559,
 1560,
 1561,
 1562,
 1563,
 1564,
 1565,
 1566,
 1567,
 1568,
 1569,
 1570,
 1571,
 1572,
 1573,
 1574,
 1575,
 1576,
 1577,
 1578,
 1579,
 1580,
 1581,
 1582,
 1583,
 1584,
 1585,
 1586,
 1587,
 1588,
 1589,
 1590,
 1591,
 1592,
 1593,
 1594,
 1595,
 1596,
 1597,
 1598,
 1599,
 1600,
 1601,
 1602,
 1603,
 1604,
 1605,
 1606,
 1607,
 1608,
 1609,
 1610,
 1611,
 1612,
 1613,
 1614,
 1615,
 1616,
 1617,
 1618,
 1619,
 1620,
 1621,
 1622,
 1623,
 1624,
 1625,
 1626,
 1627,
 1628,
 1629,
 1630,
 1631,
 1632,
 1633,
 1634,
 1635,
 1636,
 1637,
 1638,
 1639,
 1640,
 1641,
 1642,
 1643,
 1644,
 1645,
 1646,
 1647,
 1648,
 1649,
 1650,
 1651,
 1652,
 1653,
 1654,
 1655,
 1656,
 1657,
 1658,
 1659,
 1660,
 1661,
 1662,
 1663,
 1664,
 1665,
 1666,
 1667,
 1668,
 1669,
 1670,
 1671,
 1672,
 1673,
 1674,
 1675,
 1676,
 1677,
 1678,
 1679,
 1680,
 1681,

Locating Outliers in CCI

In [24]:
outliers_CCI = detect_outliers(df,0,['CCI'])
outliers_CCI

[1539,
 1540,
 1541,
 1542,
 1543,
 1544,
 1545,
 1546,
 1547,
 1548,
 1549,
 1550,
 1551,
 1552,
 1553,
 1554,
 1555,
 1556,
 1557,
 1558,
 1559,
 1560,
 1561,
 1562,
 1563,
 1564,
 1565,
 1566,
 1567,
 1568,
 1569,
 1570,
 1571,
 1572,
 1573,
 1574,
 1575,
 1576,
 1577,
 1578,
 1579,
 1580,
 1581,
 1582,
 1583,
 1584,
 1585,
 1586,
 1587,
 1588,
 1589,
 1590,
 1591,
 1592,
 1593,
 1594,
 1595,
 1596,
 1597,
 1598,
 1599,
 1600,
 1601,
 1602,
 1603,
 1604,
 1605,
 1606,
 1607,
 1608,
 1609,
 1610,
 1611,
 1612,
 1613,
 1614,
 1615,
 1616,
 1617,
 1618,
 1619,
 1620,
 1621,
 1622,
 1623,
 1624,
 1625,
 1626,
 1627,
 1628,
 1629,
 1630,
 1631,
 1632,
 1633,
 1634,
 1635,
 1636,
 1637,
 1638,
 1639,
 1640,
 1641,
 1642,
 1643,
 1644,
 1645,
 1646,
 1647,
 1648,
 1649,
 1650,
 1651,
 1652,
 1653,
 1654,
 1655,
 1656,
 1657,
 1658,
 1659,
 1660,
 1661,
 1662,
 1663,
 1664,
 1665,
 1666,
 1667,
 1668,
 1669,
 1670,
 1671,
 1672,
 1673,
 1674,
 1675,
 1676,
 1677,
 1678,
 1679,
 1680,
 1681,

UniVariant and BiVariant Analysis

Describtive Analysis of the Numerical Features

In [230]:
px.histogram(data_frame=df, x='price_usd',text_auto=True, marginal='box')

In [231]:
px.histogram(data_frame=df, x='market_cap',text_auto=True, marginal='box')

In [232]:
px.histogram(data_frame=df, x='capitalization_change_1_day',text_auto=True, marginal='box')

In [233]:
px.histogram(data_frame=df, x='USD_price_change_1_day',text_auto=True, marginal='box')

In [234]:
px.histogram(data_frame=df, x='Unemployment_Rate',text_auto=True, marginal='box')

In [235]:
px.histogram(data_frame=df, x='CPI',text_auto=True, marginal='box')

In [236]:
px.histogram(data_frame=df, x='CCI',text_auto=True, marginal='box')

In [237]:
px.histogram(data_frame=df, x='PPI',text_auto=True, marginal='box')

In [238]:
px.histogram(data_frame=df, x='GDP',text_auto=True, marginal='box')

In [239]:
df.describe()

Unnamed: 0,volume_USD,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,max_supply,Unemployment_Rate,CPI,CCI,PPI,GDP,year,month
count,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0
mean,10411890000.0,5631.871529,0.999427,98822150000.0,0.002966,0.002847,21000000.0,4.916667,249.671468,93.409794,194.843471,20313.444935,2017.914089,6.328179
std,12507780000.0,4055.940993,0.023932,72384760000.0,0.039423,0.039418,0.0,2.252823,6.820132,7.433065,6.388487,1094.833334,1.387028,3.385368
min,28514000.0,364.33,0.0,5496598000.0,-0.371631,-0.371695,21000000.0,3.5,237.336,71.8,181.3,18525.933,2016.0,1.0
25%,277701200.0,1126.8975,1.0,18319710000.0,-0.011063,-0.011156,21000000.0,3.8,244.004,91.2,190.7,19280.084,2017.0,3.0
50%,5004655000.0,6252.71,1.0,108355600000.0,0.002218,0.002065,21000000.0,4.3,250.792,96.2,195.5,20328.553,2018.0,6.0
75%,17266850000.0,8833.975,1.0,157790800000.0,0.017371,0.017315,21000000.0,4.8,255.848,98.3,199.3,21384.775,2019.0,9.0
max,74156770000.0,19497.4,1.0,326502500000.0,0.252614,0.252472,21000000.0,14.8,260.895,101.4,204.6,22024.502,2020.0,12.0


In [25]:
px.histogram(data_frame=df, y='USD_price_change_1_day', x='day_name', text_auto=True, histfunc='avg')

What is the Average Price Per month ?

In [171]:
px.histogram(df,x='month_name', y='price_usd', histfunc='avg', text_auto=True)

What is the  Average Price Per month for Each Year?

In [172]:
px.histogram(df,x='month_name', y='price_usd', histfunc='avg', text_auto=True, facet_col='year')

In [240]:
msk2018 = df['year'] == 2018

In [243]:
px.histogram(df[msk2018],x='month_name', y='market_cap', histfunc='avg', text_auto=True)

In [244]:
px.histogram(df[msk2018],x='month_name', y='volume_USD', histfunc='avg', text_auto=True)

In [245]:
px.histogram(df[msk2018],x='month_name', y='capitalization_change_1_day', histfunc='avg', text_auto=True)

In [246]:
px.histogram(df[msk2018],x='month_name', y='Unemployment_Rate', histfunc='avg', text_auto=True)

What is the Average Volume Per month ?

In [173]:
px.histogram(df,x='month_name', y='volume_USD', histfunc='avg', text_auto=True)

What is the Average Volume Per month for Each Year ?

In [174]:
px.histogram(df,x='month_name', y='volume_USD', histfunc='avg', text_auto=True, facet_col='year')

What is the Average Market_cap Per Month ?

In [175]:
px.histogram(df,x='month_name', y='market_cap', histfunc='avg', text_auto=True)

What is the Average Market_cap Per Month for Each Year ?

In [176]:
px.histogram(df,x='month_name', y='market_cap', histfunc='avg', text_auto=True, facet_col='year')

What is the Average Capitlization_change_1_day Per Month?

In [177]:
px.histogram(df, x='month_name', y='capitalization_change_1_day',histfunc='avg', text_auto=True)

What is the Average Capitlization_change_1_day Per Month for Each Year ?

In [178]:
px.histogram(df, x='month_name', y='capitalization_change_1_day', text_auto=True, histfunc='avg',facet_col='year')

What is the Average USD_price_change_1_day Per Month?

In [179]:
px.histogram(df, x='month_name', y='USD_price_change_1_day',histfunc='avg', text_auto=True)

What is the Average USD_price_change_1_day Per Month for Each Year ?

In [180]:
px.histogram(df, x='month_name', y='USD_price_change_1_day', histfunc='avg',text_auto=True, facet_col='year')

What is the Average Unemployment_Rate Per Month ?

In [181]:
px.histogram(df, x='month_name', y='Unemployment_Rate',histfunc='avg', text_auto=True)

What is the Average Unemployment_Rate Per Month for Each Year ?

In [182]:
px.histogram(df, x='month_name', y='Unemployment_Rate',histfunc='avg', text_auto=True, facet_col='year')

What is the Average CPI Per Month ?

In [183]:
px.histogram(df, x='month_name', y='CPI',histfunc='avg', text_auto=True)

What is the Average CPI Per Month for Each Year ?

In [184]:
px.histogram(df, x='month_name', y='CPI',histfunc='avg', text_auto=True, facet_col='year')

What is the Average CCI Per Month ?

In [185]:
px.histogram(df, x='month_name', y='CCI',histfunc='avg', text_auto=True)

What is the Average CCI Per Month for Each Year ?

In [186]:
px.histogram(df, x='month_name', y='CCI',histfunc='avg', text_auto=True, facet_col='year')

What is the Average PPI Per Month ?

In [187]:
px.histogram(df, x='month_name', y='PPI',histfunc='avg', text_auto=True)

What is the Average PPI Per Month for Each Year?

In [188]:
px.histogram(df, x='month_name', y='PPI',histfunc='avg', text_auto=True, facet_col='year')

What is the Average GDP Per Month ?

In [189]:
px.histogram(df, x='month_name', y='GDP',histfunc='avg', text_auto=True)

What is the Average GDP Per Month for Each Year ?

In [190]:
px.histogram(df, x='month_name', y='GDP',histfunc='avg', text_auto=True, facet_col='year')

Correlating between All numerical Features

In [191]:
px.imshow(df.select_dtypes('number').corr(), text_auto=True,height=1000)

Show Bitcoin Change trend Per year

In [192]:
px.line(df.groupby(by='year')['price_usd'].mean(), markers=True)

Turning Outliers into NULL Values

In [25]:
df['volume_USD'].loc[outliers_volume] = np.nan
df['Unemployment_Rate'].loc[outliers_UR] = np.nan
df['CCI'].loc[outliers_CCI] = np.nan



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Filling NULL With KNN Imputer

In [26]:
knn_imputer = KNNImputer(n_neighbors = 3)
df[['volume_USD', 'Unemployment_Rate', 'CCI']] = knn_imputer.fit_transform(df[['volume_USD', 'Unemployment_Rate', 'CCI']])

Test and Train Split

In [27]:
df

Unnamed: 0,trade_date,volume_USD,price_usd,price_btc,market_cap,capitalization_change_1_day,USD_price_change_1_day,crypto_name,ticker,max_supply,platform_name,industry_name,Unemployment_Rate,CPI,CCI,PPI,GDP,year,month_name,day_name
0,2016-01-01,3.627890e+07,434.33,1.0,6.529300e+09,0.000000,0.000000,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.800000,237.652,92.000000,182.6,18525.933,2016,January,Friday
1,2016-01-02,3.009660e+07,433.44,1.0,6.517390e+09,-0.001824,-0.002049,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.800000,237.652,92.000000,182.6,18525.933,2016,January,Saturday
2,2016-01-03,3.963380e+07,430.01,1.0,6.467430e+09,-0.007666,-0.007913,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.800000,237.652,92.000000,182.6,18525.933,2016,January,Sunday
3,2016-01-04,3.847750e+07,433.09,1.0,6.515713e+09,0.007466,0.007163,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.800000,237.652,92.000000,182.6,18525.933,2016,January,Monday
4,2016-01-05,3.452260e+07,431.96,1.0,6.500393e+09,-0.002351,-0.002609,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.800000,237.652,92.000000,182.6,18525.933,2016,January,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1741,2020-10-29,2.024767e+09,13437.88,1.0,2.489953e+11,0.012588,0.012553,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),4.400000,260.249,81.800000,196.5,22024.502,2020,October,Thursday
1742,2020-10-30,3.058149e+10,13546.52,1.0,2.510182e+11,0.008124,0.008085,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),3.600000,260.249,81.800000,196.5,22024.502,2020,October,Friday
1743,2020-10-31,3.030646e+10,13780.99,1.0,2.553721e+11,0.017345,0.017309,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),3.600000,260.249,81.800000,196.5,22024.502,2020,October,Saturday
1744,2020-11-01,2.445386e+10,13737.11,1.0,2.545698e+11,-0.003142,-0.003184,Bitcoin,BTC,21000000.0,XRP,Proof of Work (PoW),3.633333,260.895,87.866667,198.3,22024.502,2020,November,Sunday


In [29]:
x = df.drop(['price_usd'], axis=1)
y = df['price_usd']

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=0)