In [1]:
import pandas as pd
import numpy as np

from utils import (
    COMPLETE_DATA_FILE_PATH,
    CLEAN_COMPLETE_DATA_FILE_PATH)

TRAIN_START_DATE updated to: 2020-06-01, TRAIN_END_DATE updated to: 2024-07-05


# Data Overview

In [2]:
# Load the complete dataset and index it by date.
data = pd.read_parquet(COMPLETE_DATA_FILE_PATH)

data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year

# Rebind instead of mutating in place so re-reading the cell is unambiguous.
data = data.set_index('date')

# High-level overview. DataFrame.info() writes its report to stdout itself and
# returns None, so wrapping it in print() would append a stray "None" line.
data.info()

# Count missing values per column, keep only affected columns, largest first.
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)

# Show the five columns with the most missing values.
print('Missing values')
missing_values.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1182033 entries, 2020-06-01 to 2024-08-12
Data columns (total 81 columns):
 #   Column                             Non-Null Count    Dtype         
---  ------                             --------------    -----         
 0   symbol                             1182033 non-null  object        
 1   target                             1182033 non-null  float32       
 2   open                               1182033 non-null  float64       
 3   high                               1182033 non-null  float64       
 4   low                                1182033 non-null  float64       
 5   close                              1182033 non-null  float64       
 6   volume                             1182033 non-null  int64         
 7   name                               1182033 non-null  object        
 8   keywords                           1177984 non-null  object        
 9   total_supply                       1182033 non-null  float64    

cpi_Japan               995763
interest_rate_Russia    808481
interest_rate_India     808073
cpi_Russia              784585
source_code             412289
dtype: int64

# Missing Values

In [3]:
# Columns whose share of missing values is strictly above this percentage
# are dropped.
threshold = 50

# Percentage (0-100) of missing values for each column.
missing_percentages = data.isnull().mean() * 100

# Columns whose missing share exceeds the threshold.
columns_with_high_missing = missing_percentages[missing_percentages > threshold].index

# Columns expected to survive the drop, in their original order.
columns_to_keep = [col for col in data.columns if col not in columns_with_high_missing]

print(f"Columns with more than {threshold}% missing data: {columns_with_high_missing}")
print("Removing high value missing columns")

# Drop the columns with too many missing values.
data = data.drop(columns=columns_with_high_missing)

# Sanity check: exactly the expected columns remain, in the same order.
assert list(data.columns) == columns_to_keep, "unexpected columns after dropping sparse ones"

print(f'After cleaning: {data.shape[0]} rows and {len(data.columns)} columns.')


Columns with more than 50% missing data: Index(['interest_rate_India', 'interest_rate_Russia', 'cpi_Russia',
       'cpi_Japan'],
      dtype='object')
Removing high value missing columns
After cleaning: 1182033 rows and 77 columns.


In [4]:
# Per-column null counts, restricted to columns that actually have gaps,
# ordered from most to least affected.
missing_values = (
    data.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Report the five most affected columns.
print('Missing values')
missing_values.head()

Missing values


source_code             412289
cpi_Canada              250699
cpi_South Korea         250699
interest_rate_Brazil    223526
cpi_India                89846
dtype: int64

# Handling Remaining Missing Values

In [5]:
# Placeholder for each non-economic column that still has gaps:
# empty string for the text columns, 0 for the Google Trends score.
fill_values = {
    'source_code': '',
    'keywords': '',
    'google_trend_score': 0,
}
for column_name, placeholder in fill_values.items():
    data[column_name] = data[column_name].fillna(placeholder)

# GDP, CPI, inflation-rate and interest-rate columns, selected by name prefix.
economic_prefixes = ('interest_rate', 'inflation_rate', 'gdp', 'cpi')
economic_columns = [name for name in data.columns if name.startswith(economic_prefixes)]

# Fill their gaps by interpolating along the datetime index.
interpolated = data[economic_columns].interpolate(method='time')
data[economic_columns] = interpolated


In [6]:
# Tally nulls per column and keep only the columns that still have gaps,
# sorted from most to least affected.
null_counts = data.isnull().sum()
has_nulls = null_counts > 0
missing_values = null_counts[has_nulls].sort_values(ascending=False)

# Report the five most affected columns.
print('Missing values')
missing_values.head()

Missing values


fear_greed_value             1193
fear_greed_classification    1193
dtype: int64

# Save the Dataset & Sample

In [7]:
# Move the 'date' index back into a regular column so it is written to disk
# (the parquet/CSV writes below use index=False). Guarded so that re-running
# this cell does not push the fresh RangeIndex in as a spurious 'index' column.
if 'date' not in data.columns:
    data = data.reset_index()
data.to_parquet(CLEAN_COMPLETE_DATA_FILE_PATH, index=False)

# Small reproducible sample (fixed seed). The size is capped at the row count
# so sampling does not raise when the dataset has fewer than 1000 rows.
sample_dataset = data.sample(n=min(1000, len(data)), random_state=42)
sample_dataset.to_csv('../data/sample.csv', index=False)

data.head()

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,cpi_India,cpi_Italy,cpi_Brazil,cpi_South Korea,cpi_Mexico,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.0,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
1,2020-06-01,SENSO,0.0,0.240752,0.257436,0.238777,0.248901,1562604,SENSO,SENSO Token,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
2,2020-06-01,FCT,0.0,1.810519,1.916731,1.765896,1.890572,405259,FirmaChain,FirmaChain,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
3,2020-06-01,ADX,0.0,0.092347,0.095945,0.090926,0.093641,444095,AdEx,AdEx,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
4,2020-06-01,ARRR,0.0,0.029034,0.029254,0.025792,0.028084,79637,Pirate Chain,Pirate Chain,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
