In [1]:
import pandas as pd
import numpy as np

from utils import (
    COMPLETE_DATA_FILE_PATH,
    CLEAN_COMPLETE_DATA_FILE_PATH)

TRAIN_START_DATE updated to: 2020-06-01, TRAIN_END_DATE updated to: 2024-07-05


# Data Overview

In [2]:
# Load the complete dataset and index it by a proper datetime column
data = pd.read_parquet(COMPLETE_DATA_FILE_PATH)

data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year

data = data.set_index('date')

# Get a high-level overview. DataFrame.info() writes its report to stdout
# itself and returns None, so it is called directly: wrapping it in print()
# would append a stray "None" line to the cell output.
data.info()

# Count missing values per column, keep only the affected columns and
# sort them in descending order
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)

# Print a header, then show the five columns with the most missing values
print('Missing values')
missing_values.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1654463 entries, 2020-06-01 to 2024-08-12
Data columns (total 80 columns):
 #   Column                             Non-Null Count    Dtype         
---  ------                             --------------    -----         
 0   symbol                             1654463 non-null  object        
 1   open                               1654463 non-null  float64       
 2   high                               1654463 non-null  float64       
 3   low                                1654463 non-null  float64       
 4   close                              1654463 non-null  float64       
 5   volume                             1654463 non-null  int64         
 6   name                               1654463 non-null  object        
 7   keywords                           1648813 non-null  object        
 8   total_supply                       1654463 non-null  float64       
 9   circulating_supply                 1654463 non-null  float64    

cpi_Japan               1393736
interest_rate_India     1137894
interest_rate_Russia    1131514
cpi_Russia              1098066
source_code              577075
dtype: int64

# Missing values

In [3]:
# Columns whose percentage of missing values exceeds this value are dropped
threshold = 50

# Calculate the percentage of missing values for each column
missing_percentages = data.isnull().mean() * 100

# Identify columns whose missing percentage is strictly above the threshold
columns_with_high_missing = missing_percentages[missing_percentages > threshold].index

print(f"Columns with more than {threshold}% missing data: {columns_with_high_missing}")
print("Removing high value missing columns")

# Drop the columns with too many missing values
data = data.drop(columns=columns_with_high_missing)

print(f'After cleaning: {data.shape[0]} rows and {len(data.columns)} columns.')


Columns with more than 50% missing data: Index(['interest_rate_India', 'interest_rate_Russia', 'cpi_Russia',
       'cpi_Japan'],
      dtype='object')
Removing high value missing columns
After cleaning: 1654463 rows and 76 columns.


In [4]:
# Per-column null counts, restricted to columns that still have gaps and
# ordered from most to fewest missing entries
missing_values = (
    data.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Header line, followed by the five most affected columns as the cell output
print('Missing values')
missing_values.head()

Missing values


source_code             577075
cpi_Canada              351367
cpi_South Korea         351367
interest_rate_Brazil    314323
cpi_India               126240
dtype: int64

# Handling Remaining Missing Values

In [5]:
# Gaps that are replaced by a constant: an empty string for the
# 'source_code' and 'keywords' columns, 0 for the google trend score
constant_fills = {
    'source_code': '',
    'keywords': '',
    'google_trend_score': 0,
}
for column_name, fill_value in constant_fills.items():
    data[column_name] = data[column_name].fillna(fill_value)

# Select the interest rate, inflation rate, GDP and CPI columns by name prefix
economic_prefixes = ('interest_rate', 'inflation_rate', 'gdp', 'cpi')
economic_columns = [
    column_name
    for column_name in data.columns
    if column_name.startswith(economic_prefixes)
]

# Fill the gaps in those columns with pandas' 'time' interpolation method
# NOTE(review): method='time' presumably relies on the datetime index being
# sorted in ascending order — confirm for the loaded dataset
data[economic_columns] = data[economic_columns].interpolate(method='time')


In [6]:
# Recount the nulls per column; only columns with at least one missing
# entry are kept, largest count first
missing_values = (
    data.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Header line, then the top five remaining columns as the cell output
print('Missing values')
missing_values.head()

Missing values


fear_greed_value             1193
fear_greed_classification    1193
dtype: int64

# Save the dataset & sample

In [7]:
# Turn the 'date' index back into a regular column so it is written to the
# output files. The guard keeps this cell safe to re-run: a second
# reset_index() would otherwise add a spurious 'index' column to the frame
# (and therefore to the saved parquet file).
if data.index.name == 'date':
    data = data.reset_index()
data.to_parquet(CLEAN_COMPLETE_DATA_FILE_PATH, index=False)

# Save a reproducible random sample (fixed seed); the sample size is capped
# at the number of rows so a frame with fewer than 1000 rows does not raise
sample_dataset = data.sample(n=min(1000, len(data)), random_state=42)
sample_dataset.to_csv('../data/sample.csv', index=False)

data.head()

Unnamed: 0,date,symbol,open,high,low,close,volume,name,keywords,total_supply,...,cpi_India,cpi_Italy,cpi_Brazil,cpi_South Korea,cpi_Mexico,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999980.0,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
1,2020-06-01,BTT,0.0003,0.000312,0.000298,0.000311,67136753,BitTorrent [New],BitTorrent coin,990000000000000.0,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
2,2020-06-01,ASD,0.037355,0.038924,0.03707,0.038295,2117671,ASD,ASD,780615300.0,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
3,2020-06-01,NWC,0.021969,0.022889,0.021884,0.022715,238352,Numerico,Numerico,270050500.0,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020
4,2020-06-01,DAD,0.242849,0.408407,0.24223,0.272164,10332685,DAD,DAD,1000000000.0,...,128.5304,102.7,124.0019,105.027448,122.5766,107.8078,96.96129,106.97271,106.01,2020


: 