In [48]:
import pandas as pd

from utils import (
    OHLCV_FILE_PATH,
    FEAR_GREED_DATA_FILE_PATH,
    COIN_DETAILS_FILE_PATH,
    MACRO_ECO_FACTORS_DATA_FILE_PATH,
    GOOGLE_TRENDS_DATA_FILE_PATH, 
    TRAIN_TARGETS_PARQUET_FILE_PATH,
    TRAIN_DATASET_FILE_PATH,
    VALIDATION_DATASET_FILE_PATH,
    parse_date,
    get_week_start,
    get_week_end)

# Integrate OHLCV Data

In [49]:
ohlcv_data = pd.read_csv(OHLCV_FILE_PATH)
ohlcv_data['date'] = pd.to_datetime(ohlcv_data['date'], errors='coerce').dt.date

print(f'Rows of ohlcv data: {ohlcv_data.shape[0]}')
ohlcv_data.head()


Rows of ohlcv data: 1644565


Unnamed: 0,symbol,date,open,high,low,close,volume
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147


# Integrate coin details data

In [50]:
coin_info_data = pd.read_csv(COIN_DETAILS_FILE_PATH, index_col=None)

print(f'Rows of coin data: {coin_info_data.shape[0]}')
coin_info_data.head()

Rows of coin data: 1435


Unnamed: 0,symbol,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active
0,NEO,Neo,Neo,100000000.0,70538830.0,651943100.0,0,1,https://github.com/neo-project/neo,1
1,FARA,FaraLand,FaraLand,99930070.0,22928680.0,274767.4,0,0,,1
2,ALI,Artificial Liquid Intelligence,Artificial Liquid Intelligence,9872434000.0,6180790000.0,51509740.0,0,1,https://github.com/AI-Protocol-Official,1
3,HONEY,Hivemapper,Hivemapper,6342243000.0,2314560000.0,160370400.0,0,1,https://github.com/hivemapper,1
4,STRAX,Stratis [New],Stratis coin,1951821000.0,1951815000.0,86076230.0,1,1,https://github.com/stratisproject,1


In [51]:
# Merge the coin name with the OHLCV, Fear & Greed Index
data = ohlcv_data.merge(coin_info_data, on='symbol', how='left')

print(f'Number of rows in the data: {data.shape[0]}')
data.head()

Number of rows in the data: 1644565


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0


# Integrate Fear & Greed Index

In [52]:
fear_greed_df = pd.read_csv(FEAR_GREED_DATA_FILE_PATH)
fear_greed_df['date'] = pd.to_datetime(fear_greed_df['date'], errors='coerce').dt.date

print(f'Rows of fear & greed data: {fear_greed_df.shape[0]}')
fear_greed_df.head()

Rows of fear & greed data: 1531


Unnamed: 0,value,value_classification,date
0,50,Neutral,2020-06-01
1,56,Greed,2020-06-02
2,48,Neutral,2020-06-03
3,54,Neutral,2020-06-04
4,53,Neutral,2020-06-05


In [53]:
data = data.merge(fear_greed_df[['date', 'value', 'value_classification']], left_on='date', right_on='date', how='left')
data.rename(columns={'value': 'fear_greed_value'}, inplace=True)  # Rename the value column
data.rename(columns={'value_classification': 'fear_greed_classification'}, inplace=True)  # Rename the value column

print(f'Number of rows in the data: {data.shape[0]}')
data.head()

Number of rows in the data: 1644565


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active,fear_greed_value,fear_greed_classification
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,50,Neutral
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,56,Greed
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,48,Neutral
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,54,Neutral
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,53,Neutral


# Integrate Google Trends data

In [54]:
trends_data = pd.read_csv(GOOGLE_TRENDS_DATA_FILE_PATH)

trends_data['date'] = pd.to_datetime(trends_data['date'])
trends_data['date'] = trends_data['date'].dt.date

print(f'Number of rows in google trends data: {trends_data.shape[0]}')
trends_data.head()

Number of rows in google trends data: 311522


Unnamed: 0,date,symbol,name,trend_score
0,2020-06-07,BNX,BinaryX,0
1,2020-06-07,STEP,Step Finance,19
2,2020-06-07,METAL,BADMAD ROBOTS,0
3,2020-06-07,KIN,Kin,40
4,2020-06-07,BRZE,Breeze coin,0


In [55]:
# Apply the function to create a 'week_start' & 'week_end' column in the main data
data['week_start'] = data['date'].apply(get_week_start)
data['week_end'] = data['date'].apply(get_week_end)
trends_data['week_start'] = trends_data['date'].apply(get_week_start)

data['week_start'] = pd.to_datetime(data['week_start'])
data['week_start'] = pd.to_datetime(data['week_start'])
trends_data['week_start'] = pd.to_datetime(trends_data['week_start'])

data.head()


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active,fear_greed_value,fear_greed_classification,week_start,week_end
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,50,Neutral,2020-06-01,2020-06-07
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,56,Greed,2020-06-01,2020-06-07
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,48,Neutral,2020-06-01,2020-06-07
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,54,Neutral,2020-06-01,2020-06-07
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,53,Neutral,2020-06-01,2020-06-07


In [56]:
# Set 'symbol' and 'week_start' as the index for the trends data and forward-fill
trends_data.set_index(['symbol', 'week_start'], inplace=True)
trends_data = trends_data.groupby('symbol').ffill().reset_index()

# Merge the OHLCV data with the forward-filled trends data
data = pd.merge(data, trends_data, on=['symbol', 'week_start'], how='left')

data = data.drop(columns=['date_y', 'name_y'])
data = data.rename(columns={'date_x': 'date', 'name_x': 'name', 'trend_score': 'google_trend_score'})

# Check the merged data
print(f'Number of rows in the data: {data.shape[0]}')
data.head()


Number of rows in the data: 1644565


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,...,market_cap,infinite_supply,is_open_source,source_code,is_active,fear_greed_value,fear_greed_classification,week_start,week_end,google_trend_score
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,50,Neutral,2020-06-01,2020-06-07,0.0
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,56,Greed,2020-06-01,2020-06-07,0.0
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,48,Neutral,2020-06-01,2020-06-07,0.0
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,54,Neutral,2020-06-01,2020-06-07,0.0
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0.0,1.0,https://github.com/0xbitcoin/0xbitcoin-token,1.0,53,Neutral,2020-06-01,2020-06-07,0.0


# Integrate Macro-Economic Factors

In [57]:
data['date'] = pd.to_datetime(data['date'])

economic_data = pd.read_csv(MACRO_ECO_FACTORS_DATA_FILE_PATH)
economic_data['date'] = pd.to_datetime(economic_data['date'])

# Set 'date' as the index
data.set_index('date', inplace=True)
economic_data.set_index('date', inplace=True)

# Resample monthly data to daily frequency and forward-fill values
economic_data_resampled = economic_data.resample('D').ffill()

# Reset index to get the date column back
economic_data_resampled.reset_index(inplace=True)

# Merge the two DataFrames on the date column
data = pd.merge(data, economic_data_resampled, on='date', how='left')

print(f'Number of rows in the data: {data.shape[0]}')
data.head(100)

Number of rows in the data: 1644565


Unnamed: 0,date,symbol,open,high,low,close,volume,name,keywords,total_supply,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
1,2020-06-02,0xBTC,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
2,2020-06-03,0xBTC,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
3,2020-06-04,0xBTC,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
4,2020-06-05,0xBTC,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-09-04,0xBTC,0.088969,0.102752,0.079269,0.097443,1850641,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0
96,2020-09-05,0xBTC,0.097426,0.102197,0.077869,0.083904,1485673,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0
97,2020-09-06,0xBTC,0.083904,0.097919,0.078028,0.088231,1464089,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0
98,2020-09-07,0xBTC,0.088232,0.093056,0.081729,0.092534,1487425,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0


# Integrate train targets

In [58]:
train_targets_df = pd.read_parquet(TRAIN_TARGETS_PARQUET_FILE_PATH)
train_targets_df['date'] = pd.to_datetime(train_targets_df['date'])

data = pd.merge(data, train_targets_df, on=['symbol', 'date'], how='left')

cols = list(data.columns)
new_order = ['date', 'symbol', 'target'] + [col for col in cols if col not in ['date', 'symbol', 'target']]
data = data[new_order]

# Filter out rows where 'date' falls on weekends
data = data[data['date'].dt.weekday < 5]

print(f'Number of rows in the data: {data.shape[0]}')
data.head(20)

Number of rows in the data: 1175458


Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.0,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
1,2020-06-02,0xBTC,0.25,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
2,2020-06-03,0xBTC,0.25,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
3,2020-06-04,0xBTC,0.25,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
4,2020-06-05,0xBTC,0.25,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
7,2020-06-08,0xBTC,0.25,0.138044,0.140463,0.106984,0.129789,1136606,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
8,2020-06-09,0xBTC,0.5,0.128043,0.132123,0.119225,0.122679,1136783,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
9,2020-06-10,0xBTC,0.5,0.122692,0.128392,0.10546,0.116509,1210863,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
10,2020-06-11,0xBTC,0.75,0.116511,0.123852,0.100762,0.107118,997487,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
11,2020-06-12,0xBTC,0.75,0.107072,0.113181,0.10473,0.111646,1007215,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0


# Save the train dataset

In [59]:
# Determine the split point
split_point = int(len(data) * 0.8)

data['target'] = data['target'].fillna(0)
data = data.sort_values(by='date', ascending=True)

# Split the data
train_data = data.iloc[:split_point]
validation_data = data.iloc[split_point:]

# Reorder the columns
train_data.to_parquet(TRAIN_DATASET_FILE_PATH, index=False)
validation_data.to_parquet(VALIDATION_DATASET_FILE_PATH, index=False)

# train_data.to_csv('../data/train.csv', index=False)
validation_data.to_csv('../data/validate.csv', index=False)

In [60]:
train_data.head(100)

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.0,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
622737,2020-06-01,JUV,0.0,7.194488,7.584891,7.176790,7.532392,146975,Juventus Fan Token,Juventus Fan Token,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
621228,2020-06-01,ZANO,0.0,0.725818,0.739450,0.675988,0.736188,92454,Zano,Zano,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
619704,2020-06-01,LA,0.0,0.027122,0.035149,0.026726,0.034941,15897,LATOKEN,LATOKEN,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
618183,2020-06-01,XMV,0.0,0.001817,0.001838,0.001717,0.001830,21,MoneroV,MoneroV,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581656,2020-06-01,EWT,0.0,6.416263,6.548838,6.027997,6.061318,1224467,Energy Web Token,Energy Web Token,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
580131,2020-06-01,SALT,0.0,0.072705,0.076574,0.071105,0.076308,6515,SALT,SALT,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
579708,2020-06-01,INNBC,0.0,0.818189,0.821015,0.609553,0.784525,19199,Innovative Bioresearch Coin,Innovative Bioresearch Coin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
577741,2020-06-01,ETHO,0.0,0.005455,0.006077,0.004733,0.006051,14413,Etho Protocol,Etho Protocol,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0


In [61]:
validation_data.head(100)

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
1459090,2023-11-09,DOME,0.0,0.000713,0.000741,0.000667,0.000678,2084441,Everdome,Everdome,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
590537,2023-11-09,EL,0.0,0.002080,0.002188,0.002060,0.002187,136486,ELYSIA,ELYSIA,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
169841,2023-11-09,OMG,0.5,0.640361,0.669655,0.585167,0.616681,48801552,OMG Network,OMG Network,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
1614182,2023-11-09,CGPT,0.0,0.052210,0.055227,0.050684,0.052477,5099763,ChainGPT,ChainGPT,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
163760,2023-11-09,NULS,0.5,0.212408,0.219739,0.206767,0.214189,1847556,NULS,NULS coin,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648982,2023-11-09,ATT,0.0,0.003244,0.003362,0.003121,0.003360,149,Attila,Attila,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
41341,2023-11-09,BTG,0.5,15.986805,16.725216,15.520554,16.190998,32850624,Bitcoin Gold,Bitcoin Gold,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
628538,2023-11-09,TIME,0.0,15.369256,16.327766,15.049252,16.231850,566517,Chrono.tech,Chrono.tech,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
1580126,2023-11-09,COREUM,0.0,0.108606,0.110314,0.099899,0.100349,931952,Coreum,Coreum,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0


# Clean up high missing value columns

In [62]:
threshold = 70

# Calculate the percentage of missing values for each column
missing_percentages = data.isnull().mean() * 100

# Identify columns with more than 70% missing values
columns_with_high_missing = missing_percentages[missing_percentages > threshold].index

# Create a list of columns to keep
columns_to_keep = [col for col in data.columns if col not in columns_with_high_missing]

print(f"Columns with more than {threshold}% missing data: {columns_with_high_missing}")
print(f"Columns to keep: {columns_to_keep}")

Columns with more than 70% missing data: Index(['cpi_Japan'], dtype='object')
Columns to keep: ['date', 'symbol', 'target', 'open', 'high', 'low', 'close', 'volume', 'name', 'keywords', 'total_supply', 'circulating_supply', 'market_cap', 'infinite_supply', 'is_open_source', 'source_code', 'is_active', 'fear_greed_value', 'fear_greed_classification', 'week_start', 'week_end', 'google_trend_score', 'gdp_Australia', 'gdp_Brazil', 'gdp_Canada', 'gdp_China', 'gdp_France', 'gdp_Germany', 'gdp_India', 'gdp_Italy', 'gdp_Japan', 'gdp_Korea Rep', 'gdp_Mexico', 'gdp_Russian Federation', 'gdp_Saudi Arabia', 'gdp_United Kingdom', 'gdp_United States', 'inflation_rate_Australia', 'inflation_rate_Brazil', 'inflation_rate_Canada', 'inflation_rate_China', 'inflation_rate_France', 'inflation_rate_Germany', 'inflation_rate_India', 'inflation_rate_Italy', 'inflation_rate_Japan', 'inflation_rate_Korea Rep', 'inflation_rate_Mexico', 'inflation_rate_Russian Federation', 'inflation_rate_Saudi Arabia', 'inflati