In [1]:
import pandas as pd

from utils import (
    OHLCV_FILE_PATH,
    FEAR_GREED_DATA_FILE_PATH,
    COIN_DETAILS_FILE_PATH,
    MACRO_ECO_FACTORS_DATA_FILE_PATH,
    GOOGLE_TRENDS_DATA_FILE_PATH, 
    TRAIN_TARGETS_PARQUET_FILE_PATH,
    TRAIN_DATASET_FILE_PATH,
    VALIDATION_DATASET_FILE_PATH,
    parse_date,
    get_week_start,
    get_week_end)

# Integrate OHLCV Data

In [2]:
ohlcv_data = pd.read_csv(OHLCV_FILE_PATH)
ohlcv_data['date'] = pd.to_datetime(ohlcv_data['date'], errors='coerce').dt.date

ohlcv_data.head(20)


Unnamed: 0,symbol,date,open,high,low,close,volume
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147
5,0xBTC,2020-06-06,0.126849,0.142204,0.125311,0.137667,1138067
6,0xBTC,2020-06-07,0.137667,0.140016,0.128061,0.138005,1067494
7,0xBTC,2020-06-08,0.138044,0.140463,0.106984,0.129789,1136606
8,0xBTC,2020-06-09,0.128043,0.132123,0.119225,0.122679,1136783
9,0xBTC,2020-06-10,0.122692,0.128392,0.10546,0.116509,1210863


# Integrate coin details data

In [3]:
coin_info_data = pd.read_csv(COIN_DETAILS_FILE_PATH, index_col=None)

# Merge the coin name with the OHLCV, Fear & Greed Index
data = ohlcv_data.merge(coin_info_data, on='symbol', how='left')

data.head(20)

Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,circulating_supply,market_cap,infinite_supply,source_code
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
5,0xBTC,2020-06-06,0.126849,0.142204,0.125311,0.137667,1138067,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
6,0xBTC,2020-06-07,0.137667,0.140016,0.128061,0.138005,1067494,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
7,0xBTC,2020-06-08,0.138044,0.140463,0.106984,0.129789,1136606,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
8,0xBTC,2020-06-09,0.128043,0.132123,0.119225,0.122679,1136783,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token
9,0xBTC,2020-06-10,0.122692,0.128392,0.10546,0.116509,1210863,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token


# Integrate Fear & Greed Index

In [4]:
fear_greed_df = pd.read_csv(FEAR_GREED_DATA_FILE_PATH)
fear_greed_df['date'] = pd.to_datetime(fear_greed_df['date'], errors='coerce').dt.date

data = data.merge(fear_greed_df[['date', 'value', 'value_classification']], left_on='date', right_on='date', how='left')
data.rename(columns={'value': 'fear_greed_value'}, inplace=True)  # Rename the value column
data.rename(columns={'value_classification': 'fear_greed_classification'}, inplace=True)  # Rename the value column

data.head(20)

Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,circulating_supply,market_cap,infinite_supply,source_code,fear_greed_value,fear_greed_classification
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,50,Neutral
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,56,Greed
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,48,Neutral
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,53,Neutral
5,0xBTC,2020-06-06,0.126849,0.142204,0.125311,0.137667,1138067,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral
6,0xBTC,2020-06-07,0.137667,0.140016,0.128061,0.138005,1067494,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral
7,0xBTC,2020-06-08,0.138044,0.140463,0.106984,0.129789,1136606,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,53,Neutral
8,0xBTC,2020-06-09,0.128043,0.132123,0.119225,0.122679,1136783,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,52,Neutral
9,0xBTC,2020-06-10,0.122692,0.128392,0.10546,0.116509,1210863,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral


# Integrate Google Trends data

In [5]:
trends_data = pd.read_csv(GOOGLE_TRENDS_DATA_FILE_PATH)

trends_data['date'] = trends_data['date'].apply(parse_date)
trends_data['date'] = trends_data['date'].dt.date

# Apply the function to create a 'week_start' & 'week_end' column in the main data
data['week_start'] = data['date'].apply(get_week_start)
data['week_end'] = data['date'].apply(get_week_end)

data = pd.merge(data, trends_data, left_on=['symbol', 'week_end'], right_on=['symbol', 'date'], how='left')

data = data.drop(columns=['date_y', 'name_y'])
data = data.rename(columns={'date_x': 'date', 'name_x': 'name', 'trend_score': 'google_trend_score'})
data.head(20)

Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,circulating_supply,market_cap,infinite_supply,source_code,fear_greed_value,fear_greed_classification,week_start,week_end,google_trend_score
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,50,Neutral,2020-06-01,2020-06-07,0.0
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,56,Greed,2020-06-01,2020-06-07,0.0
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,48,Neutral,2020-06-01,2020-06-07,0.0
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral,2020-06-01,2020-06-07,0.0
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,53,Neutral,2020-06-01,2020-06-07,0.0
5,0xBTC,2020-06-06,0.126849,0.142204,0.125311,0.137667,1138067,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral,2020-06-01,2020-06-07,0.0
6,0xBTC,2020-06-07,0.137667,0.140016,0.128061,0.138005,1067494,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral,2020-06-01,2020-06-07,0.0
7,0xBTC,2020-06-08,0.138044,0.140463,0.106984,0.129789,1136606,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,53,Neutral,2020-06-08,2020-06-14,0.0
8,0xBTC,2020-06-09,0.128043,0.132123,0.119225,0.122679,1136783,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,52,Neutral,2020-06-08,2020-06-14,0.0
9,0xBTC,2020-06-10,0.122692,0.128392,0.10546,0.116509,1210863,0xBitcoin,0xBitcoin,8183550.0,458736.834074,0.0,https://github.com/0xbitcoin/0xbitcoin-token,54,Neutral,2020-06-08,2020-06-14,0.0


# Integrate Macro-Economic Factors

In [6]:
data['date'] = pd.to_datetime(data['date'])

economic_data = pd.read_csv(MACRO_ECO_FACTORS_DATA_FILE_PATH)
economic_data['date'] = pd.to_datetime(economic_data['date'])

# Set 'date' as the index
data.set_index('date', inplace=True)
economic_data.set_index('date', inplace=True)

# Resample monthly data to daily frequency and forward-fill values
economic_data_resampled = economic_data.resample('D').ffill()

# Reset index to get the date column back
economic_data_resampled.reset_index(inplace=True)

# Merge the two DataFrames on the date column
data = pd.merge(data, economic_data_resampled, on='date', how='left')


data.head(100)

Unnamed: 0,date,symbol,open,high,low,close,volume,name,keywords,circulating_supply,...,inflation_rate_India,inflation_rate_Italy,inflation_rate_Japan,inflation_rate_Korea Rep,inflation_rate_Mexico,inflation_rate_Russian Federation,inflation_rate_Saudi Arabia,inflation_rate_United Kingdom,inflation_rate_United States,year
0,2020-06-01,0xBTC,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
1,2020-06-02,0xBTC,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
2,2020-06-03,0xBTC,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
3,2020-06-04,0xBTC,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
4,2020-06-05,0xBTC,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-09-04,0xBTC,0.088969,0.102752,0.079269,0.097443,1850641,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
96,2020-09-05,0xBTC,0.097426,0.102197,0.077869,0.083904,1485673,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
97,2020-09-06,0xBTC,0.083904,0.097919,0.078028,0.088231,1464089,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
98,2020-09-07,0xBTC,0.088232,0.093056,0.081729,0.092534,1487425,0xBitcoin,0xBitcoin,8183550.0,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0


# Integrate train targets

In [7]:
train_targets_df = pd.read_parquet(TRAIN_TARGETS_PARQUET_FILE_PATH)
train_targets_df['date'] = pd.to_datetime(train_targets_df['date'])

data = pd.merge(data, train_targets_df, on=['symbol', 'date'], how='left')

cols = list(data.columns)
new_order = ['date', 'symbol', 'target'] + [col for col in cols if col not in ['date', 'symbol', 'target']]
data = data[new_order]

# Filter out rows where 'date' falls on weekends
data = data[data['date'].dt.weekday < 5]

data.head(20)

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,inflation_rate_India,inflation_rate_Italy,inflation_rate_Japan,inflation_rate_Korea Rep,inflation_rate_Mexico,inflation_rate_Russian Federation,inflation_rate_Saudi Arabia,inflation_rate_United Kingdom,inflation_rate_United States,year
0,2020-06-01,0xBTC,0.0,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
1,2020-06-02,0xBTC,0.25,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
2,2020-06-03,0xBTC,0.25,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
3,2020-06-04,0xBTC,0.25,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
4,2020-06-05,0xBTC,0.25,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
7,2020-06-08,0xBTC,0.25,0.138044,0.140463,0.106984,0.129789,1136606,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
8,2020-06-09,0xBTC,0.5,0.128043,0.132123,0.119225,0.122679,1136783,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
9,2020-06-10,0xBTC,0.5,0.122692,0.128392,0.10546,0.116509,1210863,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
10,2020-06-11,0xBTC,0.75,0.116511,0.123852,0.100762,0.107118,997487,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
11,2020-06-12,0xBTC,0.75,0.107072,0.113181,0.10473,0.111646,1007215,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0


In [8]:
data.shape[0]

1182751

# Save the train dataset

In [9]:
# Determine the split point
split_point = int(len(data) * 0.8)

data['target'] = data['target'].fillna(0)
data = data.sort_values(by='date', ascending=True)

# Split the data
train_data = data.iloc[:split_point]
validation_data = data.iloc[split_point:]

# Reorder the columns
train_data.to_parquet(TRAIN_DATASET_FILE_PATH, index=False)
validation_data.to_parquet(VALIDATION_DATASET_FILE_PATH, index=False)

train_data.to_csv('../data/train.csv', index=False)
validation_data.to_csv('../data/validate.csv', index=False)

In [10]:
train_data.head(100)

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,inflation_rate_India,inflation_rate_Italy,inflation_rate_Japan,inflation_rate_Korea Rep,inflation_rate_Mexico,inflation_rate_Russian Federation,inflation_rate_Saudi Arabia,inflation_rate_United Kingdom,inflation_rate_United States,year
0,2020-06-01,0xBTC,0.00,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
153380,2020-06-01,NEO,0.25,10.959956,12.647800,10.917013,12.424067,783678511,Neo,Neo,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
688505,2020-06-01,DAWN,0.00,0.111987,0.188550,0.049547,0.088647,82537,Dawn Protocol,Dawn Protocol,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
687768,2020-06-01,PNT,0.00,0.000066,0.000070,0.000061,0.000068,11798,pNetwork,pNetwork,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
154905,2020-06-01,NEX,0.25,0.935004,1.030365,0.929747,0.963004,1287319,Nash,Nash,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641243,2020-06-01,JST,0.00,0.007686,0.007817,0.007349,0.007695,56209916,JUST,JUST,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
639800,2020-06-01,BTSE,0.00,1.340000,1.477000,1.331000,1.381000,517958,BTSE Token,BTSE Token,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
172294,2020-06-01,ORBS,0.75,0.013103,0.013827,0.013011,0.013352,1092628,Orbs,Orbs,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
638724,2020-06-01,ARDX,0.00,0.000663,0.000715,0.000661,0.000712,38488,ArdCoin,ArdCoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0


In [11]:
validation_data.head(100)

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,inflation_rate_India,inflation_rate_Italy,inflation_rate_Japan,inflation_rate_Korea Rep,inflation_rate_Mexico,inflation_rate_Russian Federation,inflation_rate_Saudi Arabia,inflation_rate_United Kingdom,inflation_rate_United States,year
133289,2023-11-09,LTO,0.50,0.070485,0.071733,0.067215,0.068773,1995042,LTO Network,LTO Network,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
1372274,2023-11-09,WMT,0.50,0.154297,0.163586,0.150708,0.154750,2509539,World Mobile Token,World Mobile Token,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
1192104,2023-11-09,GOZ,0.00,0.713936,0.723211,0.692981,0.714705,161040,Göztepe S.K. Fan Token,Göztepe S.K. Fan Token,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
1565629,2023-11-09,ALEX,1.00,0.055003,0.058988,0.053239,0.054397,1350074,ALEX Lab,ᛤ ALEX 🟧 THE Finance Layer on Bitcoin ᛤᛤᛤ,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
872697,2023-11-09,NSURE,0.00,0.004474,0.005108,0.004111,0.005091,577050,Nsure.Network,Nsure.Network,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886870,2023-11-09,XFT,0.00,0.208569,0.248097,0.203687,0.248055,635,Offshift [New],Offshift project,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
410611,2023-11-09,GNO,0.75,146.972885,161.640274,144.426544,161.593536,21719962,Gnosis,Gnosis,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
599033,2023-11-09,GMAT,0.00,0.000058,0.000058,0.000056,0.000056,40187,GoWithMi,GoWithMi,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
544093,2023-11-09,HNS,0.00,0.017345,0.018745,0.017313,0.017725,72464,Handshake,Handshake,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0


# Clean up high missing value columns

In [12]:
threshold = 70

# Calculate the percentage of missing values for each column
missing_percentages = data.isnull().mean() * 100

# Identify columns with more than 70% missing values
columns_with_high_missing = missing_percentages[missing_percentages > threshold].index

# Create a list of columns to keep
columns_to_keep = [col for col in data.columns if col not in columns_with_high_missing]

print(f"Columns with more than {threshold}% missing data: {columns_with_high_missing}")
print(f"Columns to keep: {columns_to_keep}")

Columns with more than 70% missing data: Index([], dtype='object')
Columns to keep: ['date', 'symbol', 'target', 'open', 'high', 'low', 'close', 'volume', 'name', 'keywords', 'circulating_supply', 'market_cap', 'infinite_supply', 'source_code', 'fear_greed_value', 'fear_greed_classification', 'week_start', 'week_end', 'google_trend_score', 'interest_rate_Australia', 'interest_rate_Brazil', 'interest_rate_Canada', 'interest_rate_China', 'interest_rate_France', 'interest_rate_Germany', 'interest_rate_India', 'interest_rate_Italy', 'interest_rate_Japan', 'interest_rate_Mexico', 'interest_rate_Russia', 'interest_rate_South Korea', 'interest_rate_United Kingdom', 'interest_rate_United States', 'gdp_Australia', 'gdp_Brazil', 'gdp_Canada', 'gdp_China', 'gdp_France', 'gdp_Germany', 'gdp_India', 'gdp_Italy', 'gdp_Japan', 'gdp_Korea Rep', 'gdp_Mexico', 'gdp_Russian Federation', 'gdp_Saudi Arabia', 'gdp_United Kingdom', 'gdp_United States', 'inflation_rate_Australia', 'inflation_rate_Brazil', 'in