In [1]:
import pandas as pd

from utils import (
    OHLCV_FILE_PATH,
    FEAR_GREED_DATA_FILE_PATH,
    COIN_DETAILS_FILE_PATH,
    MACRO_ECO_FACTORS_DATA_FILE_PATH,
    GOOGLE_TRENDS_DATA_FILE_PATH, 
    COMPLETE_DATA_FILE_PATH,
    SAMPLE_DATA_FILE_PATH,
    get_week_start,
    get_week_end)

TRAIN_START_DATE updated to: 2020-06-01, TRAIN_END_DATE updated to: 2024-07-05


# Integrate OHLCV Data

In [2]:
ohlcv_data = pd.read_csv(OHLCV_FILE_PATH)
ohlcv_data['date'] = pd.to_datetime(ohlcv_data['date'], errors='coerce').dt.date

print(f'Rows of ohlcv data: {ohlcv_data.shape[0]}')
ohlcv_data.head()


Rows of ohlcv data: 1654463


Unnamed: 0,symbol,date,open,high,low,close,volume
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147


# Integrate coin details data

In [3]:
coin_info_data = pd.read_csv(COIN_DETAILS_FILE_PATH, index_col=None)

print(f'Rows of coin data: {coin_info_data.shape[0]}')
coin_info_data.head()

Rows of coin data: 1439


Unnamed: 0,symbol,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active
0,NEO,Neo,Neo,100000000.0,70538830.0,651943100.0,0,1,https://github.com/neo-project/neo,1
1,FARA,FaraLand,FaraLand,99930070.0,22928680.0,274767.4,0,0,,1
2,ALI,Artificial Liquid Intelligence,Artificial Liquid Intelligence,9872434000.0,6180790000.0,51509740.0,0,1,https://github.com/AI-Protocol-Official,1
3,HONEY,Hivemapper,Hivemapper,6342243000.0,2314560000.0,160370400.0,0,1,https://github.com/hivemapper,1
4,STRAX,Stratis [New],Stratis coin,1951821000.0,1951815000.0,86076230.0,1,1,https://github.com/stratisproject,1


In [4]:
# Merge the coin name with the OHLCV, Fear & Greed Index
data = ohlcv_data.merge(coin_info_data, on='symbol', how='left')

print(f'Number of rows in the data: {data.shape[0]}')
data.head()

Number of rows in the data: 1654463


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1


# Integrate Fear & Greed Index

In [5]:
fear_greed_df = pd.read_csv(FEAR_GREED_DATA_FILE_PATH)
fear_greed_df['date'] = pd.to_datetime(fear_greed_df['date'], errors='coerce').dt.date

print(f'Rows of fear & greed data: {fear_greed_df.shape[0]}')
fear_greed_df.head()

Rows of fear & greed data: 1533


Unnamed: 0,value,value_classification,date
0,50,Neutral,2020-06-01
1,56,Greed,2020-06-02
2,48,Neutral,2020-06-03
3,54,Neutral,2020-06-04
4,53,Neutral,2020-06-05


In [6]:
data = data.merge(fear_greed_df[['date', 'value', 'value_classification']], left_on='date', right_on='date', how='left')
data.rename(columns={'value': 'fear_greed_value'}, inplace=True)  # Rename the value column
data.rename(columns={'value_classification': 'fear_greed_classification'}, inplace=True)  # Rename the value column

print(f'Number of rows in the data: {data.shape[0]}')
data.head()

Number of rows in the data: 1654463


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active,fear_greed_value,fear_greed_classification
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,50.0,Neutral
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,56.0,Greed
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,48.0,Neutral
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,54.0,Neutral
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,53.0,Neutral


# Integrate Google Trends data

In [7]:
trends_data = pd.read_csv(GOOGLE_TRENDS_DATA_FILE_PATH)

trends_data['date'] = pd.to_datetime(trends_data['date'])
trends_data['date'] = trends_data['date'].dt.date

print(f'Number of rows in google trends data: {trends_data.shape[0]}')
trends_data.head()

Number of rows in google trends data: 311522


Unnamed: 0,date,symbol,name,trend_score
0,2020-06-07,BNX,BinaryX,0
1,2020-06-07,STEP,Step Finance,19
2,2020-06-07,METAL,BADMAD ROBOTS,0
3,2020-06-07,KIN,Kin,40
4,2020-06-07,BRZE,Breeze coin,0


In [8]:
# Apply the function to create a 'week_start' & 'week_end' column in the main data
data['week_start'] = data['date'].apply(get_week_start)
data['week_end'] = data['date'].apply(get_week_end)
trends_data['week_start'] = trends_data['date'].apply(get_week_start)

data['week_start'] = pd.to_datetime(data['week_start'])
data['week_start'] = pd.to_datetime(data['week_start'])
trends_data['week_start'] = pd.to_datetime(trends_data['week_start'])

data.head()


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,circulating_supply,market_cap,infinite_supply,is_open_source,source_code,is_active,fear_greed_value,fear_greed_classification,week_start,week_end
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,50.0,Neutral,2020-06-01,2020-06-07
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,56.0,Greed,2020-06-01,2020-06-07
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,48.0,Neutral,2020-06-01,2020-06-07
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,54.0,Neutral,2020-06-01,2020-06-07
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,9706750.0,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,53.0,Neutral,2020-06-01,2020-06-07


In [9]:
# Set 'symbol' and 'week_start' as the index for the trends data and forward-fill
trends_data.set_index(['symbol', 'week_start'], inplace=True)
trends_data = trends_data.groupby('symbol').ffill().reset_index()

# Merge the OHLCV data with the forward-filled trends data
data = pd.merge(data, trends_data, on=['symbol', 'week_start'], how='left')

data = data.drop(columns=['date_y', 'name_y'])
data = data.rename(columns={'date_x': 'date', 'name_x': 'name', 'trend_score': 'google_trend_score'})

# Check the merged data
print(f'Number of rows in the data: {data.shape[0]}')
data.head()


Number of rows in the data: 1654463


Unnamed: 0,symbol,date,open,high,low,close,volume,name,keywords,total_supply,...,market_cap,infinite_supply,is_open_source,source_code,is_active,fear_greed_value,fear_greed_classification,week_start,week_end,google_trend_score
0,0xBTC,2020-06-01,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,50.0,Neutral,2020-06-01,2020-06-07,0.0
1,0xBTC,2020-06-02,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,56.0,Greed,2020-06-01,2020-06-07,0.0
2,0xBTC,2020-06-03,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,48.0,Neutral,2020-06-01,2020-06-07,0.0
3,0xBTC,2020-06-04,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,54.0,Neutral,2020-06-01,2020-06-07,0.0
4,0xBTC,2020-06-05,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,...,539703.373539,0,1,https://github.com/0xbitcoin/0xbitcoin-token,1,53.0,Neutral,2020-06-01,2020-06-07,0.0


# Integrate Macro-Economic Factors

In [10]:
data['date'] = pd.to_datetime(data['date'])

economic_data = pd.read_csv(MACRO_ECO_FACTORS_DATA_FILE_PATH)
economic_data['date'] = pd.to_datetime(economic_data['date'])

# Set 'date' as the index
data.set_index('date', inplace=True)
economic_data.set_index('date', inplace=True)

# Resample monthly data to daily frequency and forward-fill values
economic_data_resampled = economic_data.resample('D').ffill()

# Reset index to get the date column back
economic_data_resampled.reset_index(inplace=True)

# Merge the two DataFrames on the date column
data = pd.merge(data, economic_data_resampled, on='date', how='left')

print(f'Number of rows in the data: {data.shape[0]}')
data.head(100)

Number of rows in the data: 1654463


Unnamed: 0,date,symbol,open,high,low,close,volume,name,keywords,total_supply,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
1,2020-06-02,0xBTC,0.145457,0.153313,0.129571,0.134045,1096847,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
2,2020-06-03,0xBTC,0.134045,0.153145,0.126926,0.140528,1139340,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
3,2020-06-04,0xBTC,0.140615,0.144362,0.123043,0.130178,1164467,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
4,2020-06-05,0xBTC,0.130178,0.134866,0.125276,0.126849,1127147,0xBitcoin,0xBitcoin,20999984.0,...,124.0019,123.8370,105.027448,122.5766,101.8071,107.8078,96.96129,106.972710,106.01,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-09-04,0xBTC,0.088969,0.102752,0.079269,0.097443,1850641,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0
96,2020-09-05,0xBTC,0.097426,0.102197,0.077869,0.083904,1485673,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0
97,2020-09-06,0xBTC,0.083904,0.097919,0.078028,0.088231,1464089,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0
98,2020-09-07,0xBTC,0.088232,0.093056,0.081729,0.092534,1487425,0xBitcoin,0xBitcoin,20999984.0,...,126.1710,124.2327,105.607244,124.0939,101.6035,107.9530,96.53792,107.533195,105.27,2020.0


# Integrate train targets

In [11]:
# train_targets_df = pd.read_parquet(TRAIN_TARGETS_PARQUET_FILE_PATH)
# train_targets_df['date'] = pd.to_datetime(train_targets_df['date'])

# data = pd.merge(data, train_targets_df, on=['symbol', 'date'], how='left')
# data['target'] = data['target'].fillna(0)
# cols = list(data.columns)
# new_order = ['date', 'symbol', 'target'] + [col for col in cols if col not in ['date', 'symbol', 'target']]
# data = data[new_order]

# # Filter out rows where 'date' falls on weekends
# data = data[data['date'].dt.weekday < 5]

# print(f'Number of rows in the data: {data.shape[0]}')
# data.head(20)

# Save the complete dataset

In [12]:
data = data.sort_values(by='date', ascending=True)

print(f'Saving the complete dataset with {data.shape[0]} rows and {len(data.columns)} columns to {COMPLETE_DATA_FILE_PATH}')

data.reset_index()
data.to_parquet(COMPLETE_DATA_FILE_PATH, index=False)

sample = data.sample(n=1000, random_state=42)
sample.to_csv(SAMPLE_DATA_FILE_PATH)
print(f'Saving the a sample of the dataset with {sample.shape[0]} rows and {len(sample.columns)} columns to {SAMPLE_DATA_FILE_PATH}')

Saving the complete dataset with 1654463 rows and 81 columns to ../data/complete_data.parquet
Saving the a sample of the dataset with 1000 rows and 81 columns to ../data/sample_data.csv


In [13]:
data.head(100)

Unnamed: 0,date,symbol,open,high,low,close,volume,name,keywords,total_supply,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,2.099998e+07,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
1026792,2020-06-01,BTT,0.000300,0.000312,0.000298,0.000311,67136753,BitTorrent [New],BitTorrent coin,9.900000e+14,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
16177,2020-06-01,ASD,0.037355,0.038924,0.037070,0.038295,2117671,ASD,ASD,7.806153e+08,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
476507,2020-06-01,NWC,0.021969,0.022889,0.021884,0.022715,238352,Numerico,Numerico,2.700505e+08,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
473474,2020-06-01,DAD,0.242849,0.408407,0.242230,0.272164,10332685,DAD,DAD,1.000000e+09,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433721,2020-06-01,FO,0.011748,0.011910,0.011311,0.011487,623063,FIBOS,FIBOS,1.075772e+09,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
19245,2020-06-01,AXE,0.234023,0.236895,0.228970,0.235851,1278480,Axe,Axe,5.260968e+06,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
432187,2020-06-01,PERL,0.016943,0.018841,0.016870,0.017971,1838056,PERL.eco,PERL.eco,1.033200e+09,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
430653,2020-06-01,CENNZ,0.075302,0.080894,0.072094,0.080242,89470,CENNZnet,CENNZnet,1.200000e+09,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0


: 