In [2]:
import pandas as pd
import json
import requests
import glob
import os
from sqlalchemy import create_engine
from bs4 import BeautifulSoup

## Load all files

In [3]:
# Read the four raw extracts (flat file, database, website, API) from the
# working directory, one DataFrame per source, in this fixed order.
file_data, database_data, website_data, api_data = map(
    pd.read_csv,
    ("file_data.csv", "database_data.csv", "website_data.csv", "api_data.csv"),
)

### Lowercase all column names to match file data

In [4]:
# Normalise the column labels of every source frame to lower case so the
# same field is spelled identically across sources. The labels are replaced
# in place on each existing frame.
for frame in (file_data, database_data, website_data, api_data):
    frame.columns = frame.columns.str.lower()

### Set timestamp as index

In [5]:
# Re-index every source frame by its `timestamp` column so the frames can be
# aligned row-wise on that key. Each name is rebound to the re-indexed copy
# returned by set_index.
file_data, database_data, website_data, api_data = (
    frame.set_index("timestamp")
    for frame in (file_data, database_data, website_data, api_data)
)

### Merge all dataframes on timestamp

In [6]:
# Combine the four sources side by side, aligning rows on the shared
# `timestamp` index. pd.concat(axis=1) performs an outer join on the index by
# default (made explicit here), so a timestamp present in any source is kept
# and the columns a source lacks for it are filled with NaN.
# pd.merge could be an alternative.
all_data_df = pd.concat(
    [file_data, database_data, website_data, api_data],
    axis=1,
    join="outer",
).sort_index()
print("Concatenated shape: ", all_data_df.shape)

# concat does not rename clashing labels: a column supplied by more than one
# source ends up in the result twice under the same name. Selecting such a
# label returns a DataFrame instead of a Series, and the name is mangled when
# the frame is written to CSV and read back. Report the clashes instead of
# passing them on silently; the frame itself is left unchanged because which
# source should win is a modelling decision.
repeated_columns = (
    all_data_df.columns[all_data_df.columns.duplicated()].unique().tolist()
)
if repeated_columns:
    print("WARNING: column names provided by more than one source: ", repeated_columns)

Concatenated shape:  (499996, 20)


In [7]:
# Display the concatenated frame as the cell output for a visual sanity check
# of the outer join (rows and columns a source does not cover show up as NaN).
all_data_df

Unnamed: 0_level_0,open,high,low,close,volume_(btc),volume_(currency),weighted_price,volumn_square,ma10,rsi30,rsi10,%k10,ema200,ma200,rsi200,%k30,rsi10,%k200,%k30,ema10
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1.325318e+09,4.39,4.39,4.39,4.39,0.455581,2.0,4.39,,,,,,,,,,,,,
1.325318e+09,,,,,,,,,,,,,,,,,,,,
1.325318e+09,,,,,,,,,,,,,,,,,,,,
1.325318e+09,,,,,,,,,,,,,,,,,,,,
1.325318e+09,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.355317e+09,,,,,,,,,,,,,,,,,,,,
1.355317e+09,,,,,,,,,,,,,,,,,,,,
1.355318e+09,,,,,,,,,,,,,,,,,,,,
1.355318e+09,,,,,,,,,,,,,,,,,,,,


In [8]:
# Persist the combined frame to the working directory; the `timestamp` index
# is written out as the first CSV column.
all_data_df.to_csv(path_or_buf="all_data.csv", index=True)

## Merge with signal

In [9]:
# Load the signal table that is merged onto the features further down.
signal = pd.read_csv(filepath_or_buffer="../data/signal.csv")

In [10]:
# Remove the "Unnamed: 0" column and index the signal by `timestamp`, the
# same key the feature frames are indexed by.
signal = (
    signal
    .drop(columns="Unnamed: 0")
    .set_index("timestamp")
)

In [11]:
# Keep only the timestamps present in both the combined features and the
# signal (inner join). Both frames are indexed by `timestamp`, so the join
# key is resolved against the index level of that name.
feats_full = pd.merge(all_data_df, signal, how="inner", on="timestamp")
feats_full

Unnamed: 0_level_0,open,high,low,close,volume_(btc),volume_(currency),weighted_price,volumn_square,ma10,rsi30,...,%k10,ema200,ma200,rsi200,%k30,rsi10,%k200,%k30,ema10,signal
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.325601e+09,5.14,5.14,5.14,5.14,0.680000,3.495200,5.140000,,4.781,,...,80.645161,,,,,,,,4.941577,0.0
1.325602e+09,5.26,5.26,5.26,5.26,29.319392,154.220000,5.260000,,4.868,,...,93.548387,,,,,,,,5.006627,0.0
1.325605e+09,5.29,5.29,5.29,5.29,29.302457,155.010000,5.290000,,4.958,,...,96.341463,,,,,,,,5.063244,0.0
1.325611e+09,5.29,5.29,5.29,5.29,11.285444,59.700000,5.290000,,5.030,,...,95.945946,,,,,,,,5.107749,0.0
1.325611e+09,5.14,5.14,5.14,5.14,0.020000,0.102800,5.140000,,5.086,,...,75.675676,,,,,,,,5.113989,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.351990e+09,10.68,10.68,10.68,10.68,1.904494,20.340000,10.680000,,10.632,53.396343,...,100.000000,10.625978,10.59670,50.050111,100.000000,,,,10.655024,0.0
1.351992e+09,10.64,10.64,10.45,10.45,54.200000,568.976056,10.497713,,10.620,41.725604,...,0.000000,10.624227,10.59605,48.503938,0.000000,,,,10.617747,0.0
1.351994e+09,10.64,10.64,10.64,10.64,1.957707,20.830000,10.640000,,10.632,50.897150,...,82.608696,10.624384,10.59565,49.791687,82.608696,,,,10.621793,0.0
1.351995e+09,10.63,10.63,10.63,10.63,5.200000,55.276000,10.630000,,10.638,50.464714,...,78.260870,10.624440,10.59520,49.725912,78.260870,,,,10.623285,0.0


In [12]:
# List the column labels of the merged feature/signal frame.
# NOTE(review): the output below lists 'rsi10' and '%k30' twice, i.e. the
# frame carries duplicate column labels — confirm which copy is intended
# before this frame is used downstream.
feats_full.columns

Index(['open', 'high', 'low', 'close', 'volume_(btc)', 'volume_(currency)',
       'weighted_price', 'volumn_square', 'ma10', 'rsi30', 'rsi10', '%k10',
       'ema200', 'ma200', 'rsi200', '%k30', 'rsi10', '%k200', '%k30', 'ema10',
       'signal'],
      dtype='object')

In [87]:
# Persist the merged feature/signal frame to the working directory; the
# `timestamp` index is written out as the first CSV column.
feats_full.to_csv(path_or_buf="train.csv", index=True)