In [1]:
import numpy as np
import pandas as pd
from functools import reduce

## Tweets

In [2]:
# Keep only the first two columns of the raw XRP StockTwits export.
xrp = pd.read_csv("raw_data/XRP_stocktwits.csv").iloc[:, :2]
# Show the earliest and latest timestamp present in the tweets.
# Need to filter > 2022 in other datasets
min(xrp["timestamp"]), max(xrp["timestamp"])

('2022-01-07T00:01:14.005660+00:00', '2022-02-28T23:52:34.006678')

In [3]:
# Write the trimmed XRP tweets frame to the cleaned-data folder (no index column).
xrp.to_csv("data/xrp_tweets.csv", index=False)

### CryptoDataDownload data

In [4]:
def clean_binance_data(filename, start = "2022-01-01"):
    """Load a Binance candle CSV and keep only the rows dated after ``start``.

    The first line of the file is skipped before the header is read, the
    ``unix`` and ``symbol`` columns are dropped, rows are filtered on the raw
    ``date`` strings, and only the surviving rows are parsed to datetimes and
    sorted chronologically. The frame's shape is printed before and after
    cleaning.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    start : str, default "2022-01-01"
        Exclusive lower bound. The comparison is done on the *unparsed*
        ``date`` strings, i.e. lexicographically, so ``start`` should use the
        same year-first layout as the file. A row such as
        ``"2022-01-01 00:00:00"`` compares greater than ``"2022-01-01"`` and
        is therefore kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with ``date`` as a datetime column, sorted ascending.
    """
    raw = pd.read_csv(filename, skiprows = 1)
    print("Shape before:", raw.shape)
    cleaned = (
        raw.drop(columns = ["unix", "symbol"])
        # Filter on the raw strings first so only the kept rows get parsed.
        .loc[lambda frame: frame["date"] > start]
        # ``assign`` builds a new frame, so the parsed column is never written
        # into a filtered slice (which would raise SettingWithCopyWarning).
        .assign(date = lambda frame: pd.to_datetime(frame["date"]))
        .sort_values("date")
    )
    print("Shape after:", cleaned.shape)
    return cleaned

#### XRP

In [5]:
# Daily XRP/USDT candles: clean the raw Binance file and save the result.
day_xrp = clean_binance_data("raw_data/Binance_XRPUSDT_d.csv")
day_xrp.to_csv("data/XRPUSDT_day.csv", index=False)

Shape before: (1398, 10)
Shape after: (60, 8)


In [6]:
# Hourly XRP/USDT candles: clean the raw Binance file and save the result.
hour_xrp = clean_binance_data("raw_data/Binance_XRPUSDT_1h.csv")
hour_xrp.to_csv("data/XRPUSDT_hour.csv", index=False)

Shape before: (13808, 10)
Shape after: (1417, 8)


In [7]:
# Minute-level XRP/USDT candles: clean the raw Binance file and save the result.
minute_xrp = clean_binance_data("raw_data/Binance_XRPUSDT_minute.csv")
minute_xrp.to_csv("data/XRPUSDT_minute.csv", index=False)

Shape before: (1124679, 10)
Shape after: (85148, 8)


#### BTC

In [8]:
# Daily BTC/USDT candles: clean the raw Binance file and save the result.
day_btc = clean_binance_data("raw_data/Binance_BTCUSDT_d.csv")
day_btc.to_csv("data/BTCUSDT_day.csv", index=False)

Shape before: (1662, 10)
Shape after: (63, 8)


In [9]:
# Hourly BTC/USDT candles: clean the raw Binance file and save the result.
hour_btc = clean_binance_data("raw_data/Binance_BTCUSDT_1h.csv")
hour_btc.to_csv("data/BTCUSDT_hour.csv", index=False)

Shape before: (42464, 10)
Shape after: (1489, 8)


In [10]:
# Minute-level BTC/USDT candles. Kept under their own name so the XRP minute
# frame (`minute_xrp`) is not overwritten with BTC data.
minute_btc = clean_binance_data('raw_data/Binance_BTCUSDT_minute.csv')
minute_btc.to_csv("data/BTCUSDT_minute.csv", index = False)

Shape before: (1301250, 10)
Shape after: (89496, 8)


### NASDAQ

In [13]:
def clean_nasdaq_yh_data(filename, start = "2022-01-01"):
    """Load a daily price CSV and keep only the rows dated after ``start``.

    The ``Date`` column is parsed to datetimes, rows on or before ``start``
    are dropped, and the result is sorted chronologically. The frame's shape
    is printed before and after cleaning.

    Parameters
    ----------
    filename : str
        Path of a CSV file that contains a ``Date`` column.
    start : str or datetime-like, default "2022-01-01"
        Exclusive lower bound; only rows with ``Date > start`` are kept.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with ``Date`` as a datetime column, sorted ascending.
    """
    df = pd.read_csv(filename)
    print("Shape before: ", df.shape)
    # Parse before filtering so the cutoff is compared as a date, not as text.
    df["Date"] = pd.to_datetime(df.Date)
    df = df[df["Date"] > start]
    df = df.sort_values("Date")
    print("Shape after: ", df.shape)
    return df

#### Gold

In [15]:
# Daily gold prices: clean the raw file and save the result.
gold = clean_nasdaq_yh_data("raw_data/GOLDUSD.csv")
gold.to_csv("data/GOLDUSD_day.csv", index=False)

Shape before:  (125, 6)
Shape after:  (42, 6)


#### S&P 500

In [17]:
# Daily S&P 500 prices: clean the raw file and save the result.
snp = clean_nasdaq_yh_data("raw_data/SNP500USD.csv")
snp.to_csv("data/SNP500USD_day.csv", index=False)

Shape before:  (125, 6)
Shape after:  (42, 6)


#### VIX 

In [18]:
# Daily VIX values: clean the raw file and save the result.
vix = clean_nasdaq_yh_data("raw_data/VIXUSD.csv")
vix.to_csv("data/VIXUSD_day.csv", index=False)

Shape before:  (254, 7)
Shape after:  (43, 7)


### Joined prices

In [19]:
# One representative daily price per asset: the plain average of the four
# open / high / low / close values. Column names differ between sources.
for candles in (day_xrp, day_btc):
    candles["price"] = (
        candles["open"] + candles["high"] + candles["low"] + candles["close"]
    ) / 4

for quotes in (gold, snp):
    quotes["price"] = (
        quotes["Open"] + quotes["High"] + quotes["Low"] + quotes["Close/Last"]
    ) / 4

# The VIX file names its closing column "Close" rather than "Close/Last".
vix["price"] = (vix["Open"] + vix["High"] + vix["Low"] + vix["Close"]) / 4

In [20]:
# Reduce each asset to the shared "Date" key plus the columns kept in the
# merged table, renaming "price" to the asset's own name.
xrpjoin = day_xrp[["date", "price", "Volume USDT", "tradecount"]].rename(
    columns={"date": "Date", "price": "xrp", "Volume USDT": "XRP_volume_usdt"}
)
btcjoin = day_btc[["date", "price"]].rename(columns={"date": "Date", "price": "btc"})
goldjoin = gold[["Date", "price"]].rename(columns={"price": "gold"})
snpjoin = snp[["Date", "price"]].rename(columns={"price": "snp"})
vixjoin = vix[["Date", "price"]].rename(columns={"price": "vix"})

In [21]:
dfs_to_join = [xrpjoin, btcjoin, goldjoin, snpjoin, vixjoin]

# Fold the frames together left to right with the default (inner) merge on
# "Date", so only dates present in every frame survive.
df, *remaining = dfs_to_join
for right in remaining:
    df = pd.merge(df, right, on="Date")

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 0 to 39
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             40 non-null     datetime64[ns]
 1   xrp              40 non-null     float64       
 2   XRP_volume_usdt  40 non-null     float64       
 3   tradecount       40 non-null     int64         
 4   btc              40 non-null     float64       
 5   gold             40 non-null     float64       
 6   snp              40 non-null     float64       
 7   vix              40 non-null     float64       
dtypes: datetime64[ns](1), float64(6), int64(1)
memory usage: 2.8 KB


In [22]:
# Save the merged daily price table (no index column).
df.to_csv("data/all_prices.csv", index=False)

In [23]:
# Display the merged price table as a visual sanity check.
df

Unnamed: 0,Date,xrp,XRP_volume_usdt,tradecount,btc,gold,snp,vix
0,2022-01-03,0.84275,149876100.0,232484,46749.57,1815.35,4782.3775,17.325
1,2022-01-04,0.82005,177691900.0,270966,46333.9125,1807.75,4797.735,16.9075
2,2022-01-05,0.785425,294783800.0,482373,44713.285,1819.8,4746.4275,18.3875
3,2022-01-06,0.77005,234402000.0,415169,43195.0075,1799.2,4696.4275,20.01
4,2022-01-07,0.762975,281919700.0,511973,42101.1525,1792.025,4686.345,19.495
5,2022-01-10,0.737125,228730500.0,359880,41396.4025,1796.625,4645.2225,20.4
6,2022-01-11,0.7544,190447500.0,285391,42230.1775,1810.65,4683.6525,19.3075
7,2022-01-12,0.785475,231115500.0,330205,43350.9875,1822.8,4727.62,17.960001
8,2022-01-13,0.78235,178544900.0,247605,43318.495,1821.8,4696.7525,19.1075
9,2022-01-14,0.77305,170797400.0,242416,42698.26,1820.525,4645.18,20.105


In [None]:
# Keep only the numeric columns for plotting and standardization. "Date" is
# dropped by name rather than by position, so re-running this cell is a no-op
# instead of silently removing the next column (xrp, then XRP_volume_usdt, ...).
df = df.drop(columns=["Date"], errors="ignore")

In [None]:
# Line plot of every remaining column on one shared axis, in raw units.
df.plot(figsize=(15, 7))

In [None]:
# Z-score each column: subtract its mean and divide by its sample standard
# deviation (ddof=1).
standartizeddf = df.sub(df.mean()).div(df.std(ddof=1))

In [None]:
# Same line plot as before, now on the standardized (z-scored) columns.
standartizeddf.plot(figsize=(15, 7))