In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# --- Configuration -----------------------------------------------------------
# Number of consecutive price points taken after each tweet.
TIME_SERIES_SIZE = 256

# Input locations of the cleaned per-ticker price and tweet CSV files.
IN_PRICES_DATA_FOLDER = "clean_data/prices/"
IN_TWEETS_DATA_FOLDER = "clean_data/tweets/"

# Output folder for the generated train/validation CSV files.
OUT_DATA_FOLDER = "train_val_data/"

# Upper bound on the number of tweet rows read per ticker.
USEROWS = 100_000

def prepare_dataset(ticker):
    """Build and save the train/validation dataset for one ticker.

    Reads ``<ticker>_tweets.csv`` (first ``USEROWS`` rows) and
    ``<ticker>USDT_minute.csv``. For every tweet it takes the first
    ``TIME_SERIES_SIZE`` price rows whose ``Date`` is at or after the tweet
    timestamp, and expresses them as the percentage change relative to the
    first price of that window. The price used is the mean of ``Open`` and
    ``Close``.

    Tweets that do not have a full window of ``TIME_SERIES_SIZE`` prices after
    them are skipped. The remaining tweets, minus the ``username`` and
    ``timestamp`` columns, are joined with their price windows (columns
    ``"0"`` .. ``str(TIME_SERIES_SIZE - 1)``), split 75/25 with a fixed seed,
    and written to ``OUT_DATA_FOLDER`` as ``<ticker>_train.csv`` and
    ``<ticker>_val.csv``.

    Parameters
    ----------
    ticker : str
        Ticker symbol used to build the input and output file names
        (e.g. ``"BTC"``).

    Returns
    -------
    None
        The result is written to disk.
    """
    import os  # local import: only needed to make sure the output folder exists

    tweets = pd.read_csv(
        IN_TWEETS_DATA_FOLDER + ticker + "_tweets.csv",
        lineterminator='\n',
        parse_dates=['timestamp'],
    ).head(USEROWS)
    prices = pd.read_csv(
        IN_PRICES_DATA_FOLDER + ticker + "USDT_minute.csv",
        parse_dates=['Date'],
    )
    prices["price"] = (prices.Open + prices.Close) / 2

    series_columns = [str(i) for i in range(TIME_SERIES_SIZE)]
    indexes = []  # positional indexes of the tweets that got a full window
    windows = []  # one array of TIME_SERIES_SIZE relative changes per kept tweet
    for i in tqdm(range(tweets.shape[0])):
        window = prices.loc[prices.Date >= tweets.timestamp.iat[i], "price"].head(TIME_SERIES_SIZE)
        # Check the length BEFORE touching window.iat[0]: an empty window would
        # raise IndexError, and a short one cannot fill all series columns.
        if window.shape[0] != TIME_SERIES_SIZE:
            continue
        indexes.append(i)
        windows.append((100 * (window / window.iat[0] - 1)).to_numpy())

    # Build the frame once instead of concatenating inside the loop (quadratic).
    timeseries = pd.DataFrame(windows, columns=series_columns)

    # concat(axis=1) aligns on index labels, so the kept tweets must be
    # renumbered 0..n-1 to line up row-for-row with `timeseries`.
    kept_tweets = tweets.iloc[indexes].reset_index(drop=True)
    df = pd.concat([kept_tweets, timeseries], axis=1)
    df = df.drop(['username', 'timestamp'], axis=1)

    train, val = train_test_split(df, test_size=0.25, random_state=42)

    os.makedirs(OUT_DATA_FOLDER, exist_ok=True)
    train.to_csv(OUT_DATA_FOLDER + ticker + "_train.csv", index=False)
    val.to_csv(OUT_DATA_FOLDER + ticker + "_val.csv", index=False)

In [None]:
# Generate the train/validation CSV files for every supported ticker.
for ticker in ("BTC", "ETH", "XRP"):
    prepare_dataset(ticker)