## Imports

In [1]:
import io
import json
import requests
import functools
import numpy as np
import pandas as pd
from tqdm import tqdm
import math

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from torchvision import datasets, models, transforms

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

## Обработка csv с https://data.binance.vision/?prefix=data/spot/daily/klines/BTCUSDT/1m/

In [2]:
# Load the kline CSV; its columns are addressed purely by position below.
df = pd.read_csv('BTCUSDT_2022_Jun.csv')

# Column position -> name used throughout the notebook for the columns we keep.
kept_columns = {
    0: 'timestamp',
    1: 'Open',
    2: 'High',
    3: 'Low',
    4: 'Close',
    5: 'Volume',
    8: 'Count',
}
# Positions of the columns that are not used and get removed.
dropped_positions = (6, 7, 9, 10, 11)

# Resolve positions to the labels found in the file before touching the frame.
source_labels = df.columns
new_names = {source_labels[pos]: name for pos, name in kept_columns.items()}
unused_labels = [source_labels[pos] for pos in dropped_positions]

df = df.rename(columns=new_names).drop(columns=unused_labels)

In [3]:
# Number of missing values in each column.
df.isnull().sum()

timestamp    0
Open         0
High         0
Low          0
Close        0
Volume       0
Count        0
dtype: int64

#### NaN есть только в последней строке. Удаляем её.

In [4]:
# Remove every row that has at least one missing value, then summarise the frame.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285131 entries, 0 to 285130
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   timestamp  285131 non-null  int64  
 1   Open       285131 non-null  float64
 2   High       285131 non-null  float64
 3   Low        285131 non-null  float64
 4   Close      285131 non-null  float64
 5   Volume     285131 non-null  float64
 6   Count      285131 non-null  int64  
dtypes: float64(5), int64(2)
memory usage: 15.2 MB


### Добавим столбец target

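«target» будем считать из логарифмической доходности ($R$) за 15 минут. Для упрощения формула не будет учитывать тенденции рынка.

$$\mathrm{Target}(t) = \log\left(\frac{P(t+16)}{P(t+1)}\right)$$

**Примечание.** Код ниже эту формулу не вычисляет: в нём `Target` — бинарная метка, равная 1, если максимум `High` в окне из 15 строк, начиная с текущей, не меньше `Open` + 25, и 0 в противном случае.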




In [5]:
# Create the label column filled with zeros as a placeholder, then preview the frame.
df['Target'] = 0
df.head(n=20)

Unnamed: 0,timestamp,Open,High,Low,Close,Volume,Count,Target
0,1655337600000,22583.72,22622.47,22555.33,22568.17,104.35613,1934,0
1,1655337660000,22568.17,22606.09,22553.43,22570.41,105.16669,1831,0
2,1655337720000,22570.42,22653.85,22570.0,22631.85,133.63109,2018,0
3,1655337780000,22633.99,22714.34,22631.84,22667.71,142.14442,2500,0
4,1655337840000,22667.7,22746.25,22660.96,22740.04,118.55492,1818,0
5,1655337900000,22740.04,22759.99,22681.04,22733.98,160.44576,2352,0
6,1655337960000,22733.97,22738.03,22680.0,22684.55,115.44768,1573,0
7,1655338020000,22684.55,22724.48,22670.92,22686.08,108.4647,1698,0
8,1655338080000,22686.07,22789.85,22683.24,22779.99,133.97855,2017,0
9,1655338140000,22780.0,22806.59,22734.25,22734.25,163.47718,2397,0


In [6]:
HORIZON_ROWS = 15     # look-ahead window length in rows, the current row included
MIN_PRICE_GAIN = 25   # required rise of High above the row's Open, in price units


def label_price_jumps(frame, horizon=HORIZON_ROWS, min_gain=MIN_PRICE_GAIN):
    """Return a 0/1 label per row: did the price rise by `min_gain` soon after?

    A row is labelled 1 when the highest ``High`` among that row and the
    following ``horizon - 1`` rows is at least the row's ``Open`` plus
    ``min_gain``; otherwise it is labelled 0. Near the end of the frame the
    window is simply shorter (whatever rows remain are used).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain numeric ``Open`` and ``High`` columns.
    horizon : int
        Number of rows in the forward-looking window (current row included).
    min_gain : float
        Minimum difference ``max(High) - Open`` that counts as a jump.

    Returns
    -------
    pandas.Series
        int64 labels sharing ``frame``'s index.
    """
    # Forward-looking window: for row i it spans rows i .. i + horizon - 1.
    forward_window = pd.api.indexers.FixedForwardWindowIndexer(window_size=horizon)
    # min_periods=1 keeps the truncated windows at the tail instead of yielding NaN.
    future_high = frame['High'].rolling(window=forward_window, min_periods=1).max()
    return (future_high >= frame['Open'] + min_gain).astype('int64')


# Whole-column assignment: positional, so it does not depend on a gap-free
# integer index, and it avoids chained indexing (df['Target'][i] = ...).
df['Target'] = label_price_jumps(df)

df['Target'].value_counts()

0    145694
1    139437
Name: Target, dtype: int64

In [7]:
# Print the timestamp of every row labelled 1, in row order.
positive_timestamps = df.loc[df['Target'] == 1, 'timestamp']
for stamp in positive_timestamps:
    print(stamp)

1655337600000
1655337660000
1655337720000
1655337780000
1655337840000
1655337900000
1655337960000
1655338020000
1655338080000
1655338140000
1655338260000
1655338320000
1655338380000
1655338620000
1655338680000
1655338740000
1655338800000
1655338860000
1655338920000
1655338980000
1655339040000
1655339100000
1655339160000
1655339220000
1655339280000
1655339340000
1655339400000
1655339460000
1655339520000
1655339580000
1655339640000
1655339700000
1655339760000
1655339820000
1655340060000
1655340120000
1655340180000
1655340240000
1655340300000
1655340360000
1655340420000
1655340480000
1655340540000
1655340600000
1655340660000
1655340720000
1655340780000
1655340840000
1655340900000
1655340960000
1655341020000
1655341080000
1655341140000
1655341200000
1655341260000
1655341320000
1655341380000
1655341440000
1655341500000
1655341560000
1655341620000
1655341680000
1655341740000
1655341800000
1655341860000
1655341920000
1655341980000
1655342040000
1655342100000
1655342160000
1655342220000
165534

In [8]:
# Keep only the rows whose Target is not an empty string.
has_label = df['Target'].ne('')
df = df.loc[has_label]

# Store Target as float, going through object dtype first.
target_as_object = df['Target'].astype(object)
df['Target'] = target_as_object.astype(float)

df

Unnamed: 0,timestamp,Open,High,Low,Close,Volume,Count,Target
0,1655337600000,22583.72,22622.47,22555.33,22568.17,104.35613,1934,1.0
1,1655337660000,22568.17,22606.09,22553.43,22570.41,105.16669,1831,1.0
2,1655337720000,22570.42,22653.85,22570.00,22631.85,133.63109,2018,1.0
3,1655337780000,22633.99,22714.34,22631.84,22667.71,142.14442,2500,1.0
4,1655337840000,22667.70,22746.25,22660.96,22740.04,118.55492,1818,1.0
...,...,...,...,...,...,...,...,...
285126,1672445160000,16599.34,16600.32,16594.37,16594.44,68.09939,2465,0.0
285127,1672445220000,16594.44,16600.91,16593.56,16599.84,88.83927,2709,0.0
285128,1672445280000,16599.84,16601.74,16594.96,16599.30,83.76568,2463,0.0
285129,1672445340000,16599.30,16600.21,16593.64,16595.43,44.73432,1797,0.0


#### " 0   timestamp  886125 non-null  object" => object -> int 

In [9]:
# Parse the timestamps as floats, going through str so textual values are accepted too.
timestamp_as_float = df['timestamp'].astype(str).astype(float)

# Divide by 1000 and truncate to integers
# (the sample values look like epoch milliseconds, so this would give seconds).
df['timestamp'] = (timestamp_as_float / 1000).astype(float).astype(int)


df

Unnamed: 0,timestamp,Open,High,Low,Close,Volume,Count,Target
0,1655337600,22583.72,22622.47,22555.33,22568.17,104.35613,1934,1.0
1,1655337660,22568.17,22606.09,22553.43,22570.41,105.16669,1831,1.0
2,1655337720,22570.42,22653.85,22570.00,22631.85,133.63109,2018,1.0
3,1655337780,22633.99,22714.34,22631.84,22667.71,142.14442,2500,1.0
4,1655337840,22667.70,22746.25,22660.96,22740.04,118.55492,1818,1.0
...,...,...,...,...,...,...,...,...
285126,1672445160,16599.34,16600.32,16594.37,16594.44,68.09939,2465,0.0
285127,1672445220,16594.44,16600.91,16593.56,16599.84,88.83927,2709,0.0
285128,1672445280,16599.84,16601.74,16594.96,16599.30,83.76568,2463,0.0
285129,1672445340,16599.30,16600.21,16593.64,16595.43,44.73432,1797,0.0


In [10]:
# Promote the timestamp column to the frame's index (the column itself is removed).
df = df.set_index(keys="timestamp", drop=True)

In [11]:
# Display the frame as this cell's output.
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Count,Target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1655337600,22583.72,22622.47,22555.33,22568.17,104.35613,1934,1.0
1655337660,22568.17,22606.09,22553.43,22570.41,105.16669,1831,1.0
1655337720,22570.42,22653.85,22570.00,22631.85,133.63109,2018,1.0
1655337780,22633.99,22714.34,22631.84,22667.71,142.14442,2500,1.0
1655337840,22667.70,22746.25,22660.96,22740.04,118.55492,1818,1.0
...,...,...,...,...,...,...,...
1672445160,16599.34,16600.32,16594.37,16594.44,68.09939,2465,0.0
1672445220,16594.44,16600.91,16593.56,16599.84,88.83927,2709,0.0
1672445280,16599.84,16601.74,16594.96,16599.30,83.76568,2463,0.0
1672445340,16599.30,16600.21,16593.64,16595.43,44.73432,1797,0.0


In [12]:
# Write the prepared frame to disk; the index is written as the first CSV column.
df.to_csv(path_or_buf="train_2022_Jun.csv", index=True)