In [119]:
import cupy as cp
import pandas as pd 
import numpy as np
import math
from scipy.fft import fft
from tqdm import tqdm

In [12]:
%%time
def buy_and_sell_orders_from_dataframe(df : pd.DataFrame) -> list:
    minima = argrelextrema(df['Close'].to_numpy(), np.less)[0]
    maxima = argrelextrema(df['Close'].to_numpy(), np.greater)[0]
    minmax_flag = list()
    for i in range(len(df)):
        if i in minima:
            minmax_flag.append(1) # local minima buy order
            continue
        if i in maxima:
            minmax_flag.append(2) # local maxima sell order
            continue
        minmax_flag.append(0)
    return minmax_flag

def add_lag_feature(df : pd.DataFrame, lags : int) -> pd.DataFrame:
    for i in range(1,lags+1):
        df[f'lag_{i}'] = df.Close.shift(i)
    return df

def add_rolling_mean(df : pd.DataFrame, window_size=3) -> pd.DataFrame:
    df[f'rma_{window_size}'] = df.Close.rolling(window=window_size).mean()
    return df

def add_rolling_var(df : pd.DataFrame, window_size=3) -> pd.DataFrame:
    df[f'rva_{window_size}'] = df.Close.rolling(window=window_size).var()
    df[f'rstda_{window_size}'] = df.Close.rolling(window=window_size).std()
    return df

def apply_fourier_transform(df : pd.DataFrame) -> pd.DataFrame:
    values = df['Close'].values
    fourier_transform = fft(values)
    df['fourier_transform'] = np.abs(fourier_transform)
    return df

def add_first_order_derivative(df : pd.DataFrame) -> pd.DataFrame:
    df['1der'] = df.Close.diff()
    return df

def add_second_order_derivative(df : pd.DataFrame) -> pd.DataFrame:
    df['2der'] = df['1der'].diff()
    return df

def add_math_expressions(df : pd.DataFrame) -> pd.DataFrame:
    df['log_close'] = np.log(df.Close)
    df['log_1der'] = df['1der'].apply(lambda der : np.log(der) if der != 0 else np.nan)
    df['log_return'] = df.apply(lambda x : np.log(np.absolute(x['Close']/x['1der'])) if x['1der'] != 0 else np.nan,axis=1)# np.log(df.Close/df['1der'])
    return df

def add_adjusted_volatility(df : pd.DataFrame) -> pd.DataFrame:
    norm_r = np.power(np.sum(np.power(df.Close,2)),1./2)
    mean_r = np.mean(norm_r)
    std_r = np.std(norm_r)
    mean_c = np.mean(df.Close)
    std_c = np.std(df.Close)
    df['adjusted_volatility'] = (std_r/std_c)*(norm_r - mean_c + (std_c/std_r) * mean_r) if not math.isclose(std_c, 0, abs_tol=1e-8) and not math.isclose(std_r, 0, abs_tol=1e-8) else 0 
    return df

def add_macd(df : pd.DataFrame,ema12,ema26) -> pd.DataFrame:
    df[f'EMA{ema12}'] = df.Close.ewm(alpha=1/ema12, adjust=False).mean()
    df[f'EMA{ema26}'] = df.Close.ewm(alpha=1/ema26, adjust=False).mean()
    df['MACD'] = df.EMA12 - df.EMA26
    return df

def add_rsi(df: pd.DataFrame,window) -> pd.DataFrame: 
    change = df["Close"].diff()
    change_up = change.copy()
    change_down = change.copy()
    change_up[change_up<0] = 0
    change_down[change_down>0] = 0
    avg_up = change_up.rolling(window).mean()
    avg_down = change_down.rolling(window).mean().abs()
    df[f'rsi_{window}'] = 100 * avg_up / (avg_up + avg_down)
    return df

def add_williams_r(df: pd.DataFrame, window) -> pd.DataFrame:
    highh = df.High.rolling(window).max() 
    lowl = df.Low.rolling(window).min()
    df[f'williams_r_{window}'] = -100 * ((highh - df.Close) / (highh - lowl))
    return df

def add_stochastic_oscillator(df: pd.DataFrame,window) -> pd.DataFrame:
    d = df.Close.rolling(window).mean()
    highh = df.High.max() 
    lowl = df.Low.min()
    k = (df.Close - lowl)/(highh - lowl)
    df[f'stochastic_oscillator_{window}'] = k - d
    return df

def add_price_rate_of_change(df : pd.DataFrame,shift) -> pd.DataFrame:
    c_12 = df.Close.shift(shift)
    df[f'proc_{shift}'] = (df.Close - c_12)/c_12
    return df

def add_ADX(df: pd.DataFrame, period: int):

    alpha = 1/period
    # TR
    df['H-L'] = df['High'] - df['Low']
    df['H-C'] = np.abs(df['High'] - df['Close'].shift(1))
    df['L-C'] = np.abs(df['Low'] - df['Close'].shift(1))
    df['TR'] = df[['H-L', 'H-C', 'L-C']].max(axis=1)
    del df['H-L'], df['H-C'], df['L-C']
    # ATR
    df['ATR'] = df['TR'].ewm(alpha=alpha, adjust=False).mean()
    # +-DX
    df['H-pH'] = df['High'] - df['High'].shift(1)
    df['pL-L'] = df['Low'].shift(1) - df['Low']
    df['+DX'] = np.where(
        (df['H-pH'] > df['pL-L']) & (df['H-pH']>0),
        df['H-pH'],
        0.0
    )
    df['-DX'] = np.where(
        (df['H-pH'] < df['pL-L']) & (df['pL-L']>0),
        df['pL-L'],
        0.0
    )
    del df['H-pH'], df['pL-L']
    # +- DMI
    df['S+DM'] = df['+DX'].ewm(alpha=alpha, adjust=False).mean()
    df['S-DM'] = df['-DX'].ewm(alpha=alpha, adjust=False).mean()
    df['+DMI'] = (df['S+DM']/df['ATR'])*100
    df['-DMI'] = (df['S-DM']/df['ATR'])*100
    del df['S+DM'], df['S-DM']
    # ADX
    df['DX'] = (np.abs(df['+DMI'] - df['-DMI'])/(df['+DMI'] + df['-DMI']))*100
    df[f'ADX_{period}'] = df['DX'].ewm(alpha=alpha, adjust=False).mean()
    del df['DX'], df['ATR'], df['TR'], df['-DX'], df['+DX'], df['+DMI'], df['-DMI']
    return df

def add_Bollinger_bands(df : pd.DataFrame, window) -> pd.DataFrame:
    df[f'Mid_band_{window}'] = df['Close'].rolling(window=window).mean()
    df[f'Upper_band_{window}'] = df[f'Mid_band_{window}'] + 2 * df['Close'].rolling(window=window).std()
    df[f'Lower_band_{window}'] = df[f'Mid_band_{window}'] - 2 * df['Close'].rolling(window=window).std()
    return df

def add_sp500(df : pd.DataFrame,sp500) -> pd.DataFrame:
    df.reset_index("Date",inplace=True)
    sp500 = sp500[['Close','Date']].rename({'Close':'sp500'},axis=1)
    tmp_data = df.copy(deep=True)
    tmp_data.Date = df.Date.astype(str)
    sp500.Date = sp500.Date.astype(str)
    tmp_df = tmp_data.merge(sp500,on="Date",how="left")
    tmp_df.Date = pd.to_datetime(tmp_df.Date)
    tmp_df = tmp_df.set_index('Date')
    return tmp_df

def add_beta(df : pd.DataFrame) -> pd.DataFrame:
    df['re'] = df.Close.pct_change()
    df['rm'] = df.sp500.pct_change()
    var_rm = np.var(df.rm.to_numpy())
    cov = np.cov(df.rm.to_numpy(),df.re.to_numpy())[0][1]
    del df['re'], df['rm']
    df.beta = cov/var_rm
    return df

def add_percentage_change(df : pd.DataFrame) -> pd.DataFrame:
    df['pct_change'] = df.Close.pct_change()
    return df
    
def add_time_feature(df : pd.DataFrame) -> pd.DataFrame:
    df.reset_index("Date",inplace=True)
    df['year'] = df.Date.dt.year
    df['month'] = df.Date.dt.month
    df['day'] = df.Date.dt.dayofweek
    df.set_index('Date',inplace=True)
    return df

def add_interest_rate(df : pd.DataFrame) -> pd.DataFrame:
    tmp_data = df.reset_index().copy(deep=True)
    tmp_data.Date = tmp_data.Date.astype(str)
    int_rate.tmp_date = int_rate.tmp_date.astype(str)
    
    tmp_data['tmp_date'] = tmp_data['Date'].apply(lambda date : str(date)[:-3])
    
    tmp_data = tmp_data.merge(int_rate, on="tmp_date", how="left")
    tmp_data.Date = pd.to_datetime(tmp_data.Date)
    tmp_data = tmp_data.set_index('Date')
    tmp_data.drop('tmp_date',axis=1,inplace=True)
    return tmp_data

def create_dataframe(data):
    symb = np.unique(data.symb)
    dataframes = []
    
    for token in symb:
        tmp_df = data[data.symb == token].copy()
        tmp_df = add_lag_feature(tmp_df,15)
        tmp_df = add_rolling_mean(tmp_df,3)
        tmp_df = add_rolling_mean(tmp_df,10)
        tmp_df = add_rolling_mean(tmp_df,30)
        tmp_df = add_rolling_mean(tmp_df,50)
        tmp_df = add_rolling_mean(tmp_df,65)
        tmp_df = add_rolling_mean(tmp_df,80)
        tmp_df = add_rolling_mean(tmp_df,100)
        tmp_df = add_rolling_var(tmp_df,3)
        tmp_df = add_rolling_var(tmp_df,10)
        tmp_df = add_rolling_var(tmp_df,30)
        tmp_df = add_rolling_var(tmp_df,50)
        tmp_df = add_rolling_var(tmp_df,65)
        tmp_df = add_rolling_var(tmp_df,80)
        tmp_df = add_rolling_var(tmp_df,100)
        tmp_df = add_first_order_derivative(tmp_df)
        tmp_df = add_second_order_derivative(tmp_df)
        tmp_df = apply_fourier_transform(tmp_df)
        tmp_df = add_math_expressions(tmp_df)
        tmp_df =add_adjusted_volatility(tmp_df)
        tmp_df =add_macd(tmp_df,12,26)
        tmp_df =add_macd(tmp_df,6,18)
        tmp_df =add_macd(tmp_df,3,9)
        tmp_df =add_macd(tmp_df,2,6)
        tmp_df =add_macd(tmp_df,24,40)
        tmp_df =add_macd(tmp_df,10,29)
        tmp_df =add_rsi(tmp_df,3)
        tmp_df =add_rsi(tmp_df,5)
        tmp_df =add_rsi(tmp_df,10)
        tmp_df =add_rsi(tmp_df,14)
        tmp_df =add_rsi(tmp_df,25)
        tmp_df =add_rsi(tmp_df,30)
        tmp_df =add_williams_r(tmp_df,3)
        tmp_df =add_williams_r(tmp_df,5)
        tmp_df =add_williams_r(tmp_df,10)
        tmp_df =add_williams_r(tmp_df,14)
        tmp_df =add_williams_r(tmp_df,25)
        tmp_df =add_williams_r(tmp_df,30)
        tmp_df =add_stochastic_oscillator(tmp_df,3)
        tmp_df =add_stochastic_oscillator(tmp_df,5)
        tmp_df =add_stochastic_oscillator(tmp_df,10)
        tmp_df =add_stochastic_oscillator(tmp_df,14)
        tmp_df =add_stochastic_oscillator(tmp_df,23)
        tmp_df =add_price_rate_of_change(tmp_df,3)
        tmp_df =add_price_rate_of_change(tmp_df,5)
        tmp_df =add_price_rate_of_change(tmp_df,12)
        tmp_df =add_price_rate_of_change(tmp_df,20)
        tmp_df =add_price_rate_of_change(tmp_df,24)
        tmp_df =add_price_rate_of_change(tmp_df,30)
        tmp_df =add_ADX(tmp_df,3)
        tmp_df =add_ADX(tmp_df,5)
        tmp_df =add_ADX(tmp_df,14)
        tmp_df =add_ADX(tmp_df,25)
        tmp_df =add_ADX(tmp_df,30)
        tmp_df =add_Bollinger_bands(tmp_df,3)
        tmp_df =add_Bollinger_bands(tmp_df,5)
        tmp_df =add_Bollinger_bands(tmp_df,10)
        tmp_df =add_Bollinger_bands(tmp_df,20)
        tmp_df =add_Bollinger_bands(tmp_df,30)
        tmp_df =add_Bollinger_bands(tmp_df,50)
        tmp_df =add_sp500(tmp_df,sp500)
        tmp_df =add_beta(tmp_df)
        tmp_df =add_time_feature(tmp_df)
        tmp_df =add_percentage_change(tmp_df)
        tmp_df =add_interest_rate(tmp_df)
        dataframes.append(tmp_df)
    return pd.concat(dataframes)    
#full_df["symb"] = full_df["symb"].astype(CategoricalDtype(categories=symb))

Wall time: 0 ns


In [13]:
sp500 = pd.read_csv('sp500.csv')
int_rate = pd.read_csv('FEDFUNDS.csv')
int_rate.DATE = pd.to_datetime(int_rate.DATE)
int_rate.rename({'DATE':'Date'},axis=1,inplace=True)
int_rate.Date = int_rate['Date'].apply(lambda date : str(date)[:-12])
int_rate.rename({'Date':'tmp_date'},axis=1,inplace=True)
int_rate

Unnamed: 0,tmp_date,FEDFUNDS
0,1954-07,0.80
1,1954-08,1.22
2,1954-09,1.07
3,1954-10,0.85
4,1954-11,0.83
...,...,...
834,2024-01,5.33
835,2024-02,5.33
836,2024-03,5.33
837,2024-04,5.33


In [91]:
df = pd.read_csv('dataframe_1day.csv')
df = df.set_index('Date')
df = create_dataframe(df)

In [92]:
df = df.drop('symb',axis=1)
df = df.reset_index().drop('Date',axis=1)

In [125]:
df.astype('float16').dtypes

Open          float16
High          float16
Low           float16
Close         float16
Adj Close     float16
               ...   
year          float16
month         float16
day           float16
pct_change    float16
FEDFUNDS      float16
Length: 113, dtype: object

In [131]:
n_neigh = 5
df.dropna(axis=0, how='all')
def KnnImputer(df,n_neigh=5):
    #if(not df.isnull().values.any()):
    #    return df
    blockSize = 256
    gridSize = int((len(df)+blockSize-1)/blockSize)
    for i in tqdm(range(df.isna().values.sum())):
        for i, val in df.iterrows():
            for feat_name in val.index:
                if np.isnan(val[feat_name]):
                    # We add 0 when nan is present because we can't compute the distance between a nan and a float
                    dist_kernel = cp.RawKernel(r'''
                    extern "C" __global__
                    void get_distances(const float* dataset, const float* query, float* distances, int num_feat, int *indices, int n_neigh) {
                        int gti = blockDim.x * blockIdx.x + threadIdx.x;
                        float dist = 0;
                        for (int i = 0 ; i < num_feat ; i++) {
                            if((dataset[gti * num_feat + i]!=dataset[gti * num_feat + i]) || (query[i] != query[i])){
                                continue;
                            } else { 
                                dist = dataset[gti * num_feat + i] - query[i],2;
                                dist *= dist;
                            }
                        }
                        distances[gti] = sqrtf(dist);
                        indices[gti] = gti
                        for (int i = 0; i < num_feat/2; i++)
                        {
                            int j = threadIdx.y;
                            if (j % 2 == 0 && j<num_feat-1)
                                if (distances[j+1] < distances[j])
                                    swap(distances[j+1], distances[j]);
                                    swap(indices[j+1], indices[j]);
                            __syncthreads();
                            if (j % 2 == 1 && j<num_feat-1)
                                if (distances[j+1] < distances[j])
                                    swap(distances[j+1], distances[j]);
                                    swap(indices[j+1], indices[j]);
                            __syncthreads();
                        }
                        su
                        for(int i = 0 ; i < n_neigh ; i++){
                            dataset[indices[i]]
                        }
                    }
                    ''', 'get_distances')
                    print(distances)
                    dataset = cp.asarray(df.to_numpy().flatten()) # we use 1D array to facilitate the prralelization
                    query = cp.asarray(val.to_numpy())
                    distances = cp.array(([99999]*len(df)), dtype=cp.float32) # we setup an array of inf so that the distance of nan feature is not zero, zero means it will be first we will pick the closest points to query
                    indices = cp.zeroes(len(df), dtype=cp.float32)
                    dist_kernel((blockSize,),(gridSize,),(dataset,query,distances,len(val.index),indices,n_neigh))
                    distances = cp.sqrt(distances)
                    print(distances)
                    ind_dist = list()
                    for ind, v in enumerate(distances):
                        ind_dist.append((ind,v))
                    ind_dist = sorted(ind_dist, key=lambda x : x[1])
                    ind = [i[0] for i in ind_dist[:n_neigh]]
                    df[feat_name] = cp.mean(df.iloc[ind][feat_name]) # we replace the nan value
                    break
                break
        # Call the function again here recursively, and exit when there is no more nan in the dataframe
        #KnnImputer(df,5)

In [132]:
%%time
df = KnnImputer(df)

  0%|                                                                             | 2/32424 [00:06<30:07:31,  3.35s/it]


KeyboardInterrupt: 

In [97]:
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,lag_1,lag_2,lag_3,lag_4,...,Lower_band_30,Mid_band_50,Upper_band_50,Lower_band_50,sp500,year,month,day,pct_change,FEDFUNDS
0,0.976004,1.053571,0.973214,0.998884,0.845655,667620800,,,,,...,,,,,1356.560059,2000,4,4,,6.02
1,0.977679,1.106585,0.973772,1.106027,0.936362,409561600,0.998884,,,,...,,,,,1401.439941,2000,4,0,0.107263,6.02
2,1.102679,1.132813,1.065848,1.132813,0.959039,390924800,1.106027,0.998884,,,...,,,,,1441.609985,2000,4,1,0.024218,6.02
3,1.126674,1.162946,1.069196,1.081473,0.915574,520150400,1.132813,1.106027,0.998884,,...,,,,,1427.469971,2000,4,2,-0.045321,6.02
4,1.104353,1.113839,1.045201,1.061384,0.898567,722120000,1.081473,1.132813,1.106027,0.998884,...,,,,,1434.540039,2000,4,3,-0.018576,6.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40011,188.389999,191.080002,181.229996,182.470001,182.470001,118984100,177.289993,170.660004,173.789993,177.479996,...,168.965236,172.4750,193.413033,151.536967,5433.740234,2024,6,3,0.029218,
40012,185.800003,186.000000,176.919998,178.009995,178.009995,82038200,182.470001,177.289993,170.660004,173.789993,...,168.976742,172.6130,193.605196,151.620803,5431.600098,2024,6,4,-0.024442,
40013,177.919998,188.809998,177.000000,187.440002,187.440002,109786100,178.009995,182.470001,177.289993,170.660004,...,168.399472,173.0638,194.346013,151.781587,5473.229980,2024,6,0,0.052975,
40014,186.559998,187.199997,182.369995,184.860001,184.860001,68982300,187.440002,178.009995,182.470001,177.289993,...,168.390194,173.3014,194.843470,151.759329,5487.029785,2024,6,1,-0.013764,


In [21]:
import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [48]:
cpt = 0
for i, v in df.iterrows():
    print(v.to_numpy())
    cpt+=1
    if cpt == 2:
        break

[ 9.76004004e-01  1.05357099e+00  9.73213971e-01  9.98884022e-01
  8.45654845e-01  6.67620800e+08             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
  2.37996639e+05 -1.11660093e-03             nan             nan
  0.00000000e+00  9.98884022e-01  9.98884022e-01  0.00000000e+00
  9.98884022e-01  9.98884022e-01  9.98884022e-01  9.98884022e-01
  9.98884022e-01  9.98884022e-01  9.98884022e-01  9.98884022e-01
  9.98884022e-01         

In [43]:
df.shape

(40016, 113)

In [64]:
df.isnull().values.any()

True