In [28]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import pandas as pd
import datetime

In [29]:
data=pd.read_parquet('../data/vol_technical_stock_data.parquet')
# data=data.reset_index()
data=data.dropna()

In [30]:
data

Unnamed: 0,PERMNO,date,TICKER,PRC,VOL,RET,NUMTRD,avg_vol_30d,ATR,ROC,...,VWAP,SMA20,STD20,UpperBand,LowerBand,OBV,RSI,PVT,Momentum_TradeCount,EMA_TradeCount
347640,79875,2000-02-02,TOWR,1.15625,22030.0,-0.075000,21.0,0.057618,2.279018,-0.972054,...,28.834509,15.739062,20.915123,57.569308,-26.091183,2.612292e+07,0.000000,-8.575068e+02,-793.0,616.895888
399849,81691,2000-02-02,COMT,3.50000,5500.0,-0.034483,5.0,0.051565,2.198661,-0.705263,...,28.833312,11.460938,11.962278,35.385494,-12.463619,2.611742e+07,0.000000,2.101179e+02,-233.0,533.538440
287867,77610,2000-02-02,PANL,16.75000,221384.0,0.107438,516.0,0.081178,2.341518,-0.423656,...,28.785009,12.079688,12.313533,36.706754,-12.547379,2.616780e+07,100.000000,3.102140e+04,187.0,531.243289
543703,85531,2000-02-02,BFAM,17.25000,14571.0,0.014706,35.0,0.046988,2.368304,3.758621,...,28.783541,12.807813,12.158502,37.124817,-11.509192,2.618237e+07,100.000000,4.181443e+01,-110.0,468.536867
542436,85521,2000-02-02,AFCO,15.12500,26232.0,0.004149,40.0,0.067580,1.368304,4.902439,...,28.782521,13.906250,12.709571,39.325393,-11.512893,2.620660e+07,100.000000,1.689747e+02,-1184.0,416.020509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982259,92433,2016-12-30,ETRM,2.00000,424095.0,-0.180328,1232.0,0.214319,0.551264,-0.918534,...,7.926654,7.019680,9.406827,25.833334,-11.793974,1.120013e+10,79.017284,4.537801e+07,-1517.0,1505.126221
625121,87074,2016-12-30,PRZM,0.30000,23643.0,0.034483,56.0,0.245188,0.610171,-0.850000,...,7.926675,8.133925,10.818581,29.771086,-13.503236,1.120098e+10,54.035618,3.462018e+06,-438.0,1367.114200
698005,88177,2016-12-30,APRI,1.30000,72133.0,0.015625,211.0,0.124776,0.482314,-0.957655,...,7.926674,8.076925,10.853107,29.783140,-13.629290,1.120106e+10,50.152898,5.275897e+05,-194.0,1257.008086
625212,87077,2016-12-30,ECPG,28.65000,263390.0,-0.030457,2161.0,0.022872,0.528521,94.500000,...,7.926680,9.394425,11.682581,32.759587,-13.970737,1.120079e+10,59.573544,7.620320e+03,-304.0,1343.102554


In [32]:
data=data[["TICKER","date","RET","PVT","RSI","MACD","VWAP"]]

In [33]:
class StockAutoencoder(nn.Module):
    def __init__(self, input_size, embedding_dim=32):
        super(StockAutoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(True),
            nn.Linear(64, embedding_dim),
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 64),
            nn.ReLU(True),
            nn.Linear(64, input_size),
            nn.Tanh(),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

    def train_autoencoder(self, features, num_epochs=100, batch_size=64, learning_rate=0.001):
        # Data normalization
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features)
        # Create a DataLoader
        features_tensor = torch.tensor(scaled_features, dtype=torch.float32)
        dataset = TensorDataset(features_tensor, features_tensor)  # Using features as both input and target
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
        
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        
        self.train()  # Set the model to training mode
        for epoch in range(num_epochs):
            for data in dataloader:
                inputs, targets = data
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
            
        return scaler  # Return the scaler for inverse transformation later

    def encode(self, features, scaler):
        # Apply the same scaling as during training
        scaled_features = scaler.transform(features)
        features_tensor = torch.tensor(scaled_features, dtype=torch.float32)
        
        self.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            embeddings = self.encoder(features_tensor)
        return embeddings.numpy()  # Convert embeddings to NumPy array for easier handling

In [37]:
def process_with_autoencoder(window, input_features, target_feature, embedding_dim=2, num_epochs=20, batch_size=64):
    # Assuming 'df' contains your window's data with input features and a target feature
    
    # Initialize the Autoencoder
    input_size = len(input_features)
    autoencoder = StockAutoencoder(input_size=input_size, embedding_dim=embedding_dim)
    
    # Train the autoencoder
    features_df = window[input_features]
    scaler = autoencoder.train_autoencoder(features_df.values, num_epochs=num_epochs, batch_size=batch_size)
    
    # Generate embeddings
    embeddings = autoencoder.encode(features_df.values, scaler)

    return embeddings

    # model = LinearRegression().fit(embeddings, window[target_feature])
    # # Calculate residuals
    # return window[target_feature] - model.predict(embeddings)

In [38]:
data.columns

Index(['TICKER', 'date', 'RET', 'PVT', 'RSI', 'MACD'], dtype='object')

In [39]:
# Assuming 'df' is your DataFrame

# Define the target feature
target_feature = 'RET'

# Create a list of input features, excluding 'date', 'TICKER', and the target feature
input_features = [col for col in data.columns if col not in ['date', 'TICKER','PERMNO']]

embeddings=process_with_autoencoder(data,input_features,target_feature)

Epoch 1/20, Loss: 1.1130746603012085
Epoch 2/20, Loss: 1.1076478958129883
Epoch 3/20, Loss: 1.1033848524093628


KeyboardInterrupt: 

In [22]:
embeddings_df = pd.DataFrame(embeddings, columns=[f'embedding_{i}' for i in range(embeddings.shape[1])])


In [23]:
embeddings_df

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7
0,-2.560893,-0.664150,16.695799,-6.786082,-9.129557,-2.903058,-3.059472,6.138168
1,-10.329928,-7.462474,12.550125,8.565786,-10.872540,-4.394144,-5.568893,1.716774
2,6.644902,-7.218876,15.052278,-9.623850,-8.038305,-12.562415,-4.155279,9.330781
3,-12.204347,-6.073637,12.864233,8.819901,-11.035395,-2.494218,-5.619483,1.133118
4,-9.508071,-7.993865,12.044493,8.546692,-10.214256,-4.936547,-5.385269,1.808638
...,...,...,...,...,...,...,...,...
864228,-1.133893,-1.262910,3.212672,-0.502286,1.007025,-0.645206,0.791726,-1.982400
864229,-0.950478,-1.196084,3.350212,-0.979950,0.975326,-0.854027,0.847902,-1.907466
864230,-1.087839,-1.476403,2.764662,-0.452168,1.579623,-0.650186,0.862024,-2.397185
864231,-0.059067,-1.337365,3.246353,-1.221088,0.954836,0.293911,1.720680,-1.637758


In [24]:
non_feature_cols = ['date', 'TICKER','RET']
non_features_df = data[non_feature_cols].reset_index(drop=True)
non_features_df

Unnamed: 0,date,TICKER,RET
0,2000-02-02,TLAB,0.011161
1,2000-02-02,GBLX,-0.040791
2,2000-02-02,GSPN,0.373851
3,2000-02-02,PSFT,-0.076739
4,2000-02-02,CMRC,-0.024566
...,...,...,...
864228,2016-12-30,IEP,-0.004816
864229,2016-12-30,RYAAY,0.000000
864230,2016-12-30,TFSL,-0.002619
864231,2016-12-30,UHAL,-0.002079


In [25]:
final_df = pd.concat([non_features_df, embeddings_df], axis=1)
final_df

Unnamed: 0,date,TICKER,RET,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7
0,2000-02-02,TLAB,0.011161,-2.560893,-0.664150,16.695799,-6.786082,-9.129557,-2.903058,-3.059472,6.138168
1,2000-02-02,GBLX,-0.040791,-10.329928,-7.462474,12.550125,8.565786,-10.872540,-4.394144,-5.568893,1.716774
2,2000-02-02,GSPN,0.373851,6.644902,-7.218876,15.052278,-9.623850,-8.038305,-12.562415,-4.155279,9.330781
3,2000-02-02,PSFT,-0.076739,-12.204347,-6.073637,12.864233,8.819901,-11.035395,-2.494218,-5.619483,1.133118
4,2000-02-02,CMRC,-0.024566,-9.508071,-7.993865,12.044493,8.546692,-10.214256,-4.936547,-5.385269,1.808638
...,...,...,...,...,...,...,...,...,...,...,...
864228,2016-12-30,IEP,-0.004816,-1.133893,-1.262910,3.212672,-0.502286,1.007025,-0.645206,0.791726,-1.982400
864229,2016-12-30,RYAAY,0.000000,-0.950478,-1.196084,3.350212,-0.979950,0.975326,-0.854027,0.847902,-1.907466
864230,2016-12-30,TFSL,-0.002619,-1.087839,-1.476403,2.764662,-0.452168,1.579623,-0.650186,0.862024,-2.397185
864231,2016-12-30,UHAL,-0.002079,-0.059067,-1.337365,3.246353,-1.221088,0.954836,0.293911,1.720680,-1.637758


In [26]:
final_df.columns

Index(['date', 'TICKER', 'RET', 'embedding_0', 'embedding_1', 'embedding_2',
       'embedding_3', 'embedding_4', 'embedding_5', 'embedding_6',
       'embedding_7'],
      dtype='object')

In [27]:
final_df.to_parquet('../data/numtrd_stock_w_autoencoders_expanded.parquet')