In [None]:
!pip install -U -q PyDrive
!pip install pytorch_lightning


from google.colab import files
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

from IPython import display
import pytorch_lightning as pl
import torch
from torch import nn, optim
from torch import distributions as D
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import DataLoader, Dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-1.8.4.post0-py3-none-any.whl (800 kB)
[K     |████████████████████████████████| 800 kB 4.2 MB/s 
[?25hCollecting tensorboardX>=2.2
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 12.5 MB/s 
Collecting lightning-utilities!=0.4.0,>=0.3.0
  Downloading lightning_utilities-0.4.2-py3-none-any.whl (16 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 kB 51.4 MB/s 
Installing collected packages: torchmetrics, tensorboardX, lightning-utilities, pytorch-lightning
Successfully installed lightning-utilities-0.4.2 pytorch-lightning-1.8.4.post0 tensorboardX-2.5.1 torchmetrics-0.11.0


In [None]:
def removeInf(df):
    """Drop every row holding +/-inf in any column from the fourth one onward.

    The first three columns are skipped (``df.columns[3:]``), so they are never
    inspected for infinities.

    Args:
        df: Input frame. It is left untouched.

    Returns:
        A DataFrame with only the rows whose scanned columns contain no
        infinite value. NaN is not treated as infinite and is kept.
    """
    # Accumulate a single boolean mask instead of writing a scratch "test"
    # column: the scratch column was added to the caller's frame as a side
    # effect and overwrote any genuine column that happened to be named "test".
    keep = np.ones(len(df), dtype=bool)
    for col in df.columns[3:]:
        keep &= ~np.isinf(df[col].to_numpy())
    return df.loc[keep]

def removeNaN(df):
    """Return ``df`` without the rows that contain at least one missing value."""
    withoutMissing = df.dropna()
    return withoutMissing

def getLogTransData(data, x):
  """Overwrite column ``x`` of ``data`` with its natural logarithm.

  The frame is modified in place and the very same object is returned, so
  the caller's frame changes as well.
  """
  logged = np.log(data[x])
  data[x] = logged
  return data

def getOutlierFree(df, conf, y):
  """Trim both tails of column ``y`` and drop rows with missing values.

  ``conf`` is a percentage: rows are kept only when ``df[y]`` lies strictly
  between its ``conf/100`` and ``1 - conf/100`` quantiles (e.g. conf = 0.25).
  Any remaining row that still contains a NaN in any column is dropped.
  """
  tail = conf / 100
  lowerBound = df[y].quantile(q=tail)
  upperBound = df[y].quantile(q=1 - tail)
  insideBounds = (df[y] > lowerBound) & (df[y] < upperBound)
  trimmed = df.loc[insideBounds]
  return trimmed.dropna()

def getEncoded(df, col):
  """One-hot encode column ``col`` and append the indicator columns to ``df``.

  Args:
    df: Input frame; it is not modified (a re-indexed copy is joined).
    col: Name of the categorical column to encode.

  Returns:
    tuple: ``(frame, labels)`` where ``frame`` is ``df`` with a fresh
    0..n-1 index plus one indicator column per category, and ``labels`` is
    the pandas Index of those new column names.
  """
  # Fit the encoder on the single categorical column.
  encoder = OneHotEncoder(handle_unknown='ignore')
  encoded = encoder.fit_transform(df[[col]]).toarray()

  # Label the indicator columns with the categories the encoder actually
  # learned. Re-deriving them with np.sort(df[col].unique()) only matches by
  # coincidence and can disagree (or fail) with NaN or mixed-type values.
  encoder_df = pd.DataFrame(encoded, columns=encoder.categories_[0])

  # join() aligns on the index, so give the original frame the same
  # positional index the encoded frame has.
  df = df.reset_index(drop = True)
  return df.join(encoder_df), encoder_df.columns

In [None]:
class dataProcess():
  """Download the raw CSV from Google Drive and derive the modelling frame.

  Attributes:
    location: Google Drive share link of the CSV.
    fileName: Local file name the CSV is downloaded to and read from.
    predWindow: Number of rows the target is shifted forward by; 0 disables
      the shift.
    y: Name of the column whose future value is the prediction target.
  """

  def __init__(self, loc, fileName, predWindow, y):
    self.location = loc
    self.fileName = fileName
    self.predWindow = predWindow
    self.y = y

  def downloadData(self):
    """Authenticate with Colab/PyDrive, fetch the file and return it as a DataFrame."""
    # Authenticate and create the PyDrive client.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

    # The file id is the second-to-last "/"-separated segment of the link.
    fileId = self.location.split("/")[-2]
    downloaded = drive.CreateFile({'id': fileId})
    # Use the instance's file name; the previous code read the notebook
    # global `fileName`, silently ignoring the constructor argument.
    downloaded.GetContentFile(self.fileName)
    return pd.read_csv(self.fileName)

  def processData(self):
    """Turn the raw columns into percent-of-revenue ratios.

    Keeps rows with Emission >= 0 and Revenue > 0, drops the first CSV
    column, expresses E2R, Net Income, Market Cap and Cash Flow as
    100 * value / Revenue, log-transforms Market Cap, and removes rows with
    infinite or missing values.
    """
    transDF = self.downloadData()
    valid = (transDF["Emission"] >= 0) & (transDF["Revenue"] > 0)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the raw data.
    transDF = transDF.iloc[:, 1:].loc[valid].copy()
    transDF["E2R"] = 100*(transDF["Emission"] / transDF["Revenue"])
    transDF["Net Income"] = 100*(transDF["Net Income"] / transDF["Revenue"])
    transDF["Market Cap"] = 100*(transDF["Market Cap"] / transDF["Revenue"])
    transDF["Cash Flow"] = 100*(transDF["Cash Flow"] / transDF["Revenue"])
    transDF = transDF.drop(columns = ["Revenue", "Emission"])
    transDF = removeInf(getLogTransData(transDF, "Market Cap")).dropna()
    return transDF

  def getLevelData(self):
    """Return the processed frame with the shifted target column attached.

    With ``predWindow == 0`` the processed frame is returned after dropna().
    Otherwise a ``"<y> Shift"`` column holding the value ``predWindow`` rows
    ahead within the same company is added, the trailing years are removed,
    and the tails of ``y`` are trimmed with ``getOutlierFree(df, 0.25, y)``.
    """
    df = self.processData()
    df = df.sort_values(by=['Company', 'Year'])
    if self.predWindow == 0:
      return df.dropna()

    # Shift inside each company. A plain shift over the sorted frame hands
    # the last rows of one company the first values of the next one.
    # Rows with no future value become NaN and are dropped further down.
    df[self.y + " Shift"] = df.groupby("Company")[self.y].shift(-self.predWindow)

    ## Dropping the data in the prediction window
    years = list(np.sort(df["Year"].unique()))
    erase = years[-1 - self.predWindow:-1][0]
    df = df.loc[df["Year"] <= erase]
    # Use the instance's target column, not the notebook global `y`.
    return getOutlierFree(df, 0.25, self.y)

class trainTestSplit(dataProcess):
  """Split the level data into train (all but the latest year) and test (latest year).

  Attributes:
    model: "NN" or "RF"; selects the layout of the returned dictionary.
    yNew: Name of the target column (its natural log is returned).
    X: List of continuous feature column names.
    catVar: Name of the categorical column.
  """

  def __init__(self, model, yNew, X, catVar, loc, fileName, predWindow, y):
    super(trainTestSplit, self).__init__(loc, fileName, predWindow, y)
    self.model = model
    self.yNew = yNew
    self.X = X
    self.catVar = catVar

  def getTrainTestData(self):
    """Build the train/test dictionary for the configured model type.

    Returns:
      dict: for "NN" the keys XTrainCont, XTestCont, XTrainCat, XTestCat,
      yTrain, yTest; for "RF" the keys XTrain, XTest, yTrain, yTest, where
      the X frames also contain the one-hot columns of ``catVar``.

    Raises:
      ValueError: if ``model`` is neither "NN" nor "RF" (this used to fall
        through and return None).
    """
    if self.model not in ("NN", "RF"):
      raise ValueError('model must be "NN" or "RF", got %r' % (self.model,))

    df = self.getLevelData()
    # The target is log-transformed below, so zero targets are excluded.
    df = df.loc[df[self.yNew] != 0]

    if self.model == "RF":
      # Fit the encoder once; it used to be fitted twice just to unpack
      # the two elements of the returned tuple.
      df, newCol = getEncoded(df, self.catVar)
      XVar = self.X + list(newCol)

    lastYear = max(df["Year"])
    isTrain = df["Year"] < lastYear
    isTest = df["Year"] == lastYear

    dataDict = {}
    if self.model == "NN":
      dataDict["XTrainCont"] = df[self.X].loc[isTrain].dropna().reset_index(drop = True)
      dataDict["XTestCont"] = df[self.X].loc[isTest].dropna().reset_index(drop = True)
      dataDict["XTrainCat"] = df[self.catVar].loc[isTrain].dropna()
      dataDict["XTestCat"] = df[self.catVar].loc[isTest].dropna()
    else:
      dataDict["XTrain"] = df[XVar].loc[isTrain].dropna().reset_index(drop = True)
      dataDict["XTest"] = df[XVar].loc[isTest].dropna().reset_index(drop = True)
    dataDict["yTrain"] = np.log(df[self.yNew].loc[isTrain].dropna())
    dataDict["yTest"] = np.log(df[self.yNew].loc[isTest].dropna())
    return dataDict

In [None]:
class RFReg():
  """Two random forests: one for the target, one for its squared training residuals."""

  def __init__(self, nEst, criterion, XTrain, yTrain, XTest, yTest):
    self.nEst, self.criterion = nEst, criterion
    self.XTrain, self.yTrain = XTrain, yTrain
    self.XTest, self.yTest = XTest, yTest

  def trainRF(self):
    """Fit both forests on the training data.

    Returns:
      tuple: ``(regressor1, regressor2)`` - the forest fitted on ``yTrain``
      and the forest fitted on the squared in-sample residuals of the first.
    """
    # criterion is one of "squared_error", "absolute_error", "poisson"
    forestArgs = dict(n_estimators = self.nEst, criterion = self.criterion)

    self.regressor1 = RandomForestRegressor(random_state = 0, **forestArgs)
    self.regressor1.fit(self.XTrain, self.yTrain)

    # Squared in-sample error of the first forest is the second forest's target.
    fitted = self.regressor1.predict(self.XTrain)
    squaredError = np.square(fitted - self.yTrain)

    self.regressor2 = RandomForestRegressor(random_state = 42, **forestArgs)
    self.regressor2.fit(self.XTrain, squaredError)
    return self.regressor1, self.regressor2

In [None]:
class embedCategory():
  """Build the category -> index mapping and the matching embedding layer."""

  def __init__(self, trainX, layerCnt):
    self.trainX, self.layerCnt = trainX, layerCnt

  def getIndDict(self):
    """Map each distinct value of ``trainX`` (in np.unique order) to 0, 1, 2, ..."""
    return {label: code for code, label in enumerate(np.unique(self.trainX))}

  def embedLayer(self):
    """Return a ModuleDict whose "Industry" entry embeds every known index into ``layerCnt`` dims."""
    codes = self.getIndDict().values()
    numEmbeddings = max(codes) + 1
    industryEmbedding = nn.Embedding(numEmbeddings, self.layerCnt)
    return nn.ModuleDict({"Industry": industryEmbedding})

class ConditionalNet(torch.nn.Module):
    """Three-layer MLP mapping [continuous features, embedding] to two outputs.

    The first layer takes ``lenX + layerCounts`` inputs (the extra width is
    the embedding output); the last layer has 2 output units.
    """

    def __init__(self, lenX, layerCounts, hidden_size):
        super().__init__()
        self.lenX = lenX
        self.layerCounts = layerCounts
        inputWidth = self.lenX + self.layerCounts
        stack = [
            nn.Linear(inputWidth, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        out = self.layers(x)
        return out


class ConditionalModel(pl.LightningModule):
    """Lightning wrapper training ``condNet`` to output a Normal's mean and scale.

    Attributes:
        condNet: Network producing two values per sample.
        embed_layers: Mapping from categorical name to its embedding layer.
        margin: Constant added to the softplus-transformed scale output.
    """

    def __init__(self, condNet, embedLayers, margin):
        super().__init__()
        self.condNet = condNet
        self.embed_layers = embedLayers
        self.margin = margin

    def training_step(self, batch, batch_idx):
        """Return the mean negative log-likelihood of ``y`` for one batch."""
        x, categorical_map, y = batch
        # Embed every categorical input and sum the embeddings.
        embedded = [self.embed_layers[name](codes) for name, codes in categorical_map.items()]
        cond_emb = torch.stack(embedded, -1).sum(-1).squeeze(-2)
        # Continuous features and the summed embedding form the network input.
        full_cond = torch.concat((x.squeeze(), cond_emb), -1)
        _mus, rawScale = self.condNet(full_cond).chunk(2, -1)
        # Softplus keeps the scale positive; margin keeps it away from zero.
        _stds = torch.nn.Softplus()(rawScale) + self.margin
        nll = -D.Normal(_mus, _stds).log_prob(y).mean()
        return nll

    def configure_optimizers(self):
        """AdamW over all parameters with a fixed learning rate of 0.001."""
        return torch.optim.AdamW(self.parameters(), lr=0.001)  ## hyperparameterize

class CondDataset(Dataset):
    """Dataset yielding (continuous features, categorical index map, target)."""

    def __init__(self, e, industry, indDict, y):
        self.e, self.ind = e, industry
        self.indDict, self.y = indDict, y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        """Return sample ``idx`` as float tensors plus a scalar long index under 'Industry'."""
        industryCode = self.indDict[self.ind[idx]]
        categorical_map = {
            'Industry': torch.Tensor(np.array([industryCode])).long().squeeze(),
        }
        features = torch.Tensor(np.array([self.e[idx]]))
        target = torch.Tensor(np.array([self.y[idx]]))
        return (features, categorical_map, target)

In [None]:
def dataLoad(contVar, catVar, catDict, y, batchSize=256, shuffle=True, dropLast=True):
  """Wrap the training arrays in a CondDataset and return a DataLoader over it.

  Args:
    contVar: Continuous features, indexable by row.
    catVar: Categorical values, indexable by row.
    catDict: Mapping from categorical value to embedding index.
    y: Targets, indexable by row.
    batchSize: Samples per batch (default 256, the previous fixed value).
    shuffle: Reshuffle every epoch (default True).
    dropLast: Drop the final incomplete batch (default True). With this on,
      a dataset smaller than ``batchSize`` yields no batches at all.

  Returns:
    torch.utils.data.DataLoader over the CondDataset.
  """
  dataloader = DataLoader(
    CondDataset(contVar, catVar, catDict, y),
    batch_size=batchSize,
    shuffle=shuffle,
    drop_last=dropLast,
  )
  return dataloader

def trainModel(lenX, layerCounts, hidden_size, embed_layers, margin, data,
               maxEpochs=1500, accelerator="cpu"):
  """Train a ConditionalModel on ``data`` and return its state dict.

  Args:
    lenX: Number of continuous input features.
    layerCounts: Embedding width added to the network input.
    hidden_size: Width of the hidden layers.
    embed_layers: ModuleDict of embedding layers; trained together with the net.
    margin: Constant added to the predicted scale.
    data: DataLoader yielding (x, categorical_map, y) batches.
    maxEpochs: Training epochs (default 1500, the previous fixed value).
    accelerator: Lightning accelerator name (default "cpu", as before).

  Returns:
    The trained model's ``state_dict()`` (weights only, not the module).
  """
  trainer = pl.Trainer(
      max_epochs=maxEpochs,
      accelerator=accelerator,
      num_sanity_val_steps=0
  )
  model = ConditionalModel(ConditionalNet(lenX, layerCounts, hidden_size), embed_layers, margin)
  trainer.fit(model, data)
  model.eval()
  return model.state_dict()

def NLLCalc(stateDict=None, indDict=None):
  """Mean negative log-likelihood and predictive CDF values on the test split.

  Reads the notebook globals ``trainTestDict``, ``layerCounts``, ``lenX``,
  ``hidden_size`` and ``margin``.

  The previous version evaluated a freshly initialised network with a new
  random embedding and an index mapping rebuilt from the test categories,
  so the reported NLL never reflected the trained model.

  Args:
    stateDict: Trained weights as returned by ``trainModel`` (a module is
      also accepted). Defaults to the notebook global ``model``.
    indDict: Industry -> embedding-index mapping used during training.
      Defaults to the notebook global ``catDict``.

  Returns:
    tuple: ``(mean NLL, cdfs)`` where ``cdfs`` holds the predictive CDF
    evaluated at each test target.

  Raises:
    ValueError: if no trained weights or mapping are available, or if the
      test split contains a category the mapping does not know.
  """
  # Fall back to what the training cell leaves in the notebook namespace so
  # the existing zero-argument call keeps working.
  if stateDict is None:
    stateDict = globals().get("model")
  if isinstance(stateDict, nn.Module):
    stateDict = stateDict.state_dict()
  if indDict is None:
    indDict = globals().get("catDict")
  if stateDict is None or indDict is None:
    raise ValueError("NLLCalc needs trained weights (stateDict) and the training index mapping (indDict)")

  CatXTest = np.array(trainTestDict["XTestCat"].dropna().reset_index(drop = True))
  ContXTest = np.array(trainTestDict["XTestCont"].dropna().reset_index(drop = True))
  yTest = np.array(trainTestDict["yTest"].dropna().reset_index(drop = True))

  # The embedding has no row for a category that was absent from training.
  unseen = sorted(set(np.unique(CatXTest).tolist()) - set(indDict))
  if unseen:
    raise ValueError("test split contains categories unseen in training: %r" % (unseen,))

  # Rebuild the architecture with the training-time embedding size, then
  # load the trained weights into it.
  embed_layers = nn.ModuleDict({
      "Industry": nn.Embedding(max(indDict.values()) + 1, layerCounts)
  })
  net = ConditionalModel(ConditionalNet(lenX, layerCounts, hidden_size), embed_layers, margin)
  net.load_state_dict(stateDict)
  net.eval()

  losses = []
  cdfs = []
  with torch.no_grad():
      dataloader2 = DataLoader(
          CondDataset(ContXTest, CatXTest, indDict, yTest),
          batch_size=256,
          shuffle=False,
          drop_last=False,
      )
      for batch in dataloader2:
          x, categorical_map, y = batch
          cond_emb = torch.stack([net.embed_layers[k](v) for k, v in categorical_map.items()],-1).sum(-1).squeeze(-2)
          full_cond = torch.concat((x.squeeze(),cond_emb),-1)
          dist_params = net.condNet(full_cond).chunk(2, -1)
          _mus = dist_params[0]
          _stds = torch.nn.Softplus()(dist_params[1]) + margin
          dists = D.Normal(_mus, _stds)
          losses.extend(list(dists.log_prob(y).cpu().detach().numpy()))
          cdfs.extend(list(dists.cdf(y).cpu().detach().numpy()))
  losses = np.array(losses)
  cdfs = np.array(cdfs)
  return np.mean(-losses), cdfs

In [None]:
## Initialization
# Google Drive share link of the raw CSV; dataProcess.downloadData takes the
# file id from the second-to-last "/"-separated segment of this link.
loc = 'https://drive.google.com/file/d/1TQodZLpXVgAmaP5XZe8lYImlLaJ68yWb/view?usp=share_link'
# Local file name the CSV is downloaded to and read back from.
fileName = 'data_prep1.csv'
# Number of rows the target column is shifted forward by (0 disables the shift).
predWindow = 1

# Column whose future value is predicted.
y = "E2R"
# dataClass = dataProcess(loc, fileName, predWindow, y)

# Name of the shifted target column that getLevelData creates (y + " Shift").
yNew = "E2R Shift"
# Continuous feature columns used by both models.
X = ['Net Income','Market Cap','Cash Flow','E2R']
# Categorical column: one-hot encoded for the RF, embedded for the NN.
catVar = "Industry"

In [None]:
## Random Forest Regressor
trainTestClass = trainTestSplit(
    model="RF", yNew=yNew, X=X, catVar=catVar,
    loc=loc, fileName=fileName, predWindow=predWindow, y=y,
)
trainTestDict = trainTestClass.getTrainTestData()
XTrain, yTrain, XTest, yTest = (
    trainTestDict[key] for key in ("XTrain", "yTrain", "XTest", "yTest")
)

# Forest size and split criterion
# (alternatives: "absolute_error", "friedman_mse", "poisson").
nEst = 100
criterion = 'squared_error'

# muModel predicts the target, sigmaModel its squared training residual.
modelRF = RFReg(nEst, criterion, XTrain, yTrain, XTest, yTest)
muModel, sigmaModel = modelRF.trainRF()

In [None]:
# R^2 of the mean forest on the test split (rows of the latest year).
muModel.score(trainTestDict["XTest"], trainTestDict["yTest"])

0.943374797265357

In [None]:
# Squared test residuals of the mean forest are the reference values the
# variance forest is scored against (R^2).
varPred = np.square(trainTestDict["yTest"] - muModel.predict(trainTestDict["XTest"]))
sigmaModel.score(trainTestDict["XTest"], varPred)

-0.013297279213491775

In [None]:
## Neural Net Model Execution
import pickle  # hoisted: it used to be re-imported on every loop iteration

trainTestClass = trainTestSplit("NN", yNew, X, catVar, loc, fileName, predWindow, y)
trainTestDict = trainTestClass.getTrainTestData()
# NOTE: catVar is rebound from the column name to the training category
# array because later cells read it in that form. Re-run the initialization
# cell before re-running this one.
catVar = np.array(trainTestDict["XTrainCat"].dropna().reset_index(drop = True))
contVar = np.array(trainTestDict["XTrainCont"].dropna().reset_index(drop = True))
# Kept under its own name: rebinding the global `y` (the target column name)
# to an array broke any later trainTestSplit construction.
yTrainArr = np.array(trainTestDict["yTrain"].dropna().reset_index(drop = True))
lenX = len(X)

# Hyper-parameter grid (one value each by default).
lc = [16]  #[16, 32, 48]          embedding width
mr = [1e-2]  #[1e-2, 1e-4, 1e-6]  scale margin
hs = [128]  #[64, 128, 256]       hidden layer width

op = pd.DataFrame(columns = ["Layers", "Margin", "Hidden_States", "NLL"])
for l in lc:
  for m in mr:
    for h in hs:
      # These names stay global on purpose: NLLCalc reads them.
      layerCounts = l
      catDict = embedCategory(catVar, layerCounts).getIndDict()
      embed_layers = embedCategory(catVar, layerCounts).embedLayer()

      margin = m
      hidden_size = h

      dataloader = dataLoad(contVar, catVar, catDict, yTrainArr)
      # trainModel returns the trained state dict, not the module.
      model = trainModel(lenX, layerCounts, hidden_size, embed_layers, margin, dataloader)

      # The file is overwritten per configuration, so it ends up holding
      # the weights of the last configuration trained.
      with open('modelNN.pickle', 'wb') as f:
        pickle.dump(model, f)
      NLL = NLLCalc()[0]
      op.loc[len(op) + 1] = [l, m, h, NLL]
      # op.to_csv("modelNLL.csv")
      # files.download(r"modelNLL.csv")

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name         | Type           | Params
------------------------------------------------
0 | condNet      | ConditionalNet | 19.5 K
1 | embed_layers | ModuleDict     | 3.3 K 
------------------------------------------------
22.8 K    Trainable params
0         Non-trainable params
22.8 K    Total params
0.091     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [None]:
# Persist the industry -> embedding-index mapping built from the training
# categories so the inference cells can reuse it.
indDict = embedCategory(catVar, layerCounts).getIndDict()
with open('indDict.pickle', 'wb') as f:
        pickle.dump(indDict, f)

In [None]:
## Interface code

In [None]:
import pickle
import numpy as np

# NOTE: pickle.load can execute arbitrary code; only load files this
# notebook wrote itself.
# Context managers close both files; the previous open() calls never did.
with open('modelNN.pickle', 'rb') as modelFile:
    stateDict = pickle.load(modelFile)   # trained model weights
with open('indDict.pickle', 'rb') as indDictFile:
    indDict = pickle.load(indDictFile)   # industry -> embedding index

In [None]:
# Number of industries in the loaded mapping.
len(indDict)

In [None]:
# User Input
NetIncome = 300
Revenue = 900
MarketCap = 9000
CashFlow = -350
Emission = 70
Industry = "Waste Management"

In [None]:
## calculate as part of the interface code
E2R = np.log(100*(Emission/Revenue))
NI2R = 100*(NetIncome/Revenue)
MC2R = 100*(MarketCap/Revenue)
CF2R = 100*(CashFlow/Revenue)
CatX = np.array(Industry)
ContX = np.array([NI2R, MC2R, CF2R, E2R])

In [None]:
# Recreate the training-time architecture so the saved weights fit:
# 4 continuous inputs, 16-dim industry embedding, 128 hidden units, margin 1e-2.
industryEmbedding = nn.Embedding(max(indDict.values()) + 1, 16)
embed_layers = nn.ModuleDict({"Industry": industryEmbedding})
model = ConditionalModel(ConditionalNet(4, 16, 128), embed_layers, 1e-2)

In [None]:
# Sanity check: embed_layers should be an nn.ModuleDict.
type(embed_layers)

In [None]:
# Load the pickled weights into the rebuilt architecture and switch the
# module to evaluation mode.
model.load_state_dict(stateDict)
model.eval()

In [None]:
# Fail with a readable message instead of a bare KeyError inside the block.
if Industry not in indDict:
  raise KeyError("Industry %r is not in the trained mapping" % (Industry,))

with torch.no_grad():

  x = torch.Tensor(ContX)
  # Single-sample index tensor for the embedding lookup.
  categorical_map = {
              'Industry':torch.Tensor(np.array([indDict[Industry]])).long()
          }
  cond_emb = torch.stack([model.embed_layers[k](v) for k, v in categorical_map.items()],-1).sum(-1).squeeze(-2)
  full_cond = torch.concat((x.squeeze(),cond_emb),-1)
  dist_params = model.condNet(full_cond)
  dist_params = dist_params.chunk(2, -1)
  _mus = dist_params[0]
  # Use the margin stored on the model. The bare global `margin` only exists
  # if the training loop ran in this kernel and may differ from the value
  # this model was built with.
  _stds = torch.nn.Softplus()(dist_params[1]) + model.margin

In [None]:
# Predicted mean and standard deviation of the Normal over the target. The
# training targets are np.log of the shifted column, so these values are in
# log space (assuming the loaded weights come from the training cell above).
_mus, _stds.item()