In [1]:
import torch
import pandas as pd
import numpy as np

# Import dataset utils
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from pytorch_lightning.trainer import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import LightningModule


import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt

# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import the submodule explicitly before using it.
import importlib.util

# Prefer tqdm.auto when ipywidgets is installed; otherwise use plain tqdm.
if importlib.util.find_spec('ipywidgets') is not None:
    from tqdm.auto import tqdm
else:
    from tqdm import tqdm

In [2]:
# Load the semicolon-delimited dataset and preview its first rows.
dataframe = pd.read_csv('../data/final.csv', delimiter=';')
dataframe.head()

Unnamed: 0,sexo,Estado_civil,Status_empl,Licenca,Tipo_Resid,Residencia,Alcoolatra,Droga,Suic_familia,Dep_familia,...,Eixo I: Panico sem agorafobia,Eixo I: Fobia especifica,Eixo I: Fobia social,Eixo I: Obsessivo-compulsivo,Eixo I: Estresse pos-traumatico,Eixo I: Ansiedade generalizada,Eixo II: Personalidade paranoica,Eixo II: Transtorno de personalidade,TOC,idade
0,M,3.0,,0.0,3.0,1.0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,40.0
1,F,1.0,3.0,0.0,4.0,3.0,0,0,0,1,...,,,,,,,,,0.0,20.0
2,F,1.0,2.0,0.0,1.0,2.0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0
3,F,1.0,3.0,0.0,1.0,3.0,0,0,0,1,...,,,,,,,,,6.0,30.0
4,F,4.0,2.0,0.0,1.0,,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0


In [3]:
# Replace every NaN with the sentinel value -1.
dataframe = dataframe.fillna(-1)

# Drop the 'Chave' column.
dataframe = dataframe.drop(columns=['Chave'])

# Encode sexo as integers (M -> 0, F -> 1) and assign the result back.
# Calling `.replace(..., inplace=True)` on `dataframe['sexo']` is chained
# assignment: pandas warns about it and, with copy-on-write enabled, the
# parent frame is left unchanged.
dataframe['sexo'] = dataframe['sexo'].replace({'M': 0, 'F': 1})


In [4]:
# Build an independent, float-typed working frame (astype returns a new
# object, so `dataframe` itself is not modified) and show its dimensions.
df_suic = dataframe.astype(float)

df_suic.shape



(3953, 68)

In [5]:
# Recode the value -5 in 'Anos educacao formal' to -1.
df_suic['Anos educacao formal'] = df_suic['Anos educacao formal'].replace({-5: -1})

In [6]:
# Map every column name to its (min, max) value range.
min_max = {
    col: (df_suic[col].min(), df_suic[col].max())
    for col in df_suic.columns
}

# Pretty-print one "column: (min, max)" line per column.
for k, v in min_max.items():
    print(f'{k}: {v}')
        

sexo: (-1.0, 1.0)
Estado_civil: (-1.0, 6.0)
Status_empl: (-1.0, 6.0)
Licenca: (-1.0, 1.0)
Tipo_Resid: (-1.0, 8.0)
Residencia: (-1.0, 3.0)
Alcoolatra: (0.0, 1.0)
Droga: (0.0, 1.0)
Suic_familia: (0.0, 1.0)
Dep_familia: (0.0, 1.0)
Bip_familia: (0.0, 1.0)
Alc_familia: (0.0, 1.0)
Drog_familia: (0.0, 1.0)
coracao: (0.0, 4.0)
vascular: (0.0, 3.0)
hematopoetico: (0.0, 4.0)
Olho_ore_nariz_garg_lar: (0.0, 4.0)
GI_sup: (0.0, 4.0)
Gi_inf: (0.0, 4.0)
Renal: (0.0, 4.0)
Genito_urinario: (0.0, 4.0)
Musculoesqueletico: (0.0, 4.0)
Neuro: (0.0, 4.0)
psiquiatrica: (0.0, 4.0)
Respiratorio: (0.0, 4.0)
Figado: (0.0, 4.0)
Endocrino_metabolico: (0.0, 4.0)
Anos educacao formal: (-1.0, 27.0)
Capaz de desfrutar das coisas: (-1.0, 5.0)
Impacto de sua familia e amigos: (-1.0, 7.0)
Numero de amigos vivendo com paciente: (-1.0, 89.0)
Capaz de tomar decisões importantes: (-1.0, 5.0)
Numero de parentes vivendo com paciente: (-1.0, 11.0)
Conjuge_companheiro vive com paciente: (-1.0, 1.0)
Estudante: (-1.0, 1.0)
Numero to

In [7]:
class MyDataset(Dataset):
    """Train or test view over a tabular dataframe.

    The frame is split into features (every non-target column) and targets,
    then divided with ``train_test_split`` using a fixed ``random_state=42``.
    ``split`` selects which side of that division this instance serves.

    Args:
        input_dataframe: source frame; it is not modified.
        split: ``"train"`` or ``"test"``.
        target: names of the target columns (may be empty).
        ignore_columns: columns removed before anything else is computed;
            names that are not present in the frame are skipped.
        train_ratio: fraction of rows kept for training.

    Attributes:
        data_dim: number of feature columns.
        embbeding_dim: largest value in the (filtered) frame plus one.
        x_train, x_test, y_train, y_test: numpy arrays produced by the split.
    """

    def __init__(self, input_dataframe, split="train", target=["Suicidio", "Ansiedade"], ignore_columns=[], train_ratio=0.8):
        self.split = split
        # Shallow-copy the sequence arguments so the shared default lists are
        # never aliased by instance attributes.
        self.target = target[:]
        self.ignore_columns = list(ignore_columns)

        for coll in self.ignore_columns:
            if coll in input_dataframe.columns:
                input_dataframe = input_dataframe.drop(coll, axis=1)

        self.embbeding_dim = input_dataframe.max().max() + 1

        y = input_dataframe[target].values
        x = input_dataframe.drop(target, axis=1).values

        # Count the feature columns directly. The ignored columns were already
        # dropped above, so subtracting len(ignore_columns) from the column
        # count (as before) removed them a second time.
        self.data_dim = x.shape[1]

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            x, y, test_size=1 - train_ratio, random_state=42
        )

    def _split_array(self, prefix):
        """Return ``self.<prefix>_<split>``; raise ValueError on an unknown split."""
        if self.split not in ("train", "test"):
            raise ValueError("Split must be train or test")
        return getattr(self, f"{prefix}_{self.split}")

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self._split_array("x"))

    def get_weights(self):
        """Inverse class frequencies of every target column in the selected split.

        For each target column the distinct values are taken in ascending
        order and weighted by ``1 / relative_frequency``; the per-column
        arrays are concatenated into one 1-D array, which is also stored in
        ``self.weights``. With no target columns an empty array is returned.

        Raises:
            ValueError: if ``split`` is neither ``"train"`` nor ``"test"``.
        """
        y = self._split_array("y")

        weights = []
        for i in range(len(self.target)):
            frequencies = pd.Series(y[:, i]).value_counts(normalize=True).sort_index()
            weights.append(1 / frequencies.to_numpy())

        # np.hstack rejects an empty list, so handle the no-target case here.
        self.weights = np.hstack(weights) if weights else np.array([])
        return self.weights

    def __getitem__(self, idx):
        """Return ``(features, target)`` for row ``idx`` as float tensors."""
        features = self._split_array("x")[idx]
        labels = self._split_array("y")[idx]
        return (
            torch.tensor(features, dtype=torch.float),
            torch.tensor(labels, dtype=torch.float),
        )

In [8]:
class MyEmbbeding(nn.Module):
    """Per-column embedding layer: one ``nn.Embedding`` table for each column.

    Every table has ``int(column max + 10)`` rows and an output width of 1.
    In ``forward`` each value is cast to an integer and shifted by +1 before
    the lookup, so the value -1 maps to row 0. When ``fake`` is true the
    module returns its input unchanged.
    """

    def __init__(self, dataframe, fake=False):
        super().__init__()
        self.fake = fake
        # Tables are created in the dataframe's column order.
        tables = [
            nn.Embedding(int(dataframe[name].max() + 10), 1)
            for name in dataframe.columns
        ]
        self.embbedings = nn.ModuleList(tables)

    def forward(self, x):
        # Pass-through mode: skip the lookup entirely.
        if self.fake:
            return x

        assert x.shape[1] == len(self.embbedings), f"Input shape {x.shape} must be equal to number of embbedings {len(self.embbedings)}"

        # Column j of the input is looked up in table j; the +1 shift keeps
        # the value -1 inside the valid index range.
        looked_up = [
            table(x[:, position].long() + 1)
            for position, table in enumerate(self.embbedings)
        ]
        return torch.cat(looked_up, dim=1)

In [22]:
class NNModel(nn.Module):
    """Embedding module followed by a three-layer fully connected network.

    The first linear layer's input width is the embedding width of the first
    table multiplied by the number of tables in ``embbeding.embbedings``.
    The output is the raw result of the last linear layer (no activation).
    """

    def __init__(self, embbeding, hidden_dim=64, output_dim=1):
        super().__init__()
        self.embbeding = embbeding
        tables = embbeding.embbedings
        input_dim = tables[0].embedding_dim * len(tables)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Embed ``x``, apply two ReLU-activated layers, then the output layer."""
        hidden = self.embbeding(x)
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

# Training 

In [23]:
# Both datasets apply the same seeded split to df_suic; target=[] keeps every
# column as an input feature.
test_dataset = MyDataset(df_suic, "test", target=[])
train_dataset = MyDataset(df_suic, "train", target=[])

# Evaluation batches are served in a fixed order. Shuffling gains nothing at
# evaluation time and, because the last batch is smaller than the others,
# it makes averaged per-batch metrics differ from run to run.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [24]:
# Report how many rows each split contains.
print(
    f"Train dataset size: {len(train_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

Train dataset size: 3162
Test dataset size: 791


In [32]:
# Train the network on CPU to reconstruct its own input: every batch is used
# as both the model input and the regression target.

# Seed torch's global RNG so repeated runs of this cell are repeatable.
torch.manual_seed(42)

embbeding = MyEmbbeding(df_suic, fake=False)

model = NNModel(embbeding, output_dim=len(df_suic.columns))

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def run_reconstruction_epoch(model, dataloader, criterion, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(mean_loss, mean_accuracy)``.

    Args:
        model: network called on each batch of features.
        dataloader: yields ``(features, target)`` pairs; the target is ignored
            because the features themselves are the reconstruction target.
        criterion: loss applied to ``(prediction, features)``.
        optimizer: when given, the pass trains the model (backward + step);
            when ``None`` the pass only evaluates, with gradients disabled.

    Returns:
        The mean of the per-batch losses and the mean of the per-batch,
        per-column accuracies (rounded prediction equal to the input value),
        both as Python floats.
    """
    training = optimizer is not None
    batch_losses = []
    batch_accuracies = []

    with torch.set_grad_enabled(training):
        for x, _ in dataloader:
            y = x
            res = model(x)
            loss = criterion(res, y)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Store plain floats: accumulating the loss tensor itself keeps it
            # attached to autograd for the whole epoch.
            batch_losses.append(loss.item())

            # Fraction of exactly reconstructed values, per column.
            matches = res.detach().round() == y
            batch_accuracies.append(matches.sum(dim=0).numpy() / len(y))

    return float(np.mean(batch_losses)), float(np.mean(batch_accuracies))


epochs = 1000
for epoch in range(epochs):
    print(f"Epoch {epoch}")

    train_loss, train_acc = run_reconstruction_epoch(model, train_dataloader, criterion, optimizer)
    print(f"[{epoch}]Train loss {train_loss}, acc {train_acc}")

    # Stop once the mean training loss falls below 10 (this is a train-loss
    # threshold, checked before the test pass of the same epoch).
    if train_loss < 10:
        break

    test_loss, test_acc = run_reconstruction_epoch(model, test_dataloader, criterion)
    print(f"[{epoch}]Test loss {test_loss}, acc {test_acc}")


Epoch 0
[0]Train loss 132416.15625, acc 0.4642329688285571
[0]Test loss 99757.296875, acc 0.4873521419437341
Epoch 1
[1]Train loss 109626.0078125, acc 0.44282902554961384
[1]Test loss 80379.0078125, acc 0.49748561381074163
Epoch 2
[2]Train loss 101628.734375, acc 0.4767985282691165
[2]Test loss 79003.1171875, acc 0.47153852301790283
Epoch 3
[3]Train loss 98682.3515625, acc 0.4656866315873669
[3]Test loss 75223.84375, acc 0.45780850383631716
Epoch 4
[4]Train loss 95153.4140625, acc 0.4804250079985374
[4]Test loss 72132.2890625, acc 0.48916320332480817
Epoch 5
[5]Train loss 89695.8671875, acc 0.47847036541889487
[5]Test loss 66738.8203125, acc 0.4754627557544757
Epoch 6
[6]Train loss 83050.140625, acc 0.4748624537227478
[6]Test loss 61833.4609375, acc 0.5100999040920716
Epoch 7
[7]Train loss 76389.75, acc 0.48886167957859145
[7]Test loss 56766.62890625, acc 0.44643142583120204
Epoch 8
[8]Train loss 71470.3359375, acc 0.48776687977055627
[8]Test loss 54516.30078125, acc 0.507876438618926


In [33]:
# Persist the learned weights: the full network first, then the embedding
# module on its own.
for module, path in ((model, "model.pth"), (embbeding, "embbeding.pth")):
    torch.save(module.state_dict(), path)

In [34]:
# Encode the training split with the trained embedding and export it.
# (Only the training split is written; the test split is not exported here.)
#
# The rows are taken straight from train_dataset.x_train instead of iterating
# train_dataloader: that loader shuffles, so the exported row order changed on
# every run and no longer matched the order of the training array.
with torch.no_grad():
    new_data = embbeding(torch.tensor(train_dataset.x_train, dtype=torch.float)).numpy()

# put new data in dataframe with same columns as original data
new_df = pd.DataFrame(new_data, columns=df_suic.columns)

# save new data
new_df.to_csv("new_data.csv", index=False)


# Graph Discovery

In [None]:
# import causalnex.structure.notears as notears
# import networkx as nx
# import matplotlib.pyplot as plt

In [None]:
# sm = notears.from_pandas(df_suic, max_iter=1000)


# ths = [0.1, 0.15, 0.2, 0.3, 0.4, 0.5]
# fig, axs = plt.subplots(2, 3, figsize=(30, 20))
# for i, ax in enumerate(axs.flatten()):
#     if i > len(ths) - 1:
#         # Clean up empty axes
#         fig.delaxes(ax)
#         continue

#     th = ths[i]
#     sm.remove_edges_below_threshold(th)

#     # Draw only nodes with edges
#     sm2 = sm.edge_subgraph(sm.edges)
#     labels = {node: node for node in sm2.nodes}
#     nx.draw(sm, ax=ax, with_labels=True, nodelist=sm2.nodes, labels=labels, node_size=100, font_size=10)

#     # Add rec around ax to make it easier to see
#     axis = ax.axis()
#     rec = plt.Rectangle((axis[0], axis[2]), (axis[1] - axis[0]), (axis[3] - axis[2]), fill=False, lw=4, linestyle="dotted")
#     ax.add_patch(rec)
#     ax.set_title(f"Threshold: {th}")



# fig.show()