In [16]:
import pytorch_lightning as pl
import torch.utils.data as torch_data
import torch.nn as nn
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.optim
from pytorch_lightning.loggers import CSVLogger

In [66]:
class PandasDataset(torch_data.Dataset):
    features: pd.DataFrame
    targets: pd.DataFrame

    def __init__(self, features: pd.DataFrame, targets: pd.DataFrame):
        self.features = features
        self.targets = targets

    def __getitem__(self, index):
        x, y =  torch.from_numpy(self.features.iloc[index].to_numpy()), torch.from_numpy(self.targets.iloc[index].to_numpy())
        return x,y

    def __len__(self):
        return len(self.features)


class ElectionDataModule(pl.LightningDataModule):
    batch_size: int

    stratification_columns: list[str]
    feature_columns: list[str]
    target_columns: list[str]

    data_train: PandasDataset
    data_test: PandasDataset
    data_val: PandasDataset

    def __init__(self,stratification_columns: list[str],feature_columns: list[str],target_columns: list[str],batch_size=32):
        super().__init__()
        self.stratification_columns = stratification_columns
        self.target_columns = target_columns
        self.feature_columns = feature_columns
        self.batch_size = batch_size
        
    def setup(self, stage):
        df = pd.read_csv("./data/volby/dataset_extended.csv", sep=';')

        aux_indices, test_indices = train_test_split(df.index, train_size= 3/20, stratify = df[self.stratification_columns])
        train_indices, val_indices = train_test_split(aux_indices, train_size= 3/17, stratify=df[self.stratification_columns].iloc[aux_indices])
        scaler = StandardScaler()
        scaler = scaler.fit(df[self.feature_columns].iloc[train_indices])
        scale = lambda x: pd.DataFrame(scaler.transform(x))
        
        self.data_test = PandasDataset(scale(df[self.feature_columns].iloc[train_indices]),df[self.target_columns].iloc[train_indices])
        self.data_train = PandasDataset(scale(df[self.feature_columns].iloc[test_indices]),df[self.target_columns].iloc[test_indices])
        self.data_val = PandasDataset(scale(df[self.feature_columns].iloc[val_indices]),df[self.target_columns].iloc[val_indices])

    def train_dataloader(self):
        return torch_data.DataLoader(self.data_train, batch_size = self.batch_size)

    def test_dataloader(self):
        return torch_data.DataLoader(self.data_train, batch_size = self.batch_size)
    
    def validation_dataloader(self):
        return torch_data.DataLoader(self.data_val, batch_size=self.batch_size)

In [61]:
class SimpleModule(nn.Module):

    def __init__(self, num_features: int):
        super().__init__()
        self.layer = nn.Linear(num_features, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layer(x)


class SeparateMultiLogRegression(nn.Module):

    def __init__(self, num_features: int, num_targets: int):
        super().__init__()
        self.regressors = nn.ModuleList([
            SimpleModule(num_features) for _ in range(num_targets)
        ])
        self.softmax = nn.Softmax()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        xs = torch.cat([reg(x) for reg in self.regressors])
        return self.softmax(xs)


class JointMultiLogRegression(nn.Module):

    def __init__(self, num_features: int, num_targets: int, hidden_layer_size: int):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(num_features, hidden_layer_size),
            nn.ReLU(),
            nn.Linear(hidden_layer_size, num_targets), 
            nn.ReLU(),
            nn.Softmax()
        )
    def forward(self, x):
        return self.seq(x)



In [19]:
class ElectionPredictionModule(pl.LightningModule):
    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
    
    def training_step(self, batch) -> torch.Tensor:
        x, y = batch
        preds = self.model(x)
        loss = torch.kl_div(preds, y)
        return loss
    def configure_optimizers(self) -> torch.optim.Optimizer:
        optim = torch.optim.Adam(self.parameters())
        return optim
        

In [67]:
data = ElectionDataModule(
    feature_columns=['sl11vs', 'sl11vos', 'sl11nast', 'sl11strm','sl11strb', 'sl11zakl'],
    target_columns=[ 'par21spd',
       'par21spolu', 'par21pirsta', 'par21ano', 'par21soc', 'par21pri',
       'par21ksc', 'par21zel', 'par21tss'],
    stratification_columns=['kraj']
)
model = ElectionPredictionModule(SeparateMultiLogRegression(6,9))
trainer = pl.Trainer(CSVLogger("logs"))
trainer.fit(model, data)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | SeparateMultiLogRegression | 63    
-----------------------------------------------------
63        Trainable params
0         Non-trainable params
63        Total params
0.000     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

RuntimeError: mat1 and mat2 must have the same dtype

In [55]:
data.data_train.targets.index

Int64Index([11647,  3729,  9400, 11145, 12650,  6045,  9766,  7321, 13517,
             7833,
            ...
             7837, 12105,  7089,  5542, 12912,  4750,  4724, 11682, 13848,
            13943],
           dtype='int64', length=12131)