In [6]:
import torch
import scipy.sparse as sp
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.optim import Adam
from sklearn.preprocessing import LabelEncoder

# Single seed shared by the train/test split and PyTorch so that a fresh
# "Restart & Run All" reproduces the same shuffling and weight initialisation.
SEED = 42


def _scipy_to_torch_sparse(matrix, device):
    """Convert a SciPy sparse matrix into a float32 sparse COO tensor on ``device``.

    Coordinates and values are all taken from one COO view of ``matrix``, so
    each stored value stays paired with its own (row, col) position. Pairing
    ``matrix.nonzero()`` with ``matrix.data`` is fragile: ``nonzero()`` drops
    explicitly stored zeros while ``.data`` keeps them, so the two can differ
    in length.

    Args:
        matrix: SciPy sparse matrix (any format supporting ``tocoo()``).
        device: ``torch.device`` the resulting tensor is placed on.

    Returns:
        An (uncoalesced) ``torch.sparse_coo_tensor`` with float32 values and
        the same shape as ``matrix``.
    """
    coo = matrix.tocoo()
    # sparse_coo_tensor expects a 2 x nnz int64 index tensor.
    indices = torch.stack([torch.as_tensor(coo.row), torch.as_tensor(coo.col)]).long()
    values = torch.as_tensor(coo.data, dtype=torch.float32)
    return torch.sparse_coo_tensor(indices, values, coo.shape, device=device)


# Load the processed data
combined_sparse = sp.load_npz('data/combined_sparse.npz')
df_targets = pd.read_csv('data/df_targets.csv')
df_targets['PRIM_CONTRIBUTORY_CAUSE'] = pd.Categorical(df_targets['PRIM_CONTRIBUTORY_CAUSE'])

# Encode the target variable
# NOTE: the encoder is fitted on the integer category codes, so
# label_encoder.classes_ holds those codes rather than the cause names; the
# names are available via df_targets['PRIM_CONTRIBUTORY_CAUSE'].cat.categories.
label_encoder = LabelEncoder()
df_targets_encoded = pd.DataFrame()
df_targets_encoded['PRIM_CONTRIBUTORY_CAUSE_LABEL'] = label_encoder.fit_transform(df_targets['PRIM_CONTRIBUTORY_CAUSE'].cat.codes)

# Split the data into train and test sets (80% / 20%)
x_train, x_test, y_train, y_test = train_test_split(
    combined_sparse,
    df_targets_encoded['PRIM_CONTRIBUTORY_CAUSE_LABEL'].values,
    test_size=0.2,
    random_state=SEED
)

# Convert the target arrays to PyTorch tensors (CrossEntropyLoss needs int64 targets)
y_train_tensor = torch.tensor(y_train).long()
y_test_tensor = torch.tensor(y_test).long()

# Set the device to use for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Move the data to the device
x_train = _scipy_to_torch_sparse(x_train, device)
x_test = _scipy_to_torch_sparse(x_test, device)
y_train_tensor = y_train_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

# Seed PyTorch before any of its random number generation is used
# (DataLoader shuffling here, parameter initialisation afterwards).
torch.manual_seed(SEED)

# Create the training dataset and data loader
train_dataset = TensorDataset(x_train, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define the logistic regression model
class LogisticRegression(nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    The module returns raw class scores (logits); no softmax is applied here.
    """

    def __init__(self, input_size: int, num_classes: int) -> None:
        """Create the linear layer mapping ``input_size`` features to ``num_classes`` scores."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the logits produced by the linear layer for the batch ``x``."""
        scores = self.linear(x)
        return scores

# Build the classifier on the target device: one input per feature column,
# one output per encoded class.
n_features = combined_sparse.shape[1]
n_classes = len(label_encoder.classes_)
model = LogisticRegression(input_size=n_features, num_classes=n_classes).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Training loop: 10 full passes over the training loader
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_data, batch_targets in train_loader:
        # Make sure the batch lives on the same device as the model
        batch_data, batch_targets = batch_data.to(device), batch_targets.to(device)

        # Reset gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass and loss
        logits = model(batch_data)
        loss = criterion(logits, batch_targets)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

# Evaluation on the held-out test set, without tracking gradients
model.eval()
with torch.no_grad():
    logits = model(x_test)

    # The predicted label is the index of the highest logit in each row
    predicted = torch.max(logits, dim=1).indices

    # Accuracy = number of correct predictions / number of test samples
    n_correct = (predicted == y_test_tensor).sum().item()
    accuracy = n_correct / y_test_tensor.size(0)

print("Accuracy:", accuracy)


cuda


  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():


Accuracy: 0.8054405677114134


In [7]:
import torch
import scipy.sparse as sp
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.optim import Adam
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

# Single seed shared by the split, the oversampler and PyTorch so that a fresh
# "Restart & Run All" reproduces the same resampling, shuffling and weight
# initialisation.
SEED = 42


def _scipy_to_torch_sparse(matrix, device):
    """Convert a SciPy sparse matrix into a float32 sparse COO tensor on ``device``.

    Coordinates and values are all taken from one COO view of ``matrix``, so
    each stored value stays paired with its own (row, col) position. Pairing
    ``matrix.nonzero()`` with ``matrix.data`` is fragile: ``nonzero()`` drops
    explicitly stored zeros while ``.data`` keeps them, so the two can differ
    in length.

    Args:
        matrix: SciPy sparse matrix (any format supporting ``tocoo()``).
        device: ``torch.device`` the resulting tensor is placed on.

    Returns:
        An (uncoalesced) ``torch.sparse_coo_tensor`` with float32 values and
        the same shape as ``matrix``.
    """
    coo = matrix.tocoo()
    # sparse_coo_tensor expects a 2 x nnz int64 index tensor.
    indices = torch.stack([torch.as_tensor(coo.row), torch.as_tensor(coo.col)]).long()
    values = torch.as_tensor(coo.data, dtype=torch.float32)
    return torch.sparse_coo_tensor(indices, values, coo.shape, device=device)


# Load the processed data
combined_sparse = sp.load_npz('data/combined_sparse.npz')
df_targets = pd.read_csv('data/df_targets.csv')
df_targets['PRIM_CONTRIBUTORY_CAUSE'] = pd.Categorical(df_targets['PRIM_CONTRIBUTORY_CAUSE'])

# Encode the target variable
# NOTE: the encoder is fitted on the integer category codes, so
# label_encoder.classes_ holds those codes rather than the cause names; the
# names are available via df_targets['PRIM_CONTRIBUTORY_CAUSE'].cat.categories.
label_encoder = LabelEncoder()
df_targets_encoded = pd.DataFrame()
df_targets_encoded['PRIM_CONTRIBUTORY_CAUSE_LABEL'] = label_encoder.fit_transform(df_targets['PRIM_CONTRIBUTORY_CAUSE'].cat.codes)

# Split the data into train and test sets (80% / 20%)
x_train, x_test, y_train, y_test = train_test_split(
    combined_sparse,
    df_targets_encoded['PRIM_CONTRIBUTORY_CAUSE_LABEL'].values,
    test_size=0.2,
    random_state=SEED
)

# Apply Random Oversampling to the training set only; the test set is left
# untouched so evaluation uses the original class distribution.
ros = RandomOverSampler(random_state=SEED)
x_train_resampled, y_train_resampled = ros.fit_resample(x_train, y_train)

# Convert the target arrays to PyTorch tensors (CrossEntropyLoss needs int64 targets)
y_train_tensor = torch.tensor(y_train_resampled).long()
y_test_tensor = torch.tensor(y_test).long()

# Set the device to use for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Move the data to the device
x_train = _scipy_to_torch_sparse(x_train_resampled, device)
x_test = _scipy_to_torch_sparse(x_test, device)
y_train_tensor = y_train_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

# Seed PyTorch before any of its random number generation is used
# (DataLoader shuffling here, parameter initialisation afterwards).
torch.manual_seed(SEED)

# Create the training dataset and data loader
train_dataset = TensorDataset(x_train, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define the logistic regression model
class LogisticRegression(nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    The module returns raw class scores (logits); no softmax is applied here.
    """

    def __init__(self, input_size: int, num_classes: int) -> None:
        """Create the linear layer mapping ``input_size`` features to ``num_classes`` scores."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the logits produced by the linear layer for the batch ``x``."""
        scores = self.linear(x)
        return scores

# Build the classifier on the target device: one input per feature column,
# one output per encoded class.
n_features = combined_sparse.shape[1]
n_classes = len(label_encoder.classes_)
model = LogisticRegression(input_size=n_features, num_classes=n_classes).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Training loop: 10 full passes over the training loader
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_data, batch_targets in train_loader:
        # Make sure the batch lives on the same device as the model
        batch_data, batch_targets = batch_data.to(device), batch_targets.to(device)

        # Reset gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass and loss
        logits = model(batch_data)
        loss = criterion(logits, batch_targets)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

# Evaluation on the held-out test set, without tracking gradients
model.eval()
with torch.no_grad():
    logits = model(x_test)

    # The predicted label is the index of the highest logit in each row
    predicted = torch.max(logits, dim=1).indices

    # Accuracy = number of correct predictions / number of test samples
    n_correct = (predicted == y_test_tensor).sum().item()
    accuracy = n_correct / y_test_tensor.size(0)

print("Accuracy:", accuracy)


cuda


  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
