In [1]:
!pip install torch torchvision torchaudio pandas scikit-learn matplotlib




In [3]:
# Imports & device 
import os, sys, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch, torch.nn as nn, torch.optim as optim

# Pick the GPU when torch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Short environment report: interpreter, torch build, selected device.
for label, value in (
    ("Python:", sys.version),
    ("Torch :", torch.__version__),
    ("Device:", device),
):
    print(label, value)

Python: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 10:07:17) [Clang 14.0.6 ]
Torch : 2.8.0
Device: cpu


In [5]:
# 1. Load data
# Both CSV files are expected in the current working directory.
TRAIN_CSV = "covid.train.csv"
TEST_CSV  = "covid.test.csv"
assert all(os.path.exists(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV)), "Missing train/test CSV files."

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Column names used throughout the notebook.
ID_COL = "id"
TARGET = "tested_positive.2"

assert TARGET in train_df.columns, "Target column missing!"

# Keep the test ids around for the final output file; if the test file has no
# id column, fall back to a plain 0..n-1 running index.
if ID_COL in test_df.columns:
    test_ids = test_df[ID_COL]
else:
    test_ids = pd.Series(range(len(test_df)))


In [7]:
# Step 2: prepare features and labels
# New frames are derived from train_df / test_df instead of dropping columns in
# place, so the loaded frames stay untouched and this cell can be re-run safely.
y = train_df[TARGET].astype(float)
X = train_df.drop(columns=[TARGET]).drop(columns=[ID_COL], errors="ignore")
X_test = test_df.drop(columns=[ID_COL], errors="ignore")

# one-hot encode categoricals; dtype=float keeps the dummy columns numeric so that
# `.values` below yields a float array rather than a mixed bool/float object array
X = pd.get_dummies(X, dtype=float)
X_test = pd.get_dummies(X_test, dtype=float)

# make sure train and test have the same columns:
# join="left" keeps exactly the training columns; columns absent from test are
# filled with 0 and columns that only exist in test are discarded
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)
assert list(X.columns) == list(X_test.columns), "train/test feature columns differ after align"

# Step 3: train/val split + scaling
X_tr_df, X_va_df, y_tr_sr, y_va_sr = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# The scaler is fitted on the training split only; validation and test data are
# transformed with the training min/max so no information leaks from them.
scaler = MinMaxScaler()
X_tr = scaler.fit_transform(X_tr_df.values)   # fit on train
X_va = scaler.transform(X_va_df.values)
X_te = scaler.transform(X_test.values)


In [9]:
# Step 4: convert to tensor and move to device
# Feature matrices become float32 tensors on `device`.
X_tr, X_va, X_te = (
    torch.as_tensor(features, dtype=torch.float32).to(device)
    for features in (X_tr, X_va, X_te)
)

# Targets are reshaped to a single column (n, 1) to line up with the model output.
y_tr, y_va = (
    torch.as_tensor(labels.values, dtype=torch.float32).view(-1, 1).to(device)
    for labels in (y_tr_sr, y_va_sr)
)


In [11]:
# Step 5: define the model (simple feed-forward network)
class DNN(nn.Module):
    """Feed-forward regressor: one [Linear -> ReLU -> Dropout] stage per hidden
    layer, followed by a final Linear layer with a single output unit.

    Args:
        d_in: number of input features.
        hidden: widths of the hidden layers, in order. Defaults to (128, 64).
        p_drop: dropout probability used after every hidden activation.
            Defaults to 0.30.
    """

    def __init__(self, d_in, hidden=(128, 64), p_drop=0.30):
        super().__init__()
        layers = []
        width = d_in
        for h in hidden:
            layers += [nn.Linear(width, h), nn.ReLU(), nn.Dropout(p_drop)]
            width = h
        layers.append(nn.Linear(width, 1))
        # Stored as `net` in a Sequential so parameter names stay net.0.*, net.3.*, ...
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stack; the last dimension goes from d_in to 1."""
        return self.net(x)

# set up model, loss function and optimizer
# Seed torch first: weight initialisation and the dropout masks are random, so
# without a fixed seed every run of the notebook trains a different model and
# the printed metrics cannot be reproduced.
SEED = 42  # same value as the random_state used for the train/val split
torch.manual_seed(SEED)

model = DNN(X_tr.shape[1]).to(device)   # input width = number of feature columns
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)


In [17]:
# 6. training loop with validation and early stopping
# Full-batch training: each epoch is a single optimizer step on all of X_tr.
best_val = float("inf")   # lowest validation MSE seen so far
best_state = None         # snapshot of the weights that produced best_val
patience = 10             # epochs without improvement tolerated before stopping
wait = 0                  # epochs since the last improvement
max_epochs = 120

for epoch in range(1, max_epochs + 1):
    # training step (train mode, so dropout is active for this loss value)
    model.train()
    optimizer.zero_grad()
    pred = model(X_tr)
    loss = criterion(pred, y_tr)
    loss.backward()
    optimizer.step()

    # validation step (eval mode disables dropout; no gradients needed)
    model.eval()
    with torch.no_grad():
        vpred = model(X_va)
        vloss = criterion(vpred, y_va).item()

    # track best model
    if vloss < best_val:
        best_val = vloss
        # state_dict() returns references to the live parameter tensors, which the
        # optimizer keeps updating in place. Clone them so best_state is a real
        # snapshot; otherwise restoring it after the loop would be a no-op.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        wait = 0
    else:
        wait += 1

    # logging
    if epoch % 10 == 0:
        print(f"Epoch {epoch:03d} | train MSE {loss.item():.6f} | val MSE {vloss:.6f}")

    # stop if no improvement
    if wait >= patience:
        print(f"Early stopping @ epoch {epoch} | best val MSE {best_val:.6f}")
        break

# load best model before evaluation
if best_state is not None:
    model.load_state_dict(best_state)

# RMSE of the restored (best) weights on the validation split
val_rmse = float(np.sqrt(best_val))
print("Validation RMSE:", val_rmse)


Epoch 010 | train MSE 39.650257 | val MSE 33.996925
Epoch 020 | train MSE 37.333080 | val MSE 31.404839
Epoch 030 | train MSE 33.979668 | val MSE 26.099939
Epoch 040 | train MSE 29.100935 | val MSE 22.519878
Epoch 050 | train MSE 25.307653 | val MSE 19.293221
Epoch 060 | train MSE 22.606592 | val MSE 16.021355
Epoch 070 | train MSE 19.289167 | val MSE 13.136954
Epoch 080 | train MSE 16.425604 | val MSE 10.775430
Epoch 090 | train MSE 14.489493 | val MSE 8.809349
Epoch 100 | train MSE 13.069925 | val MSE 7.420266
Epoch 110 | train MSE 12.245246 | val MSE 6.435529
Epoch 120 | train MSE 11.664783 | val MSE 5.848244
Validation RMSE: 2.4183143282493416


In [19]:
# Step 7: predict on test set

# Inference only: eval mode, no gradient tracking.
model.eval()
with torch.no_grad():
    test_out = model(X_te)
# Back to the CPU and flattened to a 1-D numpy array, one value per test row.
tpred = test_out.detach().cpu().numpy().ravel()

# write the ids and their predictions side by side into a csv
out_path = "covid_predictions.csv"
submission = pd.DataFrame({"id": test_ids, "prediction": tpred})
submission.to_csv(out_path, index=False)
if os.path.exists(out_path):
    print("covid_predictions.csv saved successfully")


covid_predictions.csv saved successfully
