In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
df = pd.read_csv(r"C:\Users\lkneh\HealthScore-Predictor\data\clean\HealthInspections.csv")
df["failFlag"].value_counts(dropna=False)


failFlag
0    29822
1     2275
Name: count, dtype: int64

In [90]:
df["inspection_date"] = pd.to_datetime(
    dict(year=df["insp_year"], month=df["insp_month"], day=df["insp_day"])
)
train_mask = df["inspection_date"] <= "2023-12-31"
val_mask   = (df["inspection_date"] >= "2024-01-01") & (df["inspection_date"] <= "2024-12-31")
test_mask  = df["inspection_date"] >= "2025-01-01"


train_df = df[train_mask].copy()
val_df   = df[val_mask].copy()
test_df  = df[test_mask].copy()
train_df=train_df.drop(columns=["inspection_date"])
val_df=val_df.drop(columns=["inspection_date"])
test_df=test_df.drop(columns=["inspection_date"])
# print(test_df.columns)

test_df.to_csv("../../data/clean/HealthInspectionTest.csv",index=False)
val_df.to_csv("../../data/clean/HealthInspectionVal.csv",index=False)
train_df['failFlag'].value_counts(dropna=False)


failFlag
0    15576
1     1333
Name: count, dtype: int64

In [73]:
label_col = "failFlag"

# 1) Separate features and label
X = train_df.drop(columns=[label_col]).values      # all columns you listed
y = df[label_col].values.astype(np.float32)  # 0/1

# 2) Scale features to [-1, 1] (good for Tanh generator) [web:120]
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X)

# 3) Convert to tensors
data = torch.tensor(X_scaled, dtype=torch.float32)
labels = torch.tensor(y, dtype=torch.float32)

input_dim = data.shape[1]        # number of features
latent_dim = 20                  # noise size (tunable)



In [74]:
class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Tanh(),           # matches [-1, 1] scaling
        )

    def forward(self, z):
        return self.model(z)


class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1),    # logits (no Sigmoid)
        )

    def forward(self, x):
        return self.model(x)

generator = Generator(latent_dim, input_dim)
discriminator = Discriminator(input_dim)

criterion = nn.BCEWithLogitsLoss()          # works with logits [web:102]
lr = 2e-4
optimizer_g = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
optimizer_d = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))

In [75]:
num_epochs = 300
batch_size = 128

for epoch in range(num_epochs):
    for _ in range(len(data) // batch_size):
        # ----- Train D -----
        optimizer_d.zero_grad()

        idx = torch.randint(len(data), (batch_size,))
        real_data = data[idx]
        real_labels = torch.ones(batch_size, 1)

        z = torch.randn(batch_size, latent_dim)
        fake_data = generator(z).detach()
        fake_labels = torch.zeros(batch_size, 1)

        out_real = discriminator(real_data)
        out_fake = discriminator(fake_data)

        loss_real = criterion(out_real, real_labels)
        loss_fake = criterion(out_fake, fake_labels)
        d_loss = loss_real + loss_fake
        d_loss.backward()
        optimizer_d.step()

        # ----- Train G -----
        optimizer_g.zero_grad()
        z = torch.randn(batch_size, latent_dim)
        fake_data = generator(z)
        out_fake = discriminator(fake_data)

        g_loss = criterion(out_fake, real_labels)  # want D to think fake is real
        g_loss.backward()
        optimizer_g.step()

    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1}/{num_epochs} | D: {d_loss.item():.4f} | G: {g_loss.item():.4f}")


Epoch 50/300 | D: 1.2208 | G: 0.8508
Epoch 100/300 | D: 1.3875 | G: 0.7010
Epoch 150/300 | D: 1.3404 | G: 0.7296
Epoch 200/300 | D: 1.3626 | G: 0.7215
Epoch 250/300 | D: 1.3674 | G: 0.7029
Epoch 300/300 | D: 1.1440 | G: 0.7965


In [82]:
n_synth = 9000
z = torch.randn(n_synth, latent_dim)
synth_scaled = generator(z).detach().cpu().numpy()

# invert scaling back to original feature space
synth_X = scaler.inverse_transform(synth_scaled)

synth_df = pd.DataFrame(synth_X, columns=train_df.drop(columns=[label_col]).columns)

#minority class = 1
synth_df[label_col] = 1.0

synth_df.to_csv("../../data/clean/HealthInspectionsynthetic.csv", index=False)

In [83]:
df_syn=pd.read_csv("../../data/clean/HealthInspectionsynthetic.csv")
df_syn["failFlag"].value_counts(dropna=False)

failFlag
1.0    9000
Name: count, dtype: int64

In [None]:
#there are inspection type feature with continuos number check if this might imporove the model?

# cat_cols = [c for c in synth_df.columns if c.startswith("inspection_type_clean_")]

# def collapse_one_hot(row):
#     vals = row[cat_cols].values.astype(float)
#     idx = np.argmax(vals)             # position of max logit
#     one_hot = np.zeros_like(vals)
#     one_hot[idx] = 1.0
#     row[cat_cols] = one_hot
#     return row

# df_syn = synth_df.apply(collapse_one_hot, axis=1)

In [85]:
df_balanced = pd.concat([train_df, df_syn], ignore_index=True)
df_balanced.to_csv("../../data/clean/HealthInspectionBalancedTrain.csv", index=False)
df_balanced["failFlag"].value_counts(dropna=False)

failFlag
0.0    15576
1.0    10333
Name: count, dtype: int64