In [1]:
import os
import glob
import numpy as np
import torch
from torch import nn, optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import matplotlib.pyplot as plt
import pyodbc


In [None]:
IMAGE_DIR = "project_p2/medical_images_subset"
image_paths = glob.glob(os.path.join(IMAGE_DIR, "*"))

len(image_paths)


20

In [7]:
def get_label(filename):
    fname = filename.lower()
    if "covid" in fname:
        return 1
    else:
        return 0

labels = [get_label(p) for p in image_paths]
list(zip(image_paths[:5], labels[:5]))


[('/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.25.20021568-p23-108%9.png',
  1),
 ('/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/noncovid_25.png',
  1),
 ('/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.03.01.20029769-p21-73_1%0.png',
  1),
 ('/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.25.20021568-p24-111%8.png',
  1),
 ('/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.22.20024927-p20-70%0.png',
  1)]

In [8]:
class CTDataset(Dataset):
    def __init__(self, image_paths):
        self.image_paths = image_paths
        self.labels = [get_label(p) for p in image_paths]

        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        image = Image.open(path).convert("RGB")
        label = self.labels[idx]
        image = self.transform(image)
        return image, label, path


In [9]:
from sklearn.model_selection import train_test_split

train_paths, val_paths = train_test_split(image_paths, test_size=0.3, random_state=42)

train_ds = CTDataset(train_paths)
val_ds = CTDataset(val_paths)

train_dl = DataLoader(train_ds, batch_size=4, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=4, shuffle=False)


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = models.resnet18(weights="IMAGENET1K_V1")
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 1),
    nn.Sigmoid()
)

model = model.to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [11]:
def train_model(model, train_dl, val_dl, epochs=5):
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for imgs, labels, _ in train_dl:
            imgs = imgs.to(device)
            labels = labels.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for imgs, labels, _ in val_dl:
                imgs = imgs.to(device)
                labels = labels.float().unsqueeze(1).to(device)
                outputs = model(imgs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                preds = (outputs > 0.5).int()
                correct += (preds == labels.int()).sum().item()
                total += labels.size(0)

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.3f}, Val Loss={val_loss:.3f}, Val Acc={correct/total:.2f}")

train_model(model, train_dl, val_dl, epochs=5)


Epoch 1: Train Loss=1.831, Val Loss=1.098, Val Acc=1.00
Epoch 2: Train Loss=1.578, Val Loss=1.140, Val Acc=0.83
Epoch 3: Train Loss=1.403, Val Loss=1.248, Val Acc=0.67
Epoch 4: Train Loss=1.174, Val Loss=1.212, Val Acc=0.67
Epoch 5: Train Loss=0.949, Val Loss=0.796, Val Acc=0.83


In [None]:
# Save CT Risk Model for use in e2e_pipeline
import os
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), '/models/ct_risk_model.pth')
print("CT Risk Model saved to models/ct_risk_model.pth")


CT Risk Model saved to models/ct_risk_model.pth


In [14]:
model.eval()
risk_scores = {}

with torch.no_grad():
    for path in image_paths:
        img = Image.open(path).convert("RGB")
        img = train_ds.transform(img).unsqueeze(0).to(device)
        score = model(img).item()
        risk_scores[path] = score

risk_scores


{'/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.25.20021568-p23-108%9.png': 0.7002732157707214,
 '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/noncovid_25.png': 0.5397704243659973,
 '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.03.01.20029769-p21-73_1%0.png': 0.7902732491493225,
 '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.25.20021568-p24-111%8.png': 0.6054553389549255,
 '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.22.20024927-p20-70%0.png': 0.7533910274505615,
 '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/noncovid_11%1.jpg': 0.7402088642120361,
 '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_bmj.m606.full-p4-22%2.png': 0.5

In [15]:
customer_ids = list(range(1, len(image_paths)+1))
mapped_scores = list(zip(customer_ids, list(risk_scores.values()), image_paths))

mapped_scores[:5]


[(1,
  0.7002732157707214,
  '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.25.20021568-p23-108%9.png'),
 (2,
  0.5397704243659973,
  '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/noncovid_25.png'),
 (3,
  0.7902732491493225,
  '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.03.01.20029769-p21-73_1%0.png'),
 (4,
  0.6054553389549255,
  '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.25.20021568-p24-111%8.png'),
 (5,
  0.7533910274505615,
  '/Users/grishmadeshmukh/Desktop/NYUMasters/Sem3/Database/project_p2/medical_images_subset/covid_2020.02.22.20024927-p20-70%0.png')]

In [None]:
import pymssql
SERVER = "server"
DATABASE = "db"
USERNAME = "username"
PASSWORD = "password"

conn = pymssql.connect(
    server= SERVER,
    user= USERNAME,
    password= PASSWORD,
    database= DATABASE,
    port=1433,
    tds_version="7.4"
)
cursor = conn.cursor()


In [18]:
for cust_id, score, path in mapped_scores:
    cursor.execute("""
        INSERT INTO CustomerHealthFactor
        (CustomerID, FactorName, FactorValue, FactorYear, SourceAssetID)
        VALUES (%s, 'CT_RiskScore', %s, YEAR(GETDATE()), NULL)
    """, (cust_id, float(score)))

conn.commit()
print("Risk scores inserted successfully!")


Risk scores inserted successfully!


In [19]:
cursor.execute("SELECT TOP 20 * FROM CustomerHealthFactor")
cursor.fetchall()


[(1,
  1,
  'covid_2020.02.10.20021584-p6-52%10.png',
  'COVID Infection Risk',
  0.88,
  2024,
  1),
 (2,
  2,
  'covid_2020.02.13.20022673-p13-77%1.png',
  'COVID Infection Risk',
  0.82,
  2024,
  1),
 (3,
  3,
  'covid_2020.02.22.20024927-p20-70%0.png',
  'COVID Infection Risk',
  0.76,
  2024,
  1),
 (4,
  4,
  'covid_2020.02.25.20021568-p23-108%9.png',
  'COVID Infection Risk',
  0.91,
  2024,
  1),
 (5,
  5,
  'covid_2020.02.25.20021568-p24-111%8.png',
  'COVID Infection Risk',
  0.89,
  2024,
  1),
 (6,
  6,
  'covid_2020.02.25.20027763-p15-53%0.png',
  'COVID Infection Risk',
  0.65,
  2024,
  1),
 (7,
  7,
  'covid_2020.03.01.20029769-p21-73_1%0.png',
  'COVID Infection Risk',
  0.74,
  2024,
  1),
 (8,
  8,
  'covid_2020.03.03.20030775-p10-88%1.png',
  'COVID Infection Risk',
  0.93,
  2024,
  1),
 (9,
  9,
  'covid_2020.03.16.20035105-p6-64-1.png',
  'COVID Infection Risk',
  0.79,
  2024,
  1),
 (10,
  10,
  'covid_bmj.m606.full-p4-22%2.png',
  'COVID Infection Risk',
  0.

# CITY WELLNESS MODEL

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("raw/city_wellness.csv", on_bad_lines='skip')

df["Obesity levels(Country)"] = df["Obesity levels(Country)"].str.replace("%", "", regex=False)
df["Cost of a bottle of water(City)"] = df["Cost of a bottle of water(City)"].str.replace("£", "").str.replace("$", "").str.replace("€", "")
df["Cost of a monthly gym membership(City)"] = df["Cost of a monthly gym membership(City)"].str.replace("£", "").str.replace("$", "").str.replace("€", "")

numeric_cols = [
    "Sunshine hours(City)",
    "Obesity levels(Country)",
    "Life expectancy(years) (Country)",
    "Pollution(Index score) (City)",
    "Annual avg. hours worked",
    "Happiness levels(Country)",
    "Outdoor activities(City)",
    "Cost of a bottle of water(City)",
    "Cost of a monthly gym membership(City)"
]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

df["WellnessScore"] = (
    0.15 * (df["Sunshine hours(City)"] / df["Sunshine hours(City)"].max()) +
    0.20 * (df["Happiness levels(Country)"] / df["Happiness levels(Country)"].max()) +
    0.15 * (df["Life expectancy(years) (Country)"] / df["Life expectancy(years) (Country)"].max()) +
    0.10 * (df["Outdoor activities(City)"] / df["Outdoor activities(City)"].max()) +
    0.10 * (1 - df["Obesity levels(Country)"] / df["Obesity levels(Country)"].max()) +
    0.10 * (1 - df["Pollution(Index score) (City)"] / df["Pollution(Index score) (City)"].max()) +
    0.10 * (1 - df["Annual avg. hours worked"] / df["Annual avg. hours worked"].max()) +
    0.10 * (1 - df["Cost of a monthly gym membership(City)"] / df["Cost of a monthly gym membership(City)"].max())
)

df[["City", "WellnessScore"]].head(10)


Unnamed: 0,City,WellnessScore
0,Amsterdam,0.683076
1,Sydney,0.667695
2,Vienna,0.655445
3,Stockholm,0.637824
4,Copenhagen,0.648478
5,Helsinki,0.638817
6,Fukuoka,0.602497
7,Berlin,0.633742
8,Barcelona,0.654622
9,Vancouver,0.623754


In [26]:
from sklearn.preprocessing import StandardScaler


In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numeric_cols])

features = scaled_features
labels = df["WellnessScore"].values.reshape(-1, 1)
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

class CityDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = CityDataset(X_train, y_train)
val_ds = CityDataset(X_val, y_val)

train_dl = DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=8, shuffle=False)

class WellnessNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(len(numeric_cols), 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.net(x)

model = WellnessNN()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(300):
    model.train()
    for Xb, yb in train_dl:
        optimizer.zero_grad()
        pred = model(Xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
    
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss = {loss.item():.6f}")

model.eval()
X_all = torch.tensor(scaled_features, dtype=torch.float32)
df["PredictedWellness"] = model(X_all).detach().numpy().flatten()

print(df[["City", "WellnessScore", "PredictedWellness"]].head(10))

import pickle
os.makedirs('/models', exist_ok=True)

torch.save(model.state_dict(), 'models/wellness_model.pth')
print("Wellness Model saved to models/wellness_model.pth")

with open('/models/wellness_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Wellness Scaler saved to models/wellness_scaler.pkl")

Epoch 0, Loss = 0.243354
Epoch 50, Loss = 0.000043
Epoch 100, Loss = 0.000001
Epoch 150, Loss = 0.000002
Epoch 200, Loss = 0.000001
Epoch 250, Loss = 0.000639
         City  WellnessScore  PredictedWellness
0   Amsterdam       0.683076           0.685419
1      Sydney       0.667695           0.668767
2      Vienna       0.655445           0.654530
3   Stockholm       0.637824           0.634480
4  Copenhagen       0.648478           0.633085
5    Helsinki       0.638817           0.635829
6     Fukuoka       0.602497           0.601052
7      Berlin       0.633742           0.633372
8   Barcelona       0.654622           0.643666
9   Vancouver       0.623754           0.621229
Wellness Model saved to models/wellness_model.pth
Wellness Scaler saved to models/wellness_scaler.pkl


In [29]:
cursor.execute("SELECT COUNT(*) FROM Customer")
n_customers = cursor.fetchone()[0]
n_customers


20

In [30]:
# get number of customers
cursor.execute("SELECT COUNT(*) FROM Customer")
n_customers = cursor.fetchone()[0]
print("Number of customers:", n_customers)

# build insert rows only for existing customers
insert_rows = []

for idx, row in df.head(n_customers).iterrows():  # LIMIT to real customers
    insert_rows.append(
        (
            idx + 1,                        # CustomerID (1..n_customers)
            "City_WellnessScore_Model",     # FactorName
            float(row["PredictedWellness"]),# FactorValue
            2024,                           # FactorYear
            None                            # SourceAssetID
        )
    )

# insert
query = """
INSERT INTO CustomerHealthFactor
(CustomerID, FactorName, FactorValue, FactorYear, SourceAssetID)
VALUES (%s, %s, %s, %s, %s)
"""

cursor.executemany(query, insert_rows)
conn.commit()

print("Inserted", len(insert_rows), "records into CustomerHealthFactor successfully!")


Number of customers: 20
Inserted 20 records into CustomerHealthFactor successfully!
