This problem is for classifying if the location of a provider's office is colocated with another provider's office by using geolocation information. This dataset required downsampling to solve the imbalanced class data issue in the data preprocessing.

In [1]:
import autoreload
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
from google.colab import drive
#drive.mount('/content/')

In [4]:
data = pd.read_csv('./geolocations_20240115.csv')

In [5]:
data[data['normadd'].str.contains('91 tompkins ave')]

Unnamed: 0,address,address2,city,state,countyssa,clientname,geoinfo,npi,name,latitude,longitude,normadd,distance_in_miles,target
29443,91 tompkins ave,,staten island,ny,33610.0,quest_elderplan,Address-Exact,1982971180,"CHRISTOPHER M GLENN, MD",40.622485,-74.078314,91 tompkins ave staten island ny,878.85,0.0


In [6]:
X= data.drop(columns= ['target', 'geoinfo'], axis= 1)
Y= data['target']

In [7]:
Y.value_counts()

0.0    21810
1.0     8827
Name: target, dtype: int64

In [8]:
X.shape, Y.shape

((30637, 12), (30637,))

## this is imbalance dataset with only 27% of data is 1. I downsampled class 0


In [9]:
for col in X.columns:
  if X[col].dtype == 'object':
    X[col] = X[col].fillna('NA')
  else:
    X[col] = X[col].fillna(0)
  le= LabelEncoder()
  X[col] = le.fit_transform(X[col])

In [10]:
#try to make all columns as categorical variables
for col in X.columns:
  X[col] = X[col].astype('category')

In [11]:
X.dtypes

address              category
address2             category
city                 category
state                category
countyssa            category
clientname           category
npi                  category
name                 category
latitude             category
longitude            category
normadd              category
distance_in_miles    category
dtype: object

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, Y, random_state= 42)

In [13]:
y_train.value_counts()

0.0    16330
1.0     6647
Name: target, dtype: int64

In [14]:
y_val.value_counts()

0.0    5480
1.0    2180
Name: target, dtype: int64

In [15]:
train_data= pd.concat([X_train, y_train], axis=1)

In [16]:
class_0= train_data[train_data['target']==0]
class_1= train_data[train_data['target'] ==1]

In [17]:
from sklearn.utils import resample
class_0_downsampled= resample(class_0, replace = False, n_samples= len(class_1), random_state= 43)

In [18]:
downsampled_data= pd.concat([class_0_downsampled, class_1])
downsampled_data= downsampled_data.sample(frac=1, random_state= 42)

In [19]:
X_train_downsampled= downsampled_data.drop('target', axis=1)
y_train_downsampled= downsampled_data['target']

In [20]:
y_train_downsampled.value_counts()

1.0    6647
0.0    6647
Name: target, dtype: int64

In [21]:

X_train_downsampled.reset_index(inplace = True, drop= True)
X_val.reset_index(inplace = True, drop= True)

In [22]:
emb_c= {n: len(col.cat.categories) for n, col in X.items() if len(col.cat.categories) >2}

In [23]:
emb_c

{'address': 24102,
 'address2': 1810,
 'city': 4260,
 'state': 52,
 'countyssa': 1857,
 'clientname': 77,
 'npi': 28724,
 'name': 23362,
 'latitude': 22919,
 'longitude': 22932,
 'normadd': 24353,
 'distance_in_miles': 20569}

In [24]:
#size of the categories, size of the embedding
emb_szs= [(c, min(50, (c+1)//2)) for _,c in emb_c.items()]
emb_szs

[(24102, 50),
 (1810, 50),
 (4260, 50),
 (52, 26),
 (1857, 50),
 (77, 39),
 (28724, 50),
 (23362, 50),
 (22919, 50),
 (22932, 50),
 (24353, 50),
 (20569, 50)]

##Dataset

In [25]:
emb_cols= emb_c.keys()

In [26]:
X.columns

Index(['address', 'address2', 'city', 'state', 'countyssa', 'clientname',
       'npi', 'name', 'latitude', 'longitude', 'normadd', 'distance_in_miles'],
      dtype='object')

In [27]:
X.dtypes

address              category
address2             category
city                 category
state                category
countyssa            category
clientname           category
npi                  category
name                 category
latitude             category
longitude            category
normadd              category
distance_in_miles    category
dtype: object

In [28]:
emb_cols

dict_keys(['address', 'address2', 'city', 'state', 'countyssa', 'clientname', 'npi', 'name', 'latitude', 'longitude', 'normadd', 'distance_in_miles'])

In [29]:
class GeoLocationDataset(Dataset):
  def __init__(self, X, Y, emb_cols):
    X= X.copy()
    self.X= X.loc[: ,emb_cols].copy().values.astype(np.int64)
    self.y= Y.copy().values.astype(np.float64)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):

    return self.X[idx,:], self.y[idx]

In [30]:
train_ds= GeoLocationDataset(X_train_downsampled, y_train_downsampled, emb_cols)
valid_ds= GeoLocationDataset(X_val, y_val, emb_cols)

In [31]:
train_ds[2]

(array([10946,   242,    32,    32,   997,    34, 16230,  4033,  5523,
         4291, 11061, 15690]),
 0.0)

In [32]:
emb_cols = ['address', 'address2', 'city', 'state', 'countyssa', 'clientname', 'npi', 'name', 'latitude', 'longitude', 'normadd', 'distance_in_miles']
X1= X.loc[: ,emb_cols].copy().values.astype('str')
X1

array([['19075', '242', '3604', ..., '6594', '19282', '0'],
       ['24071', '228', '324', ..., '16384', '0', '12276'],
       ['4', '242', '22', ..., '12816', '1', '4291'],
       ...,
       ['24098', '242', '2309', ..., '8618', '24350', '0'],
       ['24099', '242', '4002', ..., '8550', '24351', '256'],
       ['24101', '242', '437', ..., '20193', '24352', '9453']],
      dtype='<U21')

In [33]:
len(valid_ds)

7660

In [34]:
train_ds[3]

(array([15650,   242,  2597,    34,  1074,    49,  4662, 14175, 15631,
        21066, 15823,     1]),
 1.0)

In [35]:
valid_ds[0]

(array([22291,   242,   437,    34,  1038,    76, 12335, 14175, 12651,
        19068, 22534,     0]),
 1.0)

In [36]:
class CategoricalEmbeddingModel(nn.Module):
  def __init__(self, emb_szs):
    super().__init__()
    self.embs= nn.ModuleList([nn.Embedding(c, s) for c, s in emb_szs])
    n_emb= sum(e.embedding_dim for e in self.embs)
    self.n_emb= n_emb
    self.lin1= nn.Linear(self.n_emb, 100)
    self.lin2= nn.Linear(100,1)
    self.bn= nn.BatchNorm1d(100)
    self.emb_dropout= nn.Dropout(0.5)
    self.drops= nn.Dropout(0.2)

  def forward(self, x_cat):
    #This is applying the current embedding layer e to the corresponding column of the input tensor. It's essentially embedding the categorical values in that column.
    x= [e(x_cat[:, i]) for i, e in enumerate(self.embs)]
    x = torch.cat(x, 1)
    x= self.emb_dropout(x)
    x= F.relu(self.lin1(x))
    x= self.drops(x)
    x= self.bn(x)
    x= self.lin2(x)
    return x

In [37]:
model= CategoricalEmbeddingModel(emb_szs)

In [38]:
embs= nn.ModuleList([nn.Embedding(c,s) for c, s in emb_szs])

In [39]:
batch_size= 5
train_dl = DataLoader(train_ds, batch_size= batch_size, shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size= batch_size)

In [40]:
x, y= next(iter(train_dl))

In [41]:
print(x.shape, y.shape)

torch.Size([5, 12]) torch.Size([5])


In [42]:
x[:, 0]

tensor([15704, 14719,   532, 23975, 11606])

In [43]:
embs[0](x[:, 0])

tensor([[ 2.9480e-02,  1.1853e+00, -1.0595e+00,  2.9736e+00,  7.3951e-02,
         -4.1447e-01, -1.8417e-01, -1.3354e+00,  1.6215e-01, -2.0567e+00,
          1.1003e-01,  1.1382e+00,  7.2917e-01,  1.4582e-01, -1.1971e+00,
          3.4688e-01, -3.7979e-01, -4.6428e-01,  5.0880e-01, -8.5336e-01,
         -3.6553e-01, -3.4864e-01, -1.3076e+00, -2.1247e-01,  2.8278e-01,
         -7.0840e-01, -1.9774e+00,  2.6458e+00,  2.8724e-01, -1.2103e+00,
         -1.3571e+00,  5.4796e-01, -1.6534e+00,  1.3294e+00,  1.0390e+00,
         -1.3836e-02,  1.3538e+00, -9.8408e-01,  1.7309e-01, -3.1489e-01,
          8.8064e-01,  8.1214e-01,  3.6918e-01, -4.6785e-01,  3.6859e-01,
         -1.7587e-01, -7.8265e-01,  2.1834e+00,  2.2962e-01,  8.7300e-02],
        [ 5.4093e-01,  9.5017e-01, -2.7084e-01, -4.9587e-01,  1.8069e+00,
         -2.2971e-01,  1.4598e-01,  6.5746e-01,  1.0239e+00, -7.5929e-01,
          9.0655e-02, -1.5700e+00, -2.9649e+00, -2.4750e-02,  2.8532e-01,
         -7.5063e-01, -9.9414e-01,  7

In [44]:
y= y.unsqueeze(1)


In [45]:
out= model(x)
out

tensor([[ 0.6679],
        [-0.3905],
        [ 0.2090],
        [-1.1180],
        [ 1.1151]], grad_fn=<AddmmBackward0>)

In [46]:
out.shape

torch.Size([5, 1])

In [47]:
pred= (out> 0.0).float()
(pred== y).float().sum()

tensor(3.)

In [48]:
F.binary_cross_entropy_with_logits(out, y)

tensor(0.7194, dtype=torch.float64,
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [49]:
def get_optimizer(model, lr = 0.01, wd= 0.0):
  parameters = filter(lambda p: p.requires_grad, model.parameters())
  optim= torch.optim.Adam(parameters, lr= lr, weight_decay = wd)
  return optim

In [50]:
def train_model(model, optim, train_dl= train_dl, verbose= False):
  model.train()
  total= 0
  sum_loss= 0
  for x, y in train_dl:
    batch= y.shape[0]
    y= y.unsqueeze(1)
    out= model(x)
    loss = F.binary_cross_entropy_with_logits(out, y)
    optim.zero_grad()
    loss.backward()
    optim.step()
    total += batch
    sum_loss += loss.item() * batch
    if verbose:
      print(sum_loss/total)
  return sum_loss/total

In [51]:
def val_metrics(model, valid_dl):
  model.eval()
  total= 0
  sum_loss= 0
  correct = 0
  for x, y in valid_dl:
    batch = y.shape[0]
    y = y.unsqueeze(1)
    out= model(x)
    loss= F.binary_cross_entropy_with_logits(out, y)
    sum_loss += loss.item() * batch
    total += batch
    pred= (out >0).float()
    correct += (pred== y).float().sum().item()
  return sum_loss/total, correct/total


In [52]:
from datetime import datetime

def train_loop(model, epochs, lr = 0.01, wd= 0.0):
  optim = get_optimizer(model, lr= lr, wd= wd)
  for i in range(epochs):
    loss= train_model(model, optim, train_dl)
    val_loss, val_acc= val_metrics(model, valid_dl)
    print("train loss %.3f val loss %.3f and accuracy %.3f" %(
        loss, val_loss, val_acc
    ))

In [53]:
batch_size = 500
train_dl= DataLoader(train_ds, batch_size = batch_size, shuffle= True)
valid_dl= DataLoader(valid_ds, batch_size = batch_size)

In [54]:
model = CategoricalEmbeddingModel(emb_szs)

In [55]:
train_loop(model, epochs= 5, lr= 0.03, wd= 0.01)

train loss 0.288 val loss 0.201 and accuracy 0.969
train loss 0.172 val loss 0.091 and accuracy 0.983
train loss 0.114 val loss 0.073 and accuracy 0.992
train loss 0.079 val loss 0.068 and accuracy 0.989
train loss 0.058 val loss 0.045 and accuracy 0.997


## Model Evalutions with Precision, Recall, and F1 Score

In [56]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [57]:
def threshold_predictions(out, threshold = 0.5):
  return (out > threshold).float()

def evaluate_metrics(model, valid_dl, threshold =0.5):
  model.eval()
  all_predictions = []
  all_labels = []

  with torch.no_grad():
    for x, y in valid_dl:
      logits = model(x)
      predictions = threshold_predictions(logits, threshold)
      all_predictions.extend(predictions.cpu().numpy())
      all_labels.extend(y.cpu().numpy())
  all_predictions = np.array(all_predictions)
  all_labels= np.array(all_labels)

  binary_predictions= (all_predictions > threshold).astype(int)
  binary_labels = (all_labels > 0.5).astype(int)

  precision= precision_score(binary_labels, binary_predictions, zero_division= 1)
  recall= recall_score(binary_labels, binary_predictions)
  f1 = f1_score(binary_labels, binary_predictions)

  return precision, recall, f1



In [58]:
precision, recall, f1 = evaluate_metrics(model, valid_dl)
print(f"Precision: {precision: .4f}, Recall: {recall: .4f}, F1: {f1: .4f}")


Precision:  0.9991, Recall:  0.9876, F1:  0.9933


## Cross Validations

In [59]:
from sklearn.model_selection import StratifiedKFold

In [60]:
n_splits= 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 42)
downsample_ratio =1.0

precision_scores, recall_scores, f1_scores = [], [], []

for train_index, valid_index in skf.split(X, Y):
  X_train_t, X_valid_t = X.iloc[train_index], X.iloc[valid_index]
  y_train_t, y_valid_t = Y.iloc[train_index], Y.iloc[valid_index]

  class_0= train_data[train_data['target'] ==0]
  class_1= train_data[train_data['target']==1]

  class_0_downsampled = resample(class_0, replace= False, n_samples= int(len(class_1) * downsample_ratio), random_state=43)

  downsampled_data= pd.concat([class_0_downsampled, class_1])
  downsampled_data= downsampled_data.sample(frac=1, random_state = 42)

  X_train_downsampled = downsampled_data.drop('target', axis=1)
  y_train_downsampled = downsampled_data['target']

  model_t = CategoricalEmbeddingModel(emb_szs)
  batch_size = 500
  train_ds_t= GeoLocationDataset(X_train_downsampled, y_train_downsampled, emb_cols)
  valid_ds_t= GeoLocationDataset(X_valid_t, y_valid_t, emb_cols)

  train_dl= DataLoader(train_ds_t, batch_size = batch_size, shuffle= True)
  valid_dl= DataLoader(valid_ds_t, batch_size = batch_size)

  train_loop(model_t, epochs=5, lr=0.03, wd=0.01)

  precision_val, recall_val, f1_val = evaluate_metrics(model_t, valid_dl)
  precision_scores.append(precision_val)
  recall_scores.append(recall_val)
  f1_scores.append(f1_val)

# Calculate the mean and standard deviation of the metrics
mean_precision = np.mean(precision_scores)
std_precision = np.std(precision_scores)
mean_recall = np.mean(recall_scores)
std_recall = np.std(recall_scores)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

print(f"Mean Precision: {mean_precision:.4f} +/- {std_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f} +/- {std_recall:.4f}")
print(f"Mean F1 Score: {mean_f1:.4f} +/- {std_f1:.4f}")

train loss 0.321 val loss 0.226 and accuracy 0.939
train loss 0.189 val loss 0.091 and accuracy 0.982
train loss 0.109 val loss 0.078 and accuracy 0.990
train loss 0.078 val loss 0.058 and accuracy 0.992
train loss 0.057 val loss 0.047 and accuracy 0.995
train loss 0.305 val loss 0.195 and accuracy 0.965
train loss 0.180 val loss 0.096 and accuracy 0.984
train loss 0.117 val loss 0.075 and accuracy 0.989
train loss 0.076 val loss 0.054 and accuracy 0.995
train loss 0.053 val loss 0.036 and accuracy 0.997
train loss 0.271 val loss 0.182 and accuracy 0.969
train loss 0.169 val loss 0.091 and accuracy 0.987
train loss 0.109 val loss 0.072 and accuracy 0.992
train loss 0.080 val loss 0.053 and accuracy 0.995
train loss 0.055 val loss 0.044 and accuracy 0.996
train loss 0.300 val loss 0.177 and accuracy 0.965
train loss 0.181 val loss 0.107 and accuracy 0.986
train loss 0.113 val loss 0.064 and accuracy 0.991
train loss 0.077 val loss 0.057 and accuracy 0.993
train loss 0.057 val loss 0.037