In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import gc

In [2]:
# Main trial table; its "NCT Number" column is renamed to `nct_id` on load.
df = pd.read_csv('/kaggle/input/nest-competition/usecase_2_.csv').rename(
    columns={"NCT Number": "nct_id"}
)

# Pre-cleaned auxiliary tables from the same competition input directory,
# read in the order facilities -> eligibilities -> drop/withdrawals.
facilities_cleaned, eligibilities_cleaned, drop_withdrawals_cleaned = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nest-competition/facilities_cleaned.csv',
        '/kaggle/input/nest-competition/eligibilities_cleaned.csv',
        '/kaggle/input/nest-competition/drop_withdrawals_cleaned.csv',
    )
]

In [10]:
# Cast every feature matrix to float32. Each name is rebound one statement at
# a time, so the previous object can be released as soon as its float32
# replacement exists.
X_train_num = X_train_num.astype('float32')
X_test_num = X_test_num.astype('float32')
X_train_text = X_train_text.astype('float32')
X_test_text = X_test_text.astype('float32')

# Drop the intermediate text-feature objects; these globals are not referenced
# again in the cells visible here.
del text_features, all_text_features

In [11]:
# Explicit garbage-collection pass after the float32 casts and the `del` in the
# previous cell; the value shown as cell output is gc.collect()'s return value.
gc.collect()

0

In [12]:
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [13]:
class TextEncoder(nn.Module):
    """MLP that maps a text-feature vector to a `latent_dim`-sized code.

    Architecture: two hidden blocks (Linear -> LayerNorm -> ReLU -> Dropout)
    of width 1024 and 512, followed by a plain linear projection to
    `latent_dim`.

    Args:
        input_dim: Size of the incoming text-feature vector.
        latent_dim: Size of the produced latent code.
        dropout: Dropout probability used after each hidden block.
    """

    def __init__(self, input_dim, latent_dim, dropout=0.2):
        super().__init__()
        widths = (input_dim, 1024, 512)
        layers = []
        # Layers are appended in the same order as a hand-written Sequential,
        # so module indices (and therefore state_dict keys) run 0..8.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(widths[-1], latent_dim))
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the latent code for a batch of text-feature vectors."""
        latent = self.encoder(x)
        return latent

class TextDecoder(nn.Module):
    """MLP that maps a latent code back to a text-feature-sized vector.

    Architecture: two hidden blocks (Linear -> LayerNorm -> ReLU -> Dropout)
    of width 512 and 1024, followed by a plain linear projection to
    `output_dim`.

    Args:
        latent_dim: Size of the incoming latent code.
        output_dim: Size of the reconstructed vector.
        dropout: Dropout probability used after each hidden block.
    """

    def __init__(self, latent_dim, output_dim, dropout=0.2):
        super().__init__()
        widths = (latent_dim, 512, 1024)
        layers = []
        # Same construction order as a hand-written Sequential, so module
        # indices (and state_dict keys) run 0..8.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(widths[-1], output_dim))
        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the reconstruction for a batch of latent codes."""
        reconstruction = self.decoder(x)
        return reconstruction

class EnrollmentPredictor(nn.Module):
    """Regression head over the latent text code joined with numeric features.

    The two inputs are concatenated along the feature axis and passed through
    two hidden blocks (Linear -> LayerNorm -> ReLU -> Dropout) of width 256
    and 128, then a linear layer with a single output.

    Args:
        latent_dim: Width of the latent text code.
        num_features: Number of numeric feature columns.
        dropout: Dropout probability used after each hidden block.
    """

    def __init__(self, latent_dim, num_features, dropout=0.2):
        super().__init__()
        widths = (latent_dim + num_features, 256, 128)
        layers = []
        # Same construction order as a hand-written Sequential, so module
        # indices (and state_dict keys) run 0..8.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(widths[-1], 1))
        self.predictor = nn.Sequential(*layers)

    def forward(self, latent, num_features):
        """Return one prediction per row, shaped (batch, 1)."""
        joined = torch.cat((latent, num_features), dim=1)
        return self.predictor(joined)

class EnrollmentModel(nn.Module):
    """Text autoencoder with a regression head attached to its latent code.

    Args:
        text_dim: Width of the text-feature vectors (encoder input and
            decoder output).
        num_features: Number of numeric feature columns fed to the predictor.
        latent_dim: Width of the latent code shared by decoder and predictor.
    """

    def __init__(self, text_dim, num_features, latent_dim=128):
        super().__init__()
        # Sub-modules are created in encoder -> decoder -> predictor order.
        self.text_encoder = TextEncoder(input_dim=text_dim, latent_dim=latent_dim)
        self.text_decoder = TextDecoder(latent_dim=latent_dim, output_dim=text_dim)
        self.predictor = EnrollmentPredictor(latent_dim=latent_dim, num_features=num_features)

    def forward(self, text_features, num_features):
        """Return `(prediction, reconstructed_text, latent_code)` for a batch."""
        code = self.text_encoder(text_features)
        # The decoder runs before the predictor; both contain dropout layers,
        # so this order determines which random masks each one draws.
        rebuilt_text = self.text_decoder(code)
        enrollment = self.predictor(code, num_features)
        return enrollment, rebuilt_text, code

# Training utilities
class EnrollmentDataset(Dataset):
    """In-memory dataset yielding `(text_features, num_features, target)` rows.

    All three inputs are converted to float32 tensors once, at construction.

    Args:
        text_features: Text feature matrix. May be a SciPy sparse matrix
            (densified via `.toarray()`) or any dense array-like.
        num_features: Numeric feature matrix (array-like).
        targets: Regression targets (array-like, one per row).
    """

    def __init__(self, text_features, num_features, targets):
        self.text_features = self._to_float_tensor(text_features)
        self.num_features = self._to_float_tensor(num_features)
        self.targets = self._to_float_tensor(targets)

    @staticmethod
    def _to_float_tensor(values):
        """Convert a sparse matrix or dense array-like into a float32 tensor."""
        # Only sparse matrices need densifying; calling .toarray() on a dense
        # ndarray would raise AttributeError.
        if hasattr(values, 'toarray'):
            values = values.toarray()
        # Going through NumPy first lets pandas objects (DataFrame, or a
        # Series with a shuffled index) be converted positionally.
        return torch.FloatTensor(np.asarray(values, dtype=np.float32))

    def __len__(self):
        """Number of rows, defined by the number of targets."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the `(text, numeric, target)` tensors for row `idx`."""
        return (self.text_features[idx], self.num_features[idx], self.targets[idx])

def train_epoch(model, train_loader, optimizer, device, alpha=0.3):
    """Run one optimisation pass over `train_loader`.

    Each batch is optimised on a weighted sum of two MSE terms:
    `alpha * reconstruction + (1 - alpha) * prediction`.

    Args:
        model: Module returning `(prediction, reconstruction, latent)`.
        train_loader: Iterable of `(text, numeric, target)` batches; must
            support `len()`.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches are moved to.
        alpha: Weight of the reconstruction term.

    Returns:
        Sum of the per-batch losses divided by `len(train_loader)`.
    """
    model.train()
    batch_losses = []

    for texts, nums, targets in train_loader:
        texts = texts.to(device)
        nums = nums.to(device)
        # Targets arrive as (batch,); add a trailing axis to match the
        # (batch, 1) predictions.
        targets = targets.to(device).unsqueeze(1)

        optimizer.zero_grad()
        pred, reconstructed, _latent = model(texts, nums)

        reconstruction_term = F.mse_loss(reconstructed, texts)
        prediction_term = F.mse_loss(pred, targets)
        batch_loss = alpha * reconstruction_term + (1 - alpha) * prediction_term

        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_loader)

def validate(model, val_loader, device):
    """Score `model` on `val_loader` without gradient tracking.

    Args:
        model: Module returning `(prediction, reconstruction, latent)`.
        val_loader: Iterable of `(text, numeric, target)` batches.
        device: Device the feature batches are moved to.

    Returns:
        Tuple `(rmse, r2)` computed over all rows of the loader.
    """
    model.eval()
    pred_chunks = []
    target_chunks = []

    with torch.no_grad():
        for text_batch, num_batch, target_batch in val_loader:
            batch_pred, _recon, _latent = model(text_batch.to(device), num_batch.to(device))
            pred_chunks.append(batch_pred.cpu().numpy())
            # Targets are never moved to the device, so they convert directly.
            target_chunks.append(target_batch.numpy())

    predictions = np.concatenate(pred_chunks)
    targets = np.concatenate(target_chunks)

    rmse = np.sqrt(mean_squared_error(targets, predictions))
    r2 = r2_score(targets, predictions)
    return rmse, r2

# Training setup
# Seed torch's global RNG before any weights are initialised or batches are
# shuffled, so a Restart & Run All produces the same initial weights and batch
# order. (Some GPU kernels may still be non-deterministic.)
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 64
num_epochs = 30
latent_dim = 256  # width of the autoencoder bottleneck reused as text features later

# Create datasets and dataloaders
train_dataset = EnrollmentDataset(X_train_text, X_train_num, y_train)
test_dataset = EnrollmentDataset(X_test_text, X_test_num, y_test)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize model
model = EnrollmentModel(X_train_text.shape[1], X_train_num.shape[1], latent_dim).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
# Halve the learning rate when the monitored metric has not improved for 5 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

In [14]:
# Training loop
# NOTE(review): `test_loader` drives both the LR scheduler and checkpoint
# selection, so the reported "Val" metrics are optimistic estimates; a separate
# validation split would avoid that.
CHECKPOINT_PATH = 'best_model.pt'
best_rmse = float('inf')
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_rmse, val_r2 = validate(model, test_loader, device)

    scheduler.step(val_rmse)

    # Keep only the weights of the best-scoring epoch on disk.
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        torch.save(model.state_dict(), CHECKPOINT_PATH)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val RMSE: {val_rmse:.4f}, Val R2: {val_r2:.4f}')

# Restore the best checkpoint so later cells use the best weights rather than
# whatever the final epoch produced. Skipped if no epoch ever improved on inf
# (e.g. num_epochs == 0 or NaN metrics), because no checkpoint exists then.
if best_rmse < float('inf'):
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
    print(f'Restored best checkpoint (Val RMSE: {best_rmse:.4f})')

Epoch 1/30:
Train Loss: 174.6086
Val RMSE: 14.2235, Val R2: 0.2452
Epoch 2/30:
Train Loss: 122.2501
Val RMSE: 13.1322, Val R2: 0.3566
Epoch 3/30:
Train Loss: 115.2578
Val RMSE: 12.9615, Val R2: 0.3732
Epoch 4/30:
Train Loss: 111.8931
Val RMSE: 12.8940, Val R2: 0.3797
Epoch 5/30:
Train Loss: 107.7375
Val RMSE: 12.7003, Val R2: 0.3982
Epoch 6/30:
Train Loss: 105.5436
Val RMSE: 12.6941, Val R2: 0.3988
Epoch 7/30:
Train Loss: 102.1707
Val RMSE: 12.9951, Val R2: 0.3699
Epoch 8/30:
Train Loss: 99.4927
Val RMSE: 12.6687, Val R2: 0.4012
Epoch 9/30:
Train Loss: 96.5164
Val RMSE: 12.9885, Val R2: 0.3706
Epoch 10/30:
Train Loss: 92.8839
Val RMSE: 12.7272, Val R2: 0.3957
Epoch 11/30:
Train Loss: 90.8528
Val RMSE: 12.6369, Val R2: 0.4042
Epoch 12/30:
Train Loss: 87.0073
Val RMSE: 12.6129, Val R2: 0.4065
Epoch 13/30:
Train Loss: 83.4892
Val RMSE: 12.9036, Val R2: 0.3788
Epoch 14/30:
Train Loss: 79.8507
Val RMSE: 12.7793, Val R2: 0.3907
Epoch 15/30:
Train Loss: 77.0928
Val RMSE: 13.3956, Val R2: 0.33

In [15]:
# Encode both splits with the text encoder, using whatever weights `model`
# currently holds. eval() disables dropout and no_grad() skips autograd
# bookkeeping; each split is pushed through the encoder in a single call.
model.eval()
with torch.no_grad():
    reduced_train_text, reduced_test_text = (
        model.text_encoder(split.text_features.to(device)).cpu().numpy()
        for split in (train_dataset, test_dataset)
    )

In [16]:
# Put the model in inference mode; as the cell's last expression, the returned
# module is what renders the architecture summary shown as the cell output.
model.eval()

EnrollmentModel(
  (text_encoder): TextEncoder(
    (encoder): Sequential(
      (0): Linear(in_features=10180, out_features=1024, bias=True)
      (1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (2): ReLU()
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=1024, out_features=512, bias=True)
      (5): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (6): ReLU()
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=512, out_features=256, bias=True)
    )
  )
  (text_decoder): TextDecoder(
    (decoder): Sequential(
      (0): Linear(in_features=256, out_features=512, bias=True)
      (1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (2): ReLU()
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=512, out_features=1024, bias=True)
      (5): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (6): ReLU()
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=1024

In [17]:
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, adjusted_rand_score
import joblib
import cupy as cp  # For GPU support
import pandas as pd
import numpy as np

In [19]:
# Sanity check: latent text features and numeric features must have the same
# number of rows before they are stacked side by side.
print(f"{reduced_train_text.shape} {X_train_num.shape}")

(55168, 256) (55168, 48)


In [21]:
def adjusted_r2_score(r2, n_samples, n_features):
    """Return R2 adjusted for the number of features.

    Computes `1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)`.

    Args:
        r2: Plain coefficient of determination.
        n_samples: Number of rows the score was computed on.
        n_features: Number of feature columns used by the model.
    """
    unexplained = 1 - r2
    residual_dof = n_samples - n_features - 1
    return 1 - unexplained * (n_samples - 1) / residual_dof

# Reset both names before rebuilding (presumably so that, on a re-run of this
# cell, the previous matrices can be freed first). The original spelled the
# first name `X_trian_combined`, which never reset the train matrix and only
# created a stray global.
X_train_combined = None
X_test_combined = None

# Column layout: autoencoder latent text features first, numeric features after.
X_train_combined = hstack([csr_matrix(reduced_train_text), csr_matrix(X_train_num)])
X_test_combined = hstack([csr_matrix(reduced_test_text), csr_matrix(X_test_num)])

In [22]:
# Sanity check: both splits must end up with the same number of columns.
print(f"{X_train_combined.shape} {X_test_combined.shape}")

(55168, 304) (13792, 304)


In [24]:
# Store both combined matrices as float32.
X_train_combined, X_test_combined = (
    X_train_combined.astype('float32'),
    X_test_combined.astype('float32'),
)

In [25]:
# import pickle

# # Save the arrays using pickle
# with open("X_train_combined.pkl", "wb") as f:
#     pickle.dump(X_train_combined, f)

# with open("X_test_combined.pkl", "wb") as f:
#     pickle.dump(X_test_combined, f)

# with open("y_train.pkl", "wb") as f:
#     pickle.dump(y_train, f)

# with open("y_test.pkl", "wb") as f:
#     pickle.dump(y_test, f)

# print("Data saved using pickle.")

Data saved using pickle.


In [None]:
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, adjusted_rand_score
import joblib
import pandas as pd
import numpy as np

def adjusted_r2_score(r2, n_samples, n_features):
    """Return R2 adjusted for the number of features.

    Computes `1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)`.

    Args:
        r2: Plain coefficient of determination.
        n_samples: Number of rows the score was computed on.
        n_features: Number of feature columns used by the model.
    """
    unexplained = 1 - r2
    residual_dof = n_samples - n_features - 1
    return 1 - unexplained * (n_samples - 1) / residual_dof

def _enable_xgb_gpu(model):
    """Switch an XGBRegressor to GPU training, using the spelling its XGBoost version expects."""
    # Local import: the file only imports the XGBRegressor class, not the package.
    import xgboost

    major_version = int(xgboost.__version__.split('.')[0])
    if major_version >= 2:
        # XGBoost 2.0 deprecated tree_method='gpu_hist'; the replacement is the
        # 'hist' tree method combined with device='cuda'.
        model.set_params(tree_method='hist', device='cuda')
    else:
        model.set_params(tree_method='gpu_hist')


def compute_scores(model, X_train, X_test, y_train, y_test, feature_set, use_gpu=True):
    """Fit `model` and report R2, adjusted R2 and RMSE on both splits.

    Args:
        model: Estimator exposing `fit` and `predict`. It is fitted in place,
            and an XGBRegressor additionally has its parameters changed when
            `use_gpu` is true.
        X_train, X_test: Feature matrices; anything with a `toarray` method
            (e.g. SciPy sparse) is densified first.
        y_train, y_test: Targets for the two splits.
        feature_set: Label stored under "Feature Set" in the result.
        use_gpu: Request GPU training for XGBRegressor models. GPU
            availability is not checked here.

    Returns:
        Dict with keys "Feature Set", "Train R2", "Test R2", "Train Adj R2",
        "Test Adj R2", "Train RMSE" and "Test RMSE".
    """
    if use_gpu and isinstance(model, XGBRegressor):
        _enable_xgb_gpu(model)

    # Convert to dense array if sparse
    X_train = X_train.toarray() if hasattr(X_train, 'toarray') else X_train
    X_test = X_test.toarray() if hasattr(X_test, 'toarray') else X_test

    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate metrics
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Adjusted R2 uses each split's own row count but the shared feature count.
    n_train_samples, n_features = X_train.shape
    n_test_samples = X_test.shape[0]

    train_adj_r2 = adjusted_r2_score(train_r2, n_train_samples, n_features)
    test_adj_r2 = adjusted_r2_score(test_r2, n_test_samples, n_features)

    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    return {
        "Feature Set": feature_set,
        "Train R2": train_r2,
        "Test R2": test_r2,
        "Train Adj R2": train_adj_r2,
        "Test Adj R2": test_adj_r2,
        "Train RMSE": train_rmse,
        "Test RMSE": test_rmse
    }

# Model parameters

xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1  # Use all CPU cores
}

xgb_model = XGBRegressor(**xgb_params)

# Placeholder for results
results = []

# 3. Combined Features
print("Training models with combined features...")

# compute_scores fits xgb_model in place, so the same object holds the trained
# booster afterwards.
results.append(compute_scores(xgb_model, X_train_combined, X_test_combined,
                              y_train, y_test, "Combined"))

# Convert results to a DataFrame
results_df = pd.DataFrame(results,
                          index=["XGB - Combined"])

# Save the results
results_df.to_csv('model_comparison_results.csv')
print("\nModel Comparison Results:")
print(results_df)

# Save the best model (assuming XGBoost with combined features performs best)
best_model = xgb_model
joblib.dump(best_model, 'best_model_ml.joblib')

# Feature importance for XGBoost (for combined features)
if isinstance(best_model, XGBRegressor):
    # One constant drives both the heading and the number of rows shown; the
    # heading previously said "Top 10" while 20 rows were printed.
    top_n = 20
    # Features are identified by column position in X_train_combined.
    feature_importance = pd.DataFrame({
        'feature': range(X_train_combined.shape[1]),
        'importance': best_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    print(f"\nTop {top_n} Most Important Features:")
    print(feature_importance.head(top_n))