In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim

In [None]:
!pip install pytorch-lightning wandb

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.2.0.post0-py3-none-any.whl (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.9/800.9 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.16.3-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m25.5 MB/s[0m eta [3

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
import wandb
from torch.utils.data import DataLoader, Dataset

In [None]:
# Define a PyTorch Dataset for your data
class CustomDataset(Dataset):
    """Map-style dataset built from a DataFrame holding features and one label column.

    Args:
        dataframe: DataFrame containing the feature columns and ``label_col``.
        label_col: Name of the column used as the regression target.
        scaler: Optional object exposing ``fit_transform`` / ``transform``
            (e.g. a scikit-learn scaler) applied to the feature matrix.
        fit_scaler: When True (default, the original behaviour) the scaler is
            fitted on this dataset's features. Pass False for validation/test
            datasets so an already-fitted scaler is only applied, not re-fitted
            on held-out data.
    """

    def __init__(self, dataframe, label_col, scaler=None, fit_scaler=True):
        self.features = dataframe.drop(label_col, axis=1).values
        self.labels = dataframe[label_col].values
        self.scaler = scaler

        if self.scaler is not None:
            if fit_scaler:
                self.features = self.scaler.fit_transform(self.features)
            else:
                # Re-fitting here would scale held-out data with its own statistics.
                self.features = self.scaler.transform(self.features)

    def __len__(self):
        """Return the number of samples (rows)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 tensors."""
        sample = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample, target

# Define a simple linear regression model
class RegressionModel(pl.LightningModule):
    """Single linear layer trained with MSE loss and the Adam optimizer.

    Args:
        input_size: Number of input features per sample.
        lr: Learning rate passed to Adam (defaults to the previous fixed 0.001).
    """

    def __init__(self, input_size, lr=0.001):
        super().__init__()
        # Store the constructor arguments in ``self.hparams`` so that loggers and
        # checkpoints record them; without this ``model.hparams`` is empty.
        self.save_hyperparameters()
        self.linear = nn.Linear(input_size, 1)
        # Per-epoch buffers used to compute R^2 over the whole validation set.
        self._val_predictions = []
        self._val_targets = []

    def forward(self, x):
        """Return the linear prediction; the last dimension has size 1."""
        return self.linear(x)

    def configure_optimizers(self):
        """Build the Adam optimizer over all model parameters."""
        return optim.Adam(self.parameters(), lr=self.hparams.lr)

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one training batch."""
        inputs, targets = batch
        predictions = self(inputs)
        # Flatten predictions so they are compared element-wise with the targets.
        loss = nn.functional.mse_loss(predictions.view(-1), targets)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation MSE loss and buffer outputs for the epoch-level R^2."""
        inputs, targets = batch
        predictions = self(inputs).view(-1)
        loss = nn.functional.mse_loss(predictions, targets)

        # R^2 is not an average of per-batch R^2 values (and is undefined for a
        # single-sample batch), so keep the raw outputs and score them once per epoch.
        self._val_predictions.append(predictions.detach().cpu())
        self._val_targets.append(targets.detach().cpu())

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        """Log ``val_r2`` computed over every validation sample seen this epoch."""
        if not self._val_targets:
            return

        y_true = torch.cat(self._val_targets).numpy()
        y_pred = torch.cat(self._val_predictions).numpy()
        self._val_targets.clear()
        self._val_predictions.clear()

        # r2_score is not well-defined with fewer than two samples.
        if len(y_true) >= 2:
            self.log('val_r2', r2_score(y_true, y_pred), prog_bar=True)

In [None]:
# Read the training data from CSV and drop every row that has a missing value
df = pd.read_csv('/content/HF1_train.csv')  # Path of the training dataset
df.dropna(axis=0, inplace=True)

# Convert categorical variables to numerical using Label Encoding.
# Each column gets its own encoder, kept in `label_encoders`, so the fitted
# mapping of every column stays available (re-fitting one shared encoder
# discards the mappings of the earlier columns).
categorical_cols = ['x1', 'x3', 'x5']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()  # after the loop `le` is the encoder fitted on 'x5', as before
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separate features (X) and target variable (y)
X = df.drop('y', axis=1)
y = df['y']
print('original',str(y))

# Scale every feature column to the [0, 1] range
# NOTE(review): the scalers are fitted on the full dataset before the
# train/validation split, so validation statistics leak into the scaling.
feature_scaler = MinMaxScaler()
X_normalized = feature_scaler.fit_transform(X)

# Scale the target to [0, 1]; the scaler needs a 2-D input, hence the reshape
target_scaler = MinMaxScaler()
y_normalized = target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten()

# Replace the original target variable with the normalized one
df['y'] = y_normalized

original 0       12500
1       16500
2       11000
3       16800
4       17300
        ...  
9163    13690
9164    14990
9165    27490
9166    18290
9167    17990
Name: y, Length: 9168, dtype: int64


In [None]:
# Hold out 20% of the normalized samples for validation (seeded split)
X_train, X_val, y_train, y_val = train_test_split(
    X_normalized,
    y_normalized,
    test_size=0.2,
    random_state=42,
)

# The split returns plain arrays: wrap them as pandas objects again so the
# features keep their column names and the target is a Series named 'y'
feature_names = X.columns
X_train = pd.DataFrame(X_train, columns=feature_names)
X_val = pd.DataFrame(X_val, columns=feature_names)
y_train = pd.Series(y_train, name='y')
y_val = pd.Series(y_val, name='y')

print('Y',y_train.head())
print('X',X_train.head())

# Glue features and target together column-wise, then build datasets and loaders
train_frame = pd.concat([X_train, y_train], axis=1)
val_frame = pd.concat([X_val, y_val], axis=1)

train_dataset = CustomDataset(train_frame, 'y')
val_dataset = CustomDataset(val_frame, 'y')

# Only the training loader shuffles its samples
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# The model takes one input per feature column
input_size = len(X_train.columns)
model = RegressionModel(input_size)

Y 0    0.143436
1    0.227338
2    0.454641
3    0.171124
4    0.052473
Name: y, dtype: float64
X          x1        x2   x3        x4   x5        x6        x7        x8
0  0.318182  0.941176  0.5  0.057560  1.0  0.250000  0.117751  0.288462
1  0.409091  0.941176  0.0  0.039459  0.0  0.250000  0.111834  0.384615
2  0.636364  1.000000  1.0  0.019726  1.0  0.250000  0.057988  0.557692
3  0.000000  1.000000  1.0  0.032881  1.0  0.250000  0.146746  0.288462
4  0.000000  0.588235  0.5  0.217137  1.0  0.215517  0.201183  0.269231


In [None]:
# Start one W&B run and hand it to the Lightning logger, so the logger and the
# global `wandb` API share the same run instead of initialising W&B twice.
wandb_run = wandb.init(project="LinearRegressionHF", config=model.hparams)
wandb_logger = pl.loggers.WandbLogger(project="LinearRegressionHF", experiment=wandb_run)

# Stop training once 'val_loss' has not improved for 3 consecutive validation checks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min')

# Set up the PyTorch Lightning Trainer
trainer = pl.Trainer(
    max_epochs=100,
    log_every_n_steps=1,
    logger=wandb_logger,
    accelerator='auto',  # use the GPU when one is available, otherwise fall back to CPU
    callbacks=[early_stopping]
)



VBox(children=(Label(value='0.002 MB of 0.013 MB uploaded\r'), FloatProgress(value=0.13263248847926268, max=1.…

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Run the training loop, validating on the held-out loader
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

# Persist only the learned parameters (the state dict), not the whole module
trained_weights = model.state_dict()
torch.save(trained_weights, 'regression_model.pth')

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loggers/wandb.py:390: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name   | Type   | Params
----------------------------------
0 | linear | Linear | 9     
----------------------------------
9         Trainable params
0         Non-trainable params
9         Total params
0.000     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]