Import Libraries

In [28]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim

Load Data

In [29]:
# Load the dataset
data = pd.read_csv("/workspaces/deep-learning-codespaces-Mshamas/data/data.csv")
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


Preprocessing

In [30]:
# Drop the 'Model' column
data = data.drop(['Model'], axis=1)
data = data.dropna()
data.head()

Unnamed: 0,Make,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [31]:
# Splitting the dataset into features and target variable
Y = data['MSRP']
X = data.drop(['MSRP'], axis=1)

In [32]:
# Scaling the target variable Y
scaler_Y = StandardScaler()
Y = scaler_Y.fit_transform(Y.values.reshape(-1, 1)).flatten()

In [33]:
# Assume 'X_original' is your original DataFrame before preprocessing
X_original = X.copy()  # Make a copy of the original DataFrame to preserve it

# Extract feature names before any transformation
feature_names_numerical = list(X_original.select_dtypes(include='number').columns)

In [34]:
# Define custom pipelines for numerical and categorical features
numerical_preprocessing = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler())
])

categorical_preprocessing = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Function for logarithmic transformation
def log_transform(x):
    return np.log(x + 1)

# Pipeline for log transformation
log_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(log_transform)),
    ('scaler', StandardScaler())
])

# Full preprocessing pipeline
preprocessing = ColumnTransformer([
        ("log", log_pipeline, ["Year", "Engine Cylinders", "Number of Doors", "highway MPG", "city mpg", "Popularity"]),
        ("cat", categorical_preprocessing, make_column_selector(dtype_include=object))],
    remainder='passthrough'
)

# Apply the preprocessing pipeline to the features
X_preprocessed = preprocessing.fit_transform(X_original)

# Convert the csr_matrix to a dense numpy array if necessary
X_preprocessed_dense = X_preprocessed.toarray() if hasattr(X_preprocessed, "toarray") else X_preprocessed

# Generate feature names for the transformed categorical features
feature_names_categorical = preprocessing.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out()

# Combine the numerical and new categorical feature names
# Note: Adjust 'feature_names_numerical' as necessary if you're transforming some of them (e.g., log transform)
feature_names = feature_names_numerical + list(feature_names_categorical)

# Create a DataFrame with these feature names
X = pd.DataFrame(X_preprocessed_dense, columns=feature_names)

Spliting Dataset

In [35]:
from sklearn.model_selection import train_test_split

# Assuming X_preprocessed and Y_scaled are your features and targets after preprocessing
X_train_temp, X_test, Y_train_temp, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# Split the temporary training data into actual training data and validation data
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_temp, Y_train_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

In [36]:
print(X_train.shape)

(4850, 160)


In [37]:
print(Y_train.shape)

(4850,)


FNN

In [43]:
# Set the correct dimensions based on preprocessing
input_size = 160  # Number of features after preprocessing
hidden_size = 128  
output_size = 1  # For regression tasks

# Define the neural network architecture
class FNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FNN, self).__init__()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.hidden(x)
        x = self.relu(x)
        x = self.output(x)
        return x

# Initialize the model, loss function, and optimizer
model = FNN(input_size, hidden_size, output_size)
loss_function = nn.MSELoss()  # Assuming a regression task; adjust as necessary
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [39]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Directly convert X pandas DataFrames to PyTorch tensors
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

# Since Y is already a numpy array and flattened, convert them directly without .to_numpy() or .view(-1, 1)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32).view(-1, 1)  # Use .view(-1, 1) if you want to ensure a 2D tensor
Y_val_tensor = torch.tensor(Y_val, dtype=torch.float32).view(-1, 1)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32).view(-1, 1)

# Wrap these tensors in TensorDataset
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, Y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Define the DataLoader for each dataset
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

Training + Validating model 

In [44]:
num_epochs = 20

# Training Loop
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    for batch_idx, (features, targets) in enumerate(train_loader):
        # Reset the gradients
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(features)
        
        # Compute the loss
        loss = loss_function(predictions, targets)
        running_loss += loss.item()
        
        # Backward pass and update weights
        loss.backward()
        optimizer.step()

    # Print average training loss per epoch
    print(f"Epoch {epoch+1}, Training Loss: {running_loss/len(train_loader):.4f}")

# Model Evaluation with Validation Set
model.eval()  # Set the model to evaluation mode
total_val_loss = 0.0
with torch.no_grad():
    for features, targets in val_loader:
        predictions = model(features)
        val_loss = loss_function(predictions, targets)
        total_val_loss += val_loss.item()

    # Print average validation loss
    print(f"Validation Loss: {total_val_loss/len(val_loader):.4f}")


Epoch 1, Training Loss: 2207.0504
Epoch 2, Training Loss: 1.3597
Epoch 3, Training Loss: 1.3336
Epoch 4, Training Loss: 1.3067
Epoch 5, Training Loss: 1.2870
Epoch 6, Training Loss: 1.2671
Epoch 7, Training Loss: 1.2527
Epoch 8, Training Loss: 1.2389
Epoch 9, Training Loss: 1.2281
Epoch 10, Training Loss: 1.2177
Epoch 11, Training Loss: 1.2107
Epoch 12, Training Loss: 1.2028
Epoch 13, Training Loss: 1.1958
Epoch 14, Training Loss: 1.1909
Epoch 15, Training Loss: 1.1872
Epoch 16, Training Loss: 1.1831
Epoch 17, Training Loss: 1.1785
Epoch 18, Training Loss: 1.1968
Epoch 19, Training Loss: 1.1742
Epoch 20, Training Loss: 1.1721
Validation Loss: 0.5957


Model evaluation on test set

In [45]:
import numpy as np
import torch

model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    total_loss = 0
    for features, targets in test_loader:
        predictions = model(features)
        # Ensure the predictions shape matches the targets shape
        if predictions.dim() > 1 and predictions.shape[1] == 1:
            predictions = predictions.squeeze(1)  # Remove singleton dimensions if predictions are in shape [n, 1]
        loss = loss_function(predictions, targets)
        total_loss += loss.item()

    # Calculate RMSE
    rmse = np.sqrt(total_loss / len(test_loader))
    print(f"Test RMSE: {rmse}")


Test RMSE: 0.951002808058975


Model Hypertuning

In [46]:
import torch.optim as optim
import numpy as np
import copy

def train_and_evaluate_model(model, train_loader, val_loader, optimizer, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()
        for features, targets in train_loader:
            optimizer.zero_grad()
            predictions = model(features)
            loss = loss_function(predictions, targets)
            loss.backward()
            optimizer.step()

    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for features, targets in val_loader:
            predictions = model(features)
            val_loss = loss_function(predictions, targets)
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    return avg_val_loss

# Hyperparameters to tune
learning_rates = [1e-5, 5e-5, 1e-4]
batch_sizes = [32, 64, 128]

best_val_loss = float('inf')
best_lr = None
best_batch_size = None
best_model = None

for lr in learning_rates:
    for batch_size in batch_sizes:
        # Prepare DataLoaders with the current batch size
        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
        
        # Initialize the model and optimizer with the current learning rate
        model = FNN(input_size, hidden_size, output_size)
        optimizer = optim.SGD(model.parameters(), lr=lr)
        
        # Train the model and evaluate on the validation set
        val_loss = train_and_evaluate_model(model, train_loader, val_loader, optimizer, num_epochs=20)
        
        # Update best hyperparameters if current combination is better
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_lr = lr
            best_batch_size = batch_size
            best_model = copy.deepcopy(model)

print(f"Best Learning Rate: {best_lr}, Best Batch Size: {best_batch_size}, Best Validation Loss: {best_val_loss}")

Best Learning Rate: 5e-05, Best Batch Size: 32, Best Validation Loss: 0.49869565008317723
