In [None]:
import pandas as pd


In [None]:
# Load the raw training and test CSV files from the mounted Google Drive folder.
train = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/Assignment 1/data/train.csv"
)
test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/Assignment 1/data/test.csv"
)

In [None]:
# Restrict both frames to the columns used for modelling.
# "SalePrice" is the target and is listed last; it only exists in the training frame.
selected_features = [
    "OverallQual", "OverallCond",
    "GrLivArea", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "GarageCars", "GarageArea",
    "YearBuilt", "YearRemodAdd",
    "FullBath", "HalfBath", "TotRmsAbvGrd",
    "KitchenQual", "Fireplaces", "LotArea", "Neighborhood",
    "SalePrice",
]

train = train.loc[:, selected_features]
# The test set gets every selected column except the target.
test = test.loc[:, [col for col in selected_features if col != "SalePrice"]]

In [None]:
# Display the column index of the `train` frame as the cell output.
train.columns

Index(['OverallQual', 'OverallCond', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'GarageCars', 'GarageArea', 'YearBuilt', 'YearRemodAdd',
       'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'KitchenQual', 'Fireplaces',
       'LotArea', 'Neighborhood', 'SalePrice'],
      dtype='object')

In [None]:
# One-hot encoding for categorical variables
categorical_columns = ["Neighborhood", "KitchenQual"]

# The training frame defines the feature space; drop_first removes one
# (baseline) level per categorical column.
train = pd.get_dummies(train, columns=categorical_columns, drop_first=True)

# Encode the test frame with ALL levels and then align it to the training
# feature columns. Encoding both frames independently with drop_first=True only
# produces matching columns when both contain exactly the same categories; with
# the alignment below, a level missing from test becomes an all-False column,
# and the baseline level (or a level unseen in training) is dropped.
test = pd.get_dummies(test, columns=categorical_columns)
test = test.reindex(columns=train.columns.drop("SalePrice"), fill_value=False)

# Check the new structure
print(test.head())

   OverallQual  OverallCond  GrLivArea  TotalBsmtSF  1stFlrSF  2ndFlrSF  \
0            5            6        896        882.0       896         0   
1            6            6       1329       1329.0      1329         0   
2            5            5       1629        928.0       928       701   
3            6            6       1604        926.0       926       678   
4            8            5       1280       1280.0      1280         0   

   GarageCars  GarageArea  YearBuilt  YearRemodAdd  ...  Neighborhood_SWISU  \
0         1.0       730.0       1961          1961  ...               False   
1         1.0       312.0       1958          1958  ...               False   
2         2.0       482.0       1997          1998  ...               False   
3         2.0       470.0       1998          1998  ...               False   
4         2.0       506.0       1992          1992  ...               False   

   Neighborhood_Sawyer  Neighborhood_SawyerW  Neighborhood_Somerst  \
0   

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = train["SalePrice"]
X = train.drop(columns=["SalePrice"])

# A test set already exists, so only a validation split (20%) is carved out
# of the training data; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every split.
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {test.shape}")

Training set: (1168, 42), Validation set: (292, 42), Test set: (1459, 42)


In [None]:
# StandardScaler was used here without ever being imported, which raises a
# NameError on a fresh kernel (Restart & Run All).
from sklearn.preprocessing import StandardScaler

# Initialize scalers (one for the features, one for the target)
X_scaler = StandardScaler()
y_scaler = StandardScaler()

# Fit on the training split only, then reuse the fitted statistics for the
# validation split so no information leaks from it.
X_train_scaled = X_scaler.fit_transform(X_train)
X_val_scaled = X_scaler.transform(X_val)

# StandardScaler passes NaNs through transform(), and a NaN feature makes the
# network output NaN for that row. Fill gaps in the test frame with the
# training-split medians before scaling.
# NOTE(review): the test frame prints TotalBsmtSF / GarageCars / GarageArea as
# floats, which suggests it contains missing values - confirm.
cols_with_gaps = test.columns[test.isna().any()]
if len(cols_with_gaps):
    test_imputed = test.fillna(X_train[cols_with_gaps].median())
else:
    test_imputed = test
X_test_scaled = X_scaler.transform(test_imputed)  # Apply same transformation to test set

# Scale target variable (SalePrice); the scaler expects a 2-D column vector.
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_val_scaled = y_scaler.transform(y_val.values.reshape(-1, 1))

# **Starting with PyTorch**

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Turn the scaled NumPy arrays into float32 PyTorch tensors.
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_scaled, X_val_scaled, X_test_scaled)
)

# Targets are kept as (n_samples, 1) column vectors.
y_train_tensor, y_val_tensor = (
    torch.tensor(target, dtype=torch.float32).view(-1, 1)
    for target in (y_train_scaled, y_val_scaled)
)

In [None]:
# Wrap the training tensors in a dataset and serve them in shuffled
# mini-batches of 32 samples.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

print(f"DataLoader ready: {len(train_loader)} batches in training set.")

DataLoader ready: 37 batches in training set.


In [None]:
import torch.nn as nn

class HousePriceMLP(nn.Module):
    """Fully connected regression network: input_dim -> 128 -> 64 -> 1.

    Dropout is applied after each hidden activation. It must not follow the
    output layer: there it would zero a fraction of the predictions during
    training and rescale the rest, corrupting the regression loss.

    Args:
        input_dim: Number of input features per sample.
        dropout: Drop probability used after each hidden layer (default 0.3).
    """

    def __init__(self, input_dim, dropout=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1),  # Output layer (regression task): no activation, no dropout
        )

    def forward(self, x):
        """Return one prediction per row of `x`, shaped (batch, 1)."""
        return self.model(x)

In [None]:
# Build the network with one input unit per feature column, then show its layers.
model = HousePriceMLP(input_dim=X_train.shape[1])
print(model)

HousePriceMLP(
  (model): Sequential(
    (0): Linear(in_features=42, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Dropout(p=0.3, inplace=False)
  )
)


In [None]:
import torch.optim as optim

# Loss, optimizer and learning-rate schedule.
# Mean squared error is used because this is a regression problem.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=5e-4)
# Halve the learning rate when the monitored (validation) loss has not
# improved for 5 consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=5,
)

In [None]:
import copy

# Early Stopping Setup
best_val_loss = float("inf")  # Track best validation loss
patience = 10  # Stop training if no improvement after this many epochs
counter = 0  # Track consecutive non-improving epochs

num_epochs = 100

# Snapshot of the best weights seen so far. Initialised up front so the final
# load_state_dict() is always valid, even if the validation loss never improves
# (e.g. it is NaN from the first epoch).
best_model = copy.deepcopy(model.state_dict())

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # Reset gradients
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)  # Compute average training loss

    # Evaluate on validation set
    model.eval()
    with torch.no_grad():
        val_predictions = model(X_val_tensor)
        val_loss = criterion(val_predictions, y_val_tensor)  # Compute validation loss

    # Check for improvement in validation loss
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        counter = 0  # Reset counter if validation loss improves
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place. Without a deep copy the "best"
        # snapshot would silently track the latest weights instead.
        best_model = copy.deepcopy(model.state_dict())
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break  # Stop training if no improvement for 'patience' epochs

    # Reduce learning rate if validation loss stops improving
    scheduler.step(val_loss.item())

    # Print progress every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {val_loss.item():.4f}")

# Load the best model before final evaluation
model.load_state_dict(best_model)
print("Training complete!")

Epoch [10/100], Loss: 0.2887, Val Loss: 0.2310
Early stopping at epoch 11
Training complete!


In [None]:
# Model evaluation
import numpy as np

# Run inference on the test features: evaluation mode, no gradient tracking.
model.eval()
with torch.no_grad():
    # Flatten the (n, 1) network output into a 1-D NumPy array.
    test_predictions = model(X_test_tensor).numpy().flatten()

# Show the first few predictions (still in the scaled target space).
print("Predictions on test set:", test_predictions[:10])  # Print some sample predictions


Predictions on test set: [-0.4731147  -0.26813105 -0.05259355  0.08103102  0.0981495  -0.06756599
 -0.14026311 -0.14293036  0.1444596  -0.55213743]


In [None]:
# Undo the target scaling so the predictions are expressed as house prices again.
# The scaler expects a 2-D column, hence the added trailing axis.
test_predictions_original = y_scaler.inverse_transform(test_predictions[:, None])

# Show the first few predictions on the original price scale.
print("Predictions on test set (original scale):", test_predictions_original[:10].ravel())


Predictions on test set (original scale): [144902.66 160733.64 177379.72 187699.61 189021.67 176223.39 170608.95
 170402.95 192598.23 138799.69]


In [None]:
from sklearn.metrics import mean_squared_error

# Recompute the validation predictions with the model as it is now (after the
# best weights were restored). The `val_predictions` left over from the
# training loop belong to the last epoch that ran, not to the restored model.
model.eval()
with torch.no_grad():
    val_predictions = model(X_val_tensor)

# Convert validation predictions and targets back to the original price scale
val_predictions_original = y_scaler.inverse_transform(val_predictions.numpy().reshape(-1, 1))
y_val_original = y_scaler.inverse_transform(y_val_tensor.numpy().reshape(-1, 1))

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_val_original, val_predictions_original))
print(f"Validation RMSE: ${rmse:.2f}")


Validation RMSE: $37430.20


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training
# sale prices, used as a reference scale for judging the validation RMSE.
y_train.describe()

Unnamed: 0,SalePrice
count,1168.0
mean,181441.541952
std,77263.583862
min,34900.0
25%,130000.0
50%,165000.0
75%,214925.0
max,745000.0
