In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Using a public URL for the Titanic dataset for reproducibility
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [2]:
# Drop columns that are not useful for this task
df = df.drop(['Name', 'Survived'], axis=1)

# Define categorical and numerical features
categorical_features = ['Sex', 'Pclass']
numerical_features = ['Age', 'Fare', 'Parents/Children Aboard', 'Siblings/Spouses Aboard']

# Create preprocessing pipelines for both numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor object using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

preprocessor

In [3]:
# Fit the preprocessor and transform the data
X = preprocessor.fit_transform(df)
X_tensor = torch.FloatTensor(X)
X_tensor

tensor([[-0.5294, -0.5036, -0.4750,  ...,  0.0000,  0.0000,  1.0000],
        [ 0.6043,  0.7834, -0.4750,  ...,  1.0000,  0.0000,  0.0000],
        [-0.2460, -0.4900, -0.4750,  ...,  0.0000,  0.0000,  1.0000],
        ...,
        [-1.5921, -0.1780,  2.0033,  ...,  0.0000,  0.0000,  1.0000],
        [-0.2460, -0.0463, -0.4750,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.1792, -0.4935, -0.4750,  ...,  0.0000,  0.0000,  1.0000]])

In [4]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_space):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, latent_space),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_space, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, input_dim),
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [5]:
# Initialize model, loss, and optimizer
input_dim = X.shape[1]
model = Autoencoder(input_dim, latent_space=8)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=9, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=64, out_features=8, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=128, out_features=9, bias=True)
    (5): Tanh()
  )
)

In [6]:
# Train the model
num_epochs = 100
batch_size = 32
for epoch in range(num_epochs):
    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]
        # Forward pass
        output = model(batch)
        loss = criterion(output, batch)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.1298
Epoch [20/100], Loss: 0.1237
Epoch [30/100], Loss: 0.1232
Epoch [40/100], Loss: 0.1230
Epoch [50/100], Loss: 0.1225
Epoch [60/100], Loss: 0.1224
Epoch [70/100], Loss: 0.1222
Epoch [80/100], Loss: 0.1226
Epoch [90/100], Loss: 0.1223
Epoch [100/100], Loss: 0.1226


In [7]:
# Get reconstructed data
with torch.no_grad():
    reconstructed_data_tensor = model(X_tensor)
reconstructed_data = reconstructed_data_tensor.numpy()
reconstructed_data

array([[-0.49555537, -0.51738155, -0.46044463, ..., -0.00507437,
        -0.0062264 ,  0.9951619 ],
       [ 0.55999035,  0.8088607 , -0.4251984 , ...,  0.99715763,
        -0.03242003, -0.02810952],
       [-0.17564349, -0.51931876, -0.48136795, ..., -0.01185838,
        -0.00750191,  0.9925494 ],
       ...,
       [-0.9979254 , -0.12428381,  0.99950486, ..., -0.0250023 ,
        -0.03085991,  0.99409604],
       [-0.25736806, -0.07409111, -0.4564349 , ...,  0.9828583 ,
        -0.00988364, -0.00486177],
       [ 0.20307891, -0.50322723, -0.46404886, ..., -0.00329283,
        -0.02214616,  0.9924158 ]], shape=(887, 9), dtype=float32)

In [8]:
# Correctly inverse transform the data
num_features_len = len(numerical_features)
reconstructed_num = reconstructed_data[:, :num_features_len]
reconstructed_cat = reconstructed_data[:, num_features_len:]

original_num = X[:, :num_features_len]
original_cat = X[:, num_features_len:]

reconstructed_num_inv = preprocessor.named_transformers_['num'].named_steps['scaler'].inverse_transform(reconstructed_num)
reconstructed_cat_inv = preprocessor.named_transformers_['cat'].named_steps['onehot'].inverse_transform(reconstructed_cat)

original_num_inv = preprocessor.named_transformers_['num'].named_steps['scaler'].inverse_transform(original_num)
original_cat_inv = preprocessor.named_transformers_['cat'].named_steps['onehot'].inverse_transform(original_cat)

reconstructed_data_inv = np.hstack((reconstructed_num_inv, reconstructed_cat_inv))
original_data_inv = np.hstack((original_num_inv, original_cat_inv))

# Create DataFrames for comparison
df_original = pd.DataFrame(original_data_inv, columns=numerical_features + categorical_features)
df_reconstructed = pd.DataFrame(reconstructed_data_inv, columns=numerical_features + categorical_features)

# Display a sample of original and reconstructed data
print("\nOriginal Data (sample):")
df_original.head()


Original Data (sample):


Unnamed: 0,Age,Fare,Parents/Children Aboard,Siblings/Spouses Aboard,Sex,Pclass
0,22.0,7.25,0.0,1.0,male,3
1,38.0,71.2833,0.0,1.0,female,1
2,26.0,7.925,0.0,0.0,female,3
3,35.0,53.1,0.0,1.0,female,1
4,35.0,8.05,0.0,0.0,male,3


In [9]:
print("\nReconstructed Data (sample):")
df_reconstructed.head()


Reconstructed Data (sample):


Unnamed: 0,Age,Fare,Parents/Children Aboard,Siblings/Spouses Aboard,Sex,Pclass
0,22.477201,6.563633,0.011731,1.015214,male,3
1,37.375114,72.549454,0.040175,1.054749,female,1
2,26.99242,6.467249,-0.005154,0.029031,female,3
3,34.822247,51.00182,0.034412,1.051863,female,1
4,35.349232,7.527668,0.011574,0.004345,male,3
