In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from ucimlrepo import fetch_ucirepo 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

In [67]:
# fetch dataset 
abalone = fetch_ucirepo(id=1) 
  
# data (as pandas dataframes) 
X = abalone.data.features 
y = abalone.data.targets 
  
# metadata 
print(abalone.metadata) 
# variable information 
print(abalone.variables) 

# Combine features and targets into one DataFrame
df = pd.concat([X, y], axis=1)
df

{'uci_id': 1, 'name': 'Abalone', 'repository_url': 'https://archive.ics.uci.edu/dataset/1/abalone', 'data_url': 'https://archive.ics.uci.edu/static/public/1/data.csv', 'abstract': 'Predict the age of abalone from physical measurements', 'area': 'Biology', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular'], 'num_instances': 4177, 'num_features': 8, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['Rings'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C55C7W', 'creators': ['Warwick Nash', 'Tracy Sellers', 'Simon Talbot', 'Andrew Cawthorn', 'Wes Ford'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- 

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [68]:
dataset = df[df.columns[0:7]]
dataset['Sex_1'] = np.where(dataset['Sex']=='M',1,-1)
dataset['Sex_2'] = np.where(dataset['Sex']=='F',1,-1)
dataset = dataset[dataset.columns[1:9]]
dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Sex_1'] = np.where(dataset['Sex']=='M',1,-1)


Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Sex_1,Sex_2
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,1,-1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,1,-1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,-1,1
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,1,-1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,-1,-1
...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,-1,1
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,1,-1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,1,-1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,-1,1


In [69]:
dataset.describe()

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Sex_1,Sex_2
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,-0.268374,-0.374192
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.96343,0.927462
min,0.075,0.055,0.0,0.002,0.001,0.0005,-1.0,-1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,-1.0,-1.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,-1.0,-1.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,1.0,1.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.0,1.0


In [70]:
dataset['Sex_1'] = pd.to_numeric(dataset['Sex_1'])
dataset['Sex_2'] = pd.to_numeric(dataset['Sex_2'])
# Normaliser les caractéristiques
#dataset[dataset.columns[0:7]] = (dataset[dataset.columns[0:7]] - dataset[dataset.columns[0:7]].mean()) / dataset[dataset.columns[0:7]].std()

train_X, test_X = train_test_split(
    dataset[dataset.columns[0:8]].values,
    test_size=0.2
)

# Conversion en tensors PyTorch
train_X = torch.tensor(train_X, dtype=torch.float32)
test_X = torch.tensor(test_X, dtype=torch.float32)

In [71]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        # Encodeur
        self.encoder = nn.Sequential(
            nn.Linear(8, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 2)
        )
        # Décodeur
        self.decoder = nn.Sequential(
            nn.Linear(2, 10),
            nn.ReLU(),
            nn.Linear(10, 50),
            nn.ReLU(),
            nn.Linear(50, 100),
            nn.ReLU(),
            nn.Linear(100, 8),  # Taille de l'entrée
            nn.Sigmoid()
        )

    def forward(self, X):
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)
        return decoded

In [72]:

autoencoder = Autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)

for epoch in range(5000):
    optimizer.zero_grad()
    output = autoencoder(train_X)
    loss = criterion(output, train_X)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, 5000, loss.item()))

# Test du modèle
with torch.no_grad():
    output = autoencoder(test_X)
    loss = criterion(output, test_X)
    print('Test Loss: {:.4f}'.format(loss.item()))

    reconstructed = output.numpy()
    original = test_X.numpy()

print('Mean Squared Error:', mean_squared_error(original, reconstructed))

Epoch [1/5000], Loss: 0.4765
Epoch [101/5000], Loss: 0.2890
Epoch [201/5000], Loss: 0.2612
Epoch [301/5000], Loss: 0.2608
Epoch [401/5000], Loss: 0.2606
Epoch [501/5000], Loss: 0.2606
Epoch [601/5000], Loss: 0.2605
Epoch [701/5000], Loss: 0.2605
Epoch [801/5000], Loss: 0.2604
Epoch [901/5000], Loss: 0.2604
Epoch [1001/5000], Loss: 0.2604
Epoch [1101/5000], Loss: 0.2604
Epoch [1201/5000], Loss: 0.2604
Epoch [1301/5000], Loss: 0.2603
Epoch [1401/5000], Loss: 0.2603
Epoch [1501/5000], Loss: 0.2603
Epoch [1601/5000], Loss: 0.2603
Epoch [1701/5000], Loss: 0.2603
Epoch [1801/5000], Loss: 0.2603
Epoch [1901/5000], Loss: 0.2603
Epoch [2001/5000], Loss: 0.2603
Epoch [2101/5000], Loss: 0.2603
Epoch [2201/5000], Loss: 0.2603
Epoch [2301/5000], Loss: 0.2603
Epoch [2401/5000], Loss: 0.2602
Epoch [2501/5000], Loss: 0.2602
Epoch [2601/5000], Loss: 0.2602
Epoch [2701/5000], Loss: 0.2602
Epoch [2801/5000], Loss: 0.2602
Epoch [2901/5000], Loss: 0.2602
Epoch [3001/5000], Loss: 0.2602
Epoch [3101/5000], L

Avec encodage du Sex en 0,1,2 on avant 0,69 en perte
Avec l'encodage à 2 colonne en 1,-1 on a une perte à 0.2627