## Imports & Settings

In [None]:
# Importing necessary functions from custom modules
from modules.data_preparation import data_to_image, data_to_heatmap_bw   # Function to convert data to images
from modules.model import model_res                  # Pre-defined model architecture
from modules.train import train_model                # Function to train the model
from modules.data_cleaning import clean_data         # Function to clean and preprocess data
from modules.basics import *                         # Other utility functions

# Turn cuDNN off entirely; the intent stated here is deterministic results.
# NOTE(review): `torch` is not imported explicitly in this cell; presumably it arrives via
# `from modules.basics import *` — confirm.
torch.backends.cudnn.enabled = False

# Selecting the device for training (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loading the autoreload extension to automatically reload modules
%load_ext autoreload

# Setting autoreload to automatically reload all modules
%autoreload 2

# Setting matplotlib to display plots inline in the Jupyter Notebook
%matplotlib inline

# Setting the figure format for inline plotting to 'retina' for better quality
%config InlineBackend.figure_format = 'retina'


In [None]:
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

## Data Preparation

In [None]:
# Load the raw water-potability table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/drinking_water_potability.csv')
# Show the last five rows as the cell output.
df.tail(5)

# Limpieza y normalización de datos

In [None]:
# Run the project's cleaning step on the raw frame and display the result.
# NOTE(review): the section heading also mentions normalisation — confirm what clean_data does.
df_cleaned = clean_data(df)
df_cleaned

In [None]:
# Bar chart of the number of rows per value of the 'Potability' column.
sns.countplot(x='Potability', data=df_cleaned)
plt.show()

## Balanceo de datos

In [None]:
"""
# Selecting features (independent variables) and the target variable (dependent variable)
features = df_cleaned.drop('Potability', axis=1)
labels = df_cleaned['Potability']

# Creating arrays for features and labels
features_space = features
labels_space = labels.values

# Selecting 200 random samples from the dataset
random_values = features_space.sample(n=100)
X_sample = random_values.values
y_sample = labels_space[random_values.index]

# Instantiating the RandomOverSampler with random_state=0
oversampler = RandomOverSampler(random_state=0)

# Performing random oversampling to balance the dataset
X_resampled, y_resampled = oversampler.fit_resample(X_sample, y_sample)
"""

In [None]:

# Import RandomOverSampler from the imbalanced-learn library
from imblearn.over_sampling import RandomOverSampler

# Select the features (independent variables) and the target (dependent variable)
features = df_cleaned.drop('Potability', axis=1)
labels = df_cleaned['Potability']

# Plain NumPy arrays for the features and the labels
features_space = features.values
labels_space = labels.values

# Instantiate the RandomOverSampler with random_state=0
oversampler = RandomOverSampler(random_state=0)

# Random oversampling over the complete dataset to balance the classes.
# NOTE(review): if X_resampled is split into train/validation afterwards, duplicated rows can
# end up on both sides of the split (leakage) — confirm how it is consumed.
X_resampled, y_resampled = oversampler.fit_resample(features_space, labels_space)


----------

# Dividir los datos en conjuntos de entrenamiento y prueba

In [None]:
# Split BEFORE balancing. Random oversampling duplicates rows, so splitting an already
# oversampled matrix lets copies of the same sample land in both the training and the
# validation set, which inflates validation accuracy.
# Values are rounded to 2 decimal places, as before.
X_rounded = np.round(features_space, 2)

# test_size=0.2 keeps 20% of the original rows for validation
# random_state=42 sets the random seed for reproducibility
# stratify keeps the original class proportions in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X_rounded, labels_space, test_size=0.2, random_state=42, stratify=labels_space
)

# Balance only the training portion; the validation set keeps the real class distribution.
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# Still defined in case another cell expects the rounded, fully resampled matrix.
X_resampled_rounded = np.round(X_resampled, 2)

# Printing the shapes of the training and validation sets
print('X_train shape:', X_train.shape)
print('X_val shape:', X_val.shape)


# Model Implementation

In [None]:
# Render every training row as an image with the project helper data_to_image
train_images = data_to_image(X_train)

# Same conversion for the validation rows
val_images = data_to_image(X_val)

In [None]:
"""
import os
import matplotlib.pyplot as plt

# Crear una carpeta para guardar las imágenes si no existe
if not os.path.exists('images_saved'):
    os.makedirs('images_saved')

# Iterar sobre el índice de las imágenes
for i in range(len(train_images)):
    # Mostrar y guardar la imagen actual
    plt.imshow(train_images[i][0, :, :])
    plt.savefig(f'images_saved/stml_{i}.png')
    plt.close()  # Cerrar la figura para evitar que se superpongan las imágenes

"""


In [None]:
# Report the shapes of the arrays produced by data_to_image
print(train_images.shape)
print(val_images.shape)
# Toggle the grid on the current axes, then draw channel 0 of training sample 12
plt.grid()
plt.imshow(train_images[12][0, :, :])

# Write the preview figure to a PNG in the working directory
plt.savefig('imagen_guardada.png')

----------

In [None]:
# Inputs become float tensors and labels become long (int64) tensors.

# Converting the training images to PyTorch tensors
X_train_I = torch.from_numpy(train_images).float()

# Converting the training labels to PyTorch tensors
y_train_I = torch.from_numpy(y_train).long()

# Converting the validation images to PyTorch tensors
X_val_I = torch.from_numpy(val_images).float()

# Converting the validation labels to PyTorch tensors
y_val_I = torch.from_numpy(y_val).long()

In [None]:
# Creating PyTorch datasets for training and validation
train_dataset = TensorDataset(X_train_I, y_train_I)
val_dataset = TensorDataset(X_val_I, y_val_I)

# Creating data loaders for training and validation sets
# The training loader shuffles and yields batches of 32.
# NOTE(review): the validation loader is built with DataLoader defaults (batch_size=1, no
# shuffling), so validation walks one sample at a time — confirm this is intended.
dataloaders = {'train': DataLoader(train_dataset, batch_size=32, shuffle=True),
               'val': DataLoader(val_dataset)}

# Storing the sizes of the training and validation datasets
dataset_sizes = {'train': len(X_train),
                 'val': len(X_val)}

## Modeling: By [AHN MINJAE](https://github.com/EmjayAhn/SuperTML-pytorch)
- Transfer Learning from Resnet
- Only the final fully connected layer was changed, to 3 outputs

In [None]:
# Moving the pre-defined model to the specified device (GPU if available, otherwise CPU)
model = model_res.to(device)

# Defining the loss function (cross-entropy loss)
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters, with the optimizer's default hyper-parameters
optimizer = optim.Adam(model.parameters())

In [None]:
# Train with the train_model imported from modules.train at the top of the notebook; the final
# positional argument is 20. NOTE(review): presumably the number of epochs — confirm.
best_model = train_model(model, dataloaders, dataset_sizes, criterion, optimizer, device, 20)

# Model Hdd

In [None]:
# Convert the tabular rows to grayscale heat maps with the project helper data_to_heatmap_bw
train_heatmaps_bw = data_to_heatmap_bw(X_train)
val_heatmaps_bw = data_to_heatmap_bw(X_val)

In [None]:
# Print the array shapes of the generated heat maps
print('Forma de train_heatmaps_bw:', train_heatmaps_bw.shape)
print('Forma de val_heatmaps_bw:', val_heatmaps_bw.shape)

In [None]:
# Heat maps become float tensors; labels become long (int64) tensors.

# Converting the training heat maps to PyTorch tensors
X_train_II = torch.from_numpy(train_heatmaps_bw).float()

# Converting the training labels to PyTorch tensors
y_train_II = torch.from_numpy(y_train).long()

# Converting the validation heat maps to PyTorch tensors
X_val_II = torch.from_numpy(val_heatmaps_bw).float()

# Converting the validation labels to PyTorch tensors
y_val_II = torch.from_numpy(y_val).long()

In [None]:
# Rebuild the heat-map tensors from the current NumPy arrays (same expressions as the
# previous cell, so this only matters if the arrays changed in between).
X_train_II = torch.from_numpy(train_heatmaps_bw).float()
X_val_II = torch.from_numpy(val_heatmaps_bw).float()

In [None]:
# Average over axis 3 to drop that axis. The names are rebound in place, so the step is
# guarded: once an array has lost its fourth axis, re-running the cell is a no-op instead of
# raising an axis-out-of-bounds error.
if train_heatmaps_bw.ndim > 3:
    train_heatmaps_bw = np.mean(train_heatmaps_bw, axis=3)
if val_heatmaps_bw.ndim > 3:
    val_heatmaps_bw = np.mean(val_heatmaps_bw, axis=3)

In [None]:
# Print the four dimensions of X_train_II in order.
# NOTE(review): the labels assume an (N, channels, height, width) layout, while the averaging
# over axis 3 elsewhere suggests the heat maps may be channels-last — confirm the real layout.
print('Tamaño de X_train_II - batch_size:', X_train_II.size(0))
print('Tamaño de X_train_II - canales:', X_train_II.size(1))
print('Tamaño de X_train_II - altura:', X_train_II.size(2))
print('Tamaño de X_train_II - ancho:', X_train_II.size(3))

## Model: CNN

**1.  Preparar los datos para PyTorch:** Convierte los mapas de calor a tensores de PyTorch y crea conjuntos de datos y cargadores de datos para el entrenamiento y la validación.

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert the heat maps to PyTorch tensors, inserting a new axis at position 1
X_train_tensor = torch.from_numpy(train_heatmaps_bw).unsqueeze(1).float()
X_val_tensor = torch.from_numpy(val_heatmaps_bw).unsqueeze(1).float()

# Build the datasets and data loaders (labels come from the earlier y_train_II / y_val_II)
train_dataset = TensorDataset(X_train_tensor, y_train_II)
val_dataset = TensorDataset(X_val_tensor, y_val_II)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


**2. Definir el modelo CNN:** Define la arquitectura de tu modelo CNN en PyTorch.

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Small two-stage convolutional classifier for single-channel images.

    Architecture: conv(1->16) + ReLU + 2x2 max-pool, conv(16->32) + ReLU + 2x2 max-pool,
    then two fully connected layers (flattened features -> 128 -> num_classes).

    Args:
        input_size: ``(height, width)`` of the input images. The default ``(480, 640)``
            reproduces the previously hard-coded flatten size of ``32 * 120 * 160``.
        num_classes: number of output logits (default 2: potable / not potable).

    The flatten size is derived from ``input_size`` instead of being hard-coded, and the
    forward pass flattens per sample, so an input of unexpected size fails with a shape
    error in ``fc1`` rather than being silently re-batched by ``view(-1, ...)``.
    """

    def __init__(self, input_size=(480, 640), num_classes=2):
        super(SimpleCNN, self).__init__()
        height, width = input_size
        # Convolution and pooling layers (3x3 convs with padding=1 keep the spatial size)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        # Two 2x2 poolings floor-divide each spatial dimension by 4 overall
        self.flat_features = 32 * (height // 4) * (width // 4)
        # Fully connected layers
        self.fc1 = nn.Linear(self.flat_features, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)`` for ``x``."""
        # Convolutions with ReLU activations, each followed by pooling
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension
        x = x.view(x.size(0), -1)
        # Fully connected head
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate the model (nothing in this cell moves it to `device`)
model = SimpleCNN()


**3. Entrenar el modelo:** Entrena el modelo utilizando los conjuntos de datos y cargadores de datos que has creado.

In [None]:
import time  # Importa el módulo time
import torch.optim as optim
import torch.nn as nn

# Assumes model, train_loader and val_loader are already defined by the previous cells

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20
best_val_accuracy = 0.0  # Tracks the best validation accuracy seen so far

for epoch in range(epochs):
    start_time = time.time()  # Start timing the epoch

    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Predicted class = index of the largest logit
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Mean of the per-batch losses; accuracy over all training samples seen
    train_loss /= len(train_loader)
    train_accuracy = correct_train / total_train

    # Model validation (no gradient tracking)
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_loss /= len(val_loader)
    val_accuracy = correct_val / total_val

    epoch_time = time.time() - start_time  # Wall-clock time spent on this epoch

    print(f'EPOCH {epoch+1}/{epochs}:')
    print('-' * 10)
    print(f'Train Loss: {train_loss:.4f} Acc: {train_accuracy:.4f}')
    print(f'Val Loss: {val_loss:.4f} Acc: {val_accuracy:.4f}')
    print(f'Epoch Time: {epoch_time:.2f} seconds\n')  # Report the elapsed time

    # Update the best validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy

print(f'BEST VALIDATION ACCURACY: {best_val_accuracy:.4f}')


## Model: ViT - En proceso (todavía no funciona)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from transformers import ViTFeatureExtractor, ViTForImageClassification

In [None]:
# Re-select the compute device (same rule as the settings cell: CUDA if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Preprocessing pipeline applied to each sample by the dataset below
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to (224, 224)
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # (x - 0.5) / 0.5 on each of 3 channels
])

In [None]:
class CustomDataset(Dataset):
    """Index-aligned (image, label) pairs with an optional per-image transform.

    Args:
        images: indexable collection of samples; its length is the dataset length.
        labels: indexable collection of targets, aligned with ``images``.
        transform: optional callable applied to the sample (not the label) on access.
    """

    def __init__(self, images, labels, transform=None):
        self.images, self.labels, self.transform = images, labels, transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        sample, target = self.images[idx], self.labels[idx]
        # No transform configured: hand back the stored sample untouched
        if not self.transform:
            return sample, target
        return self.transform(sample), target

In [None]:
# Wrap the train/validation arrays in CustomDataset with the transform defined above.
# NOTE(review): X_train / X_val are the arrays returned by train_test_split, not images —
# confirm that the Resize/ToTensor pipeline can accept them.
train_dataset = CustomDataset(X_train, y_train, transform=transform)
val_dataset = CustomDataset(X_val, y_val, transform=transform)

In [None]:
# Load the feature extractor and the pre-trained ViT model from the same checkpoint
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k')

In [None]:
# Number of passes over the data, and loaders with batches of 32 (only training is shuffled)
epochs = 5
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
# Bind the loss and the optimizer to the model that is current in THIS section. Reusing the
# `optimizer` created for the CNN would step the CNN's parameters, so the ViT weights would
# never be updated.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # The Hugging Face model returns an output object; the class scores are under 'logits'
        outputs = model(inputs)['logits']
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Model validation (no gradient tracking)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)['logits']
            # Predicted class = index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_accuracy = correct / total
    print(f'Epoch [{epoch+1}/{epochs}], Validation Accuracy: {val_accuracy:.4f}')

# **Model T1**


## Images GG - ViT

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import torch
import torchvision.transforms as transforms
from transformers import ViTFeatureExtractor, ViTForImageClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
import warnings

In [None]:
# Convert tabular rows to grayscale images stored with 3 identical channels
def gray_gang_gg(data):
    """Render each row of ``data`` as a 255x255x3 uint8 image made of gray cells.

    Feature ``i`` fills the 85x85 cell at grid column ``i % 3`` and grid row ``i // 3``
    with the gray level ``int(value * 255)``, written to all three channels. A 3x3 grid
    fits in the canvas, so features with index 9 or higher fall outside it and are not drawn.

    Gray levels are clamped to 0..255 so that inputs outside [0, 1] saturate to black or
    white instead of overflowing the uint8 range.

    Args:
        data: iterable of rows (e.g. a 2-D array), one row per sample.

    Returns:
        ``np.ndarray`` of stacked images, shape ``(n_rows, 255, 255, 3)`` for non-empty input.
    """
    cell_size = 85
    rgb_images = []
    for row in data:
        image = np.zeros((255, 255, 3), dtype=np.uint8)
        for i, value in enumerate(row):
            x = (i % 3) * cell_size
            y = (i // 3) * cell_size
            # Clamp before writing into the uint8 canvas
            grayscale_value = min(255, max(0, int(value * 255)))
            # A scalar broadcasts over the cell and over the three channels
            image[y:y+cell_size, x:x+cell_size] = grayscale_value
        rgb_images.append(image)
    return np.array(rgb_images)

In [None]:
# Build the gray-cell images for the training and validation rows
train_images_gg = gray_gang_gg(X_train)
val_images_gg = gray_gang_gg(X_val)

In [None]:
# Preview the second generated image (index 1)
image = train_images_gg[1]

# matplotlib needs a NumPy array
if isinstance(image, torch.Tensor):
    image = image.numpy()

# Channel-first (3, H, W) -> channel-last (H, W, 3) for imshow.
# The permutation must use axes 0..2; (1, 3, 0) is not valid for a 3-D array.
if image.shape[0] == 3:
    image = np.transpose(image, (1, 2, 0))

plt.imshow(image)
plt.title("Imagen número 2")
plt.axis('off')
plt.show()

Crear un Dataset personalizado para PyTorch

In [None]:
class PotabilityDataset(Dataset):
    """Pairs each image with its label and optionally transforms the image on access.

    Labels are stored exactly as given (no tensor conversion happens here).
    Note: a class with this name is defined more than once in this notebook; the most
    recently executed definition is the one in effect.

    Args:
        images: indexable collection of images; defines the dataset length.
        labels: indexable collection of labels aligned with ``images``.
        transform: optional callable applied to the image only.
    """

    def __init__(self, images, labels, transform=None):
        self.transform = transform
        self.labels = labels
        self.images = images

    def __len__(self):
        n_items = len(self.images)
        return n_items

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        return (self.transform(image) if self.transform else image), label

In [None]:
# Dataset transformations: array -> PIL image -> 224x224 resize -> tensor
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

In [None]:
# Datasets over the gray-cell images and loaders with batches of 16 (only training is shuffled)
train_dataset = PotabilityDataset(train_images_gg, y_train, transform=transform)
val_dataset = PotabilityDataset(val_images_gg, y_val, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

Definir y entrenar el modelo ViT

In [None]:
# Pre-trained ViT with a 2-label classification head, moved to the selected device
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=2)
model.to(device)

In [None]:
# Define the loss function and the optimizer (Adam over the current model's parameters)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [None]:
import time

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=None):
    """Train a model whose forward pass returns an object exposing ``.logits``.

    Each epoch runs one optimisation pass over ``train_loader`` and one evaluation pass
    over ``val_loader``, printing the mean training loss, the validation accuracy and the
    epoch duration.

    Note: this definition reuses the name of the ``train_model`` imported from
    ``modules.train`` at the top of the notebook and replaces it for every cell executed
    afterwards.

    Args:
        model: module called as ``model(images).logits``.
        train_loader: iterable of ``(images, labels)`` batches used for optimisation.
        val_loader: iterable of ``(images, labels)`` batches used for evaluation.
        criterion: loss callable taking ``(logits, labels)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.
        device: device the batches are moved to. When omitted, the device of the model's
            first parameter is used (CPU if the model has no parameters), so the function
            no longer depends on a notebook-level ``device`` variable.

    Returns:
        The best validation accuracy observed (0.0 if nothing was evaluated).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    best_val_accuracy = 0.0  # best validation accuracy seen so far
    for epoch in range(num_epochs):
        start_time = time.time()
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device, dtype=torch.long)
            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        print(f"Epoch {epoch+1}/{num_epochs}:")
        # max(..., 1) avoids a ZeroDivisionError on an empty loader
        print(f"Train Loss: {running_loss / max(num_batches, 1)}")

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device, dtype=torch.long)
                outputs = model(images).logits
                # Predicted class = index of the largest logit
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_accuracy = correct / total if total else 0.0
        print(f"Validation Accuracy: {val_accuracy}")
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy

        print(f"Epoch Time: {time.time() - start_time:.2f} seconds")
        print('-' * 10)

    print("Training completed.")
    print(f"BEST VALIDATION ACCURACY: {best_val_accuracy:.4f}")
    return best_val_accuracy

In [None]:
# Single-epoch run of the notebook-level train_model on the gray-cell image loaders
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=1)

## Images with PCA - ResNet18

### ResNet18

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image, ImageDraw
import numpy as np
from sklearn.decomposition import PCA


In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

from sklearn.decomposition import PCA
from PIL import Image, ImageDraw, ImageFont
import numpy as np

In [None]:
# PCA Function
def data_to_image_pca(data, pca=None):
    """Project rows onto 3 principal components and draw them as marks on an image.

    For each sample the three component scores are min-max scaled (per sample) to 0..255
    and drawn as small white squares at that x position on a fixed row (y = 150) of a
    black 255x255 RGB canvas.

    Args:
        data: 2-D array-like of samples (rows) by features (columns).
        pca: optional, already fitted PCA. When given, ``data`` is only transformed with
            it, so validation rows can be projected with the components learned on the
            training rows. When omitted, a new 3-component PCA is fitted on ``data``.

    Returns:
        ``np.ndarray`` with one ``(255, 255, 3)`` image per sample.
    """
    num_components = 3  # Number of principal components
    if pca is None:
        pca = PCA(n_components=num_components)
        data_pca = pca.fit_transform(data)
    else:
        data_pca = pca.transform(data)

    y = 150  # Vertical position of the marks; adjust as needed
    data_images = []
    for dat in data_pca:
        low = dat.min()
        span = dat.max() - low
        if span > 0:
            scaled_dat = ((dat - low) / span) * 255
        else:
            # All components equal: avoid 0/0 (NaN coordinates) and put every mark at x = 0
            scaled_dat = np.zeros_like(dat)
        image = Image.new("RGB", (255, 255))
        draw = ImageDraw.Draw(image)
        for x in scaled_dat:
            draw.rectangle([x-1, y-1, x+1, y+1], fill='white')
        data_images.append(np.array(image))

    return np.array(data_images)

In [None]:
# Build the PCA-based images for both splits.
# NOTE(review): each call runs its own PCA fit, so train and validation images come from
# separately fitted projections — confirm this is intended.
train_images = data_to_image_pca(X_train)
val_images = data_to_image_pca(X_val)

In [None]:
class PotabilityDataset(Dataset):
    """Image/label dataset whose labels are stored as a ``torch.long`` tensor.

    Note: a class with this name is defined more than once in this notebook; the most
    recently executed definition is the one in effect.

    Args:
        images: indexable collection of images; defines the dataset length.
        labels: sequence of integer class labels, converted once to a long tensor.
        transform: optional callable applied to the image only.
    """

    def __init__(self, images, labels, transform=None):
        self.transform = transform
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.images = images

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        picture, target = self.images[idx], self.labels[idx]
        if not self.transform:
            return picture, target
        return self.transform(picture), target

In [None]:
# Only convert each image to a tensor (no resizing in this section)
transform = transforms.Compose([
    transforms.ToTensor()
])

In [None]:
# Datasets over the PCA images and loaders with batches of 16 (only training is shuffled)
train_dataset = PotabilityDataset(train_images, y_train, transform=transform)
val_dataset = PotabilityDataset(val_images, y_val, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# Pre-trained ResNet18 with the final fully connected layer replaced by a 2-output layer
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, 2)  # Assuming binary classification
model.to(device)

In [None]:
# Transformers text model (DistilBERT).
# It gets its own name so it does not overwrite `model`, which holds the ResNet18 that this
# section trains on images. A text encoder bound to `model` would be handed to the optimizer
# and the image training loop instead.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
distilbert_model.to(device)

In [None]:
# Define the loss function and the optimizer (Adam over the parameters of the current `model`)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [None]:
def train_model_1(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=None):
    """Train a classifier and report validation accuracy after every epoch.

    Works with models that return the logits tensor directly and with models that return
    an output object exposing ``.logits``; the logits are unwrapped before the loss and
    the prediction step.

    Args:
        model: module called as ``model(images)``.
        train_loader: iterable of ``(images, labels)`` batches used for optimisation.
        val_loader: iterable of ``(images, labels)`` batches used for evaluation.
        criterion: loss callable taking ``(logits, labels)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.
        device: device the batches are moved to. When omitted, the device of the model's
            first parameter is used (CPU if the model has no parameters), so the function
            does not depend on a notebook-level ``device`` variable.

    Returns:
        The best validation accuracy observed (0.0 if nothing was evaluated).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    def _logits(outputs):
        # Output objects carry the scores under `.logits`; plain tensors pass through
        return getattr(outputs, 'logits', outputs)

    best_val_accuracy = 0.0
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = _logits(model(images))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        print(f"Epoch {epoch+1}/{num_epochs}:")
        # max(..., 1) avoids a ZeroDivisionError on an empty loader
        print(f"Train Loss: {running_loss / max(num_batches, 1)}")

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = _logits(model(images))
                # Predicted class = index of the largest logit
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_accuracy = correct / total if total else 0.0
        print(f"Validation Accuracy: {val_accuracy}")
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy

        print('-' * 10)

    print("Training completed.")
    print(f"BEST VALIDATION ACCURACY: {best_val_accuracy:.4f}")
    return best_val_accuracy

In [None]:
# Ten-epoch run on the PCA-image loaders with the current model, criterion and optimizer
train_model_1(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

### ViT

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image, ImageDraw
import numpy as np
from sklearn.decomposition import PCA


In [None]:
# Rebuild the PCA-based images for the ViT experiment (same calls as in the ResNet18 section)
train_images = data_to_image_pca(X_train)
val_images = data_to_image_pca(X_val)

In [None]:
class PotabilityDataset(Dataset):
    """Dataset of images with integer labels held as a ``torch.long`` tensor.

    Note: a class with this name is defined more than once in this notebook; the most
    recently executed definition is the one in effect.

    Args:
        images: indexable collection of images; its length is the dataset length.
        labels: sequence of integer class labels, converted once to a long tensor.
        transform: optional callable applied to each image when it is fetched.
    """

    def __init__(self, images, labels, transform=None):
        self.images = images
        self.transform = transform
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        count = len(self.images)
        return count

    def __getitem__(self, idx):
        item = self.images[idx]
        target = self.labels[idx]
        # Apply the transform to the image only; the label is returned as stored
        item = self.transform(item) if self.transform else item
        return item, target

# Define the transformations: array -> PIL image -> 224x224 resize -> tensor
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),  # ViT requires images of size 224x224
    transforms.ToTensor()
])

# Datasets over the PCA images and loaders with batches of 16 (only training is shuffled)
train_dataset = PotabilityDataset(train_images, y_train, transform=transform)
val_dataset = PotabilityDataset(val_images, y_val, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
# This checkpoint ships with a 1000-class classification head. Asking for num_labels=2
# changes the head's shape, so the mismatching head weights must be skipped (and freshly
# initialised); without ignore_mismatched_sizes=True loading fails with a size mismatch.
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=2,  # Assuming binary classification
    ignore_mismatched_sizes=True,
)
model.to(device)

In [None]:
# Cross-entropy loss and Adam (lr=2e-5) over the parameters of the current `model`
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [None]:
# Ten-epoch run on the 224x224 PCA-image loaders with the current model, criterion and optimizer
train_model_1(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)