In [None]:
!nvidia-smi

In [None]:
#%pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12
#%pip install cucim-cu12 cupy-cuda12x

In [None]:
%pip install fastparquet

In [None]:
#%load_ext cudf.pandas

# To desable GPU usage
#import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

#from cucim.skimage.exposure import rescale_intensity
import tensorflow as tf
#import cupy as cp
#import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split
import gc

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # memory limit 16GB (16 * 1024 MB = 16384 MB) 
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=16384)] 
        )
        print("Set GPU memory limit to 16GB.")
    except RuntimeError as e:
        print("Error setting memory limit:", e)
else:
    print("No GPUs available.")

#print("Is torch using cuda? ",torch.cuda.is_available())
print("Is tensorflow using cuda? ",tf.test.is_built_with_cuda())
print("Is pandas using cuda? ",pd)


In [None]:
name_mapping = [
    "box",
    "circularTorus",
    "cone",
    "coneOffset",
    "cylinder",
    "cylinderSlope",
    "dish",
    "mesh",
    "pyramid",
    "rectangularTorus",
    "sphere"
]

In [None]:
def sort_by_number(texts:list[str]):
    def key(text:str):
        text = re.sub(r'.*photos_', '', text)
        text = re.sub(r'\.csv', '', text)
        text = re.sub(r'\D', '', text)
        return int(text)
    return sorted(texts, key=key)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

base_path = '/home/workspace/geometry-classifier/data/'
parquet_files = glob.glob(base_path + 'photos_v3_parquet/*.parquet')

def data_generator():
    counter = 0
    encoder = LabelEncoder()
    
    for file in parquet_files:
        df = pd.read_parquet(file)
        
        df = df.drop(columns=['id'], axis=1)
        
        X_train, X_aux, y_train, y_aux = train_test_split(df.drop(columns=["name"]), df['name'], test_size=0.4, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_aux, y_aux, test_size=0.5, random_state=42)
        
        if len(X_train) != len(y_train) or len(X_val) != len(y_val):
            raise ValueError("Mismatch in number of samples between features and labels")
        
        if counter % 5 == 0:
            training_file = f"training_{counter // 5}.parquet"
            training_df = pd.DataFrame(X_train)
            training_df['name'] = y_train
            training_df.to_parquet(base_path + f"training/{training_file}")

        yield X_train, y_train, X_val, y_val
        
        counter += 1


In [66]:
base_path = '/home/workspace/geometry-classifier/data/'

def parse_tfrecord(example_proto):
    """Parse a single TFRecord example."""
    feature_description = {
        'name': tf.io.FixedLenFeature([], tf.string),
        'image': tf.io.FixedLenFeature([], tf.string)
    }
    parsed_example = tf.io.parse_single_example(example_proto, feature_description)
    name = tf.io.decode_raw(parsed_example['name'], tf.uint8)
    image = tf.io.decode_raw(parsed_example['image'], tf.uint8)
    image = tf.reshape(image, (224, 224, 1))  # Reshape image data to 224x224x3
    class_label = tf.cast(name[0], tf.int32)  # Use the first byte as a class label
    return image, class_label

def load_tfrecord(tfrecord_files):
    """Load TFRecords and prepare a dataset."""
    raw_dataset = tf.data.TFRecordDataset(tfrecord_files, compression_type="GZIP")
    parsed_dataset = raw_dataset.map(parse_tfrecord)
    return parsed_dataset

# Directory with TFRecords
tfrecord_files = glob.glob(f"{base_path}tfrecord/*.tfrecord.gz")
dataset = load_tfrecord(tfrecord_files)
dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [64]:
# 224 x 224

from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint

model = models.Sequential([
    layers.Conv2D(8, (3, 3), activation='relu', input_shape=(224,224,1)),
    layers.MaxPooling2D((2, 2)),
    

    layers.Conv2D(16, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    

    layers.Flatten(),
    
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax') 
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
!df -h

In [None]:
checkpoint_callback = ModelCheckpoint(base_path+'model.keras',save_best_only=True, save_weights_only=False, mode='min', verbose=1)

epochs = 100

for i in range(100):
    j = 0
    for X_train, Y_train, X_val, Y_val in data_generator():
        j+=1
        print(f" Epochs {i+1}/{epochs} - Datasets: {j}/1042")
        cnn = model.fit(X_train,Y_train, epochs=1, callbacks=[checkpoint_callback], batch_size=8, validation_data=(X_val, Y_val), verbose=1)



In [68]:

cnn = model.fit(dataset, epochs=50, steps_per_epoch=1024)

Epoch 1/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.6091 - loss: 1.0389
Epoch 2/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.6893 - loss: 0.9743
Epoch 3/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.6124 - loss: 1.0860
Epoch 4/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.4691 - loss: 1.3168
Epoch 5/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.6202 - loss: 1.0777
Epoch 6/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.6091 - loss: 1.0390
Epoch 7/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.6893 - loss: 0.9740
Epoch 8/50
[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.6124 - loss: 1.0859
Epoch 9/50

In [53]:
for image, label in dataset.take(1):
    print("Image shape:", image.numpy().shape)
    print("Label:", label.numpy())

Image shape: (32, 224, 224, 1)
Label: [7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
