In [1]:
# Install the project's dependencies listed in ../requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.

%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
print("Preparing features and labels...")
import cudf
import cupy as cp
from cuml.preprocessing import LabelEncoder

# Load the cleaned dataset into a cuDF DataFrame.
df = cudf.read_csv('../data/processed_file_cleaned.csv')

# Feature matrix: everything except the label columns. Only the label columns
# actually present in the frame are passed to drop(), so an absent 'Label'
# column does not make the drop fail.
columns_to_drop = list(
    filter(lambda name: name in df.columns, ['Label', 'mapped_label'])
)
X = df.drop(columns=columns_to_drop)

# Target column.
y = df['mapped_label']

# Encode the target with cuML's LabelEncoder; the fitted encoder is kept in
# `le` so it stays available after this cell.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Features and labels prepared successfully!")

Preparing features and labels...
Features and labels prepared successfully!


In [None]:
print("Splitting the dataset into training and testing sets...")

import os

from cuml.model_selection import train_test_split
import pandas as pd

# cuDF DataFrame + cupy/cudf Series -> everything stays on the GPU.
# 80/20 split, seeded for reproducibility and stratified on the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded  # works with a cuDF Series
)

print("Dataset split successfully!")

print("\nSaving the datasets...")

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
output_dir = "../data/train_test"
os.makedirs(output_dir, exist_ok=True)

# Features: converted to pandas before being written as CSV.
X_train.to_pandas().to_csv(f"{output_dir}/X_train.csv", index=False)
X_test.to_pandas().to_csv(f"{output_dir}/X_test.csv", index=False)

# Labels: converted to NumPy and wrapped in a single-column pandas DataFrame.
# The column has no explicit name, so the CSV header is the default "0".
pd.DataFrame(y_train.to_numpy()).to_csv(f"{output_dir}/y_train.csv", index=False)
pd.DataFrame(y_test.to_numpy()).to_csv(f"{output_dir}/y_test.csv", index=False)

print("Dataset saved in ../data/train_test/")