In [5]:
# rockfall-prediction-system/notebooks/02_data_preprocessing.py

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the raw rockfall dataset from the project's data directory.
df = pd.read_csv('../data/rockfall_data.csv')

# 'rockfall_risk' is the prediction target (y); every remaining column is
# kept as a feature (X).
y = df['rockfall_risk']
X = df.drop(columns=['rockfall_risk'])

# Map the risk labels to integer codes. The fitted encoder is kept so the
# codes can be translated back to the original labels via `classes_`.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
print("Original Labels:", label_encoder.classes_)
print("Encoded Labels:", np.unique(y_encoded))


# Hold out 20% of the rows for testing. Stratifying on the encoded target
# keeps the class proportions approximately equal in both partitions, and
# the fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)
print("\nShape of training features:", X_train.shape)
print("Shape of testing features:", X_test.shape)

# Save Processed Data
processed_data_dir = '../data/processed'
# exist_ok=True creates the directory (and any missing parents) when it is
# absent and does nothing when it already exists. This avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(processed_data_dir, exist_ok=True)

# Features are written as-is, without the DataFrame index.
X_train.to_csv(os.path.join(processed_data_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(processed_data_dir, 'X_test.csv'), index=False)

# The targets are wrapped in a Series so they can be written as a
# single-column CSV with a 'rockfall_risk' header.
# NOTE(review): only the integer codes are written here; the code -> label
# mapping held by `label_encoder.classes_` is not saved by this block.
pd.Series(y_train).to_csv(os.path.join(processed_data_dir, 'y_train.csv'), index=False, header=['rockfall_risk'])
pd.Series(y_test).to_csv(os.path.join(processed_data_dir, 'y_test.csv'), index=False, header=['rockfall_risk'])

print("\nProcessed data saved successfully.")

Original Labels: ['Critical' 'High' 'Low' 'Medium']
Encoded Labels: [0 1 2 3]

Shape of training features: (4000, 5)
Shape of testing features: (1000, 5)

Processed data saved successfully.
