In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Build the landslide target, split the data, and preprocess the features.
#
# After this cell runs:
#   * df has a new 0/1 column 'Landslide_Occurred'
#   * X / y hold the raw feature frame and the target series
#   * X_train / X_test are rebound from raw DataFrame slices to the
#     transformed matrices returned by `preprocessor`
#   * preprocessor is fitted on the training rows only

# --- Column roles -------------------------------------------------------------
# These two lists drive the ColumnTransformer. They repeat the columns selected
# into X further down, so the lists and the selection must be kept in sync.
numeric_features = [
    'Elevation (m)',
    'General_slope (degree)',
    'Flow_accumulation_10yr_multiplied_log',
    'Flow_accumulation_20yr_multiplied_log',
    'Flow_accumulation_50yr_multiplied_log',
    'Runoff_10yr',
    'Runoff_20yr',
    'Runoff_50yr',
    # NOTE(review): treated as a continuous value and standard-scaled. If this
    # column holds direction codes rather than a magnitude, it probably
    # belongs in categorical_features instead — confirm against the data.
    'Flow_direction',
]
categorical_features = [
    'Slope_classification',
    'Material',
    'Movement_type',
    'State',
    'Distribution',
    'Style',
]

# --- Load ---------------------------------------------------------------------
df = pd.read_csv('./data/cox_landslide_renamed.csv')

# --- Target -------------------------------------------------------------------
# 1 where 'Landslide_ID' is present, 0 where it is missing.
# NOTE(review): assumes a non-null 'Landslide_ID' means a landslide occurred at
# that row — verify against the dataset's documentation.
df['Landslide_Occurred'] = (~df['Landslide_ID'].isna()).astype(int)

# --- Features / target --------------------------------------------------------
# NOTE(review): 'Material', 'Movement_type', 'State', 'Distribution' and 'Style'
# look like descriptors of a landslide itself; if they are only populated for
# rows that have a 'Landslide_ID', they leak the target — TODO confirm.
X = df[[
    'Elevation (m)',
    'General_slope (degree)',
    'Flow_accumulation_10yr_multiplied_log',
    'Flow_accumulation_20yr_multiplied_log',
    'Flow_accumulation_50yr_multiplied_log',
    'Runoff_10yr',
    'Runoff_20yr',
    'Runoff_50yr',
    'Slope_classification',
    'Flow_direction',
    'Material',
    'Movement_type',
    'State',
    'Distribution',
    'Style',
]]
y = df['Landslide_Occurred']

# --- Train / test split -------------------------------------------------------
# 20% of rows are held out; the fixed seed makes the split repeatable.
# The split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# --- Preprocessing ------------------------------------------------------------
# Numeric columns: standardize with statistics learned during fit.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode. handle_unknown='ignore' makes categories
# that were not seen during fit encode as all zeros instead of raising.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Output column order follows the transformer order: scaled numeric columns
# first, then the one-hot columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- Fit on train, apply to test ----------------------------------------------
# Only the training rows are used to fit the scaler and the encoder; the test
# rows are transformed with those already-fitted parameters.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [7]:
# Inspect the preprocessed training matrix (X_train was rebound to the output of
# preprocessor.fit_transform in the previous cell). print() gives the plain-text
# repr of the whole object rather than a small slice.
print(X_train)

[[-0.46260523 -0.69912651 -0.83691944 ...  0.          0.
   1.        ]
 [-0.21141687 -0.52912614 -0.83691944 ...  0.          0.
   1.        ]
 [-0.37887578 -0.27412559 -0.83691944 ...  0.          0.
   1.        ]
 ...
 [-0.37887578 -0.52912614 -0.83691944 ...  0.          0.
   1.        ]
 [-0.57982647 -1.12412742 -0.87108539 ...  0.          0.
   1.        ]
 [ 0.12350094  1.42587807 -0.83691944 ...  0.          0.
   1.        ]]
