In [6]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Control Dataset Function

In [8]:
def create_control_dataset(n_samples=1000, random_state=42,
                           transform_x1=1.0, transform_x2=1.0):
    """
    Generate a two-feature binary classification dataset together with a
    per-feature rescaled copy of the same samples.

    Both features are drawn uniformly from [0, 10); the label is 1 when
    X1 + X2 > 10 and 0 otherwise. The label is computed from the unscaled
    features, so it is shared by both feature matrices.

    Parameters:
    - n_samples: number of samples
    - random_state: seed for the sampling (forwarded to np.random.RandomState)
    - transform_x1: scaling factor for feature X1
    - transform_x2: scaling factor for feature X2

    Returns:
    - X_orig: original features (n_samples x 2)
    - X_transformed: transformed features (scaled) (n_samples x 2)
    - y: binary target (0 or 1)
    """
    # Draw from a private RandomState rather than calling np.random.seed():
    # for a given seed it yields the same stream as the seeded global RNG,
    # but leaves the process-wide NumPy random state untouched.
    rng = np.random.RandomState(random_state)
    X1 = rng.uniform(0, 10, size=n_samples)
    X2 = rng.uniform(0, 10, size=n_samples)

    # Label depends only on the unscaled features.
    y = (X1 + X2 > 10).astype(int)

    X_orig = np.stack([X1, X2], axis=1)
    X_transformed = np.stack([X1 * transform_x1, X2 * transform_x2], axis=1)

    return X_orig, X_transformed, y

#### Experimental Setup

In [12]:
# Build the control dataset. X1 keeps its original scale while X2 is
# multiplied by 10, so the two feature matrices differ only in column scale.
X_orig, X_transformed, y = create_control_dataset(
    n_samples=1000,
    random_state=42,
    transform_x1=1,   # experiment knob: leave feature X1 unscaled
    transform_x2=10,  # experiment knob: scale feature X2 by a factor of 10
)

In [14]:
# Preview the first 10 samples of each array: original features, scaled
# features, then targets. A single print with a newline separator emits each
# heading and array on its own line, with a blank line before later headings.
print(
    "First 10 rows of X_orig:",
    X_orig[:10],
    "\nFirst 10 rows of X_transformed:",
    X_transformed[:10],
    "\nFirst 10 values of y:",
    y[:10],
    sep="\n",
)

First 10 rows of X_orig:
[[3.74540119 1.85132929]
 [9.50714306 5.41900947]
 [7.31993942 8.72945836]
 [5.98658484 7.32224886]
 [1.5601864  8.06561148]
 [1.5599452  6.58783367]
 [0.58083612 6.92276565]
 [8.66176146 8.49195652]
 [6.01115012 2.49668009]
 [7.08072578 4.89424964]]

First 10 rows of X_transformed:
[[ 3.74540119 18.51329288]
 [ 9.50714306 54.19009474]
 [ 7.31993942 87.29458359]
 [ 5.98658484 73.22248864]
 [ 1.5601864  80.65611479]
 [ 1.5599452  65.87833667]
 [ 0.58083612 69.22765645]
 [ 8.66176146 84.91956516]
 [ 6.01115012 24.96680089]
 [ 7.08072578 48.94249636]]

First 10 values of y:
[0 1 1 1 0 0 0 1 0 1]


In [9]:
# Split row indices (not the feature matrices themselves) so that the original
# and the transformed features end up with exactly the same train/test rows.
X_train_idx, X_test_idx, y_train, y_test = train_test_split(
    np.arange(len(y)),
    y,
    test_size=0.3,
    random_state=42,
)

# Materialise both views of the data from the shared index split.
X_train_orig, X_test_orig = X_orig[X_train_idx], X_orig[X_test_idx]
X_train_trans, X_test_trans = X_transformed[X_train_idx], X_transformed[X_test_idx]

#### Run Experiments

In [10]:
# RandomForestClassifier and accuracy_score are already imported in the
# notebook's first (imports) cell, so they are not re-imported here.


def _fit_and_score(X_train, X_test, y_train, y_test,
                   random_state=42, n_estimators=100):
    """
    Fit a RandomForestClassifier on one train split and score it on the
    matching test split.

    Parameters:
    - X_train, X_test: feature matrices for the train / test rows
    - y_train, y_test: targets for the train / test rows
    - random_state: seed passed to the classifier
    - n_estimators: number of trees passed to the classifier

    Returns:
    - model: the fitted classifier
    - y_pred: predictions for X_test
    - accuracy: accuracy_score(y_test, y_pred)
    """
    model = RandomForestClassifier(random_state=random_state,
                                   n_estimators=n_estimators)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, y_pred, accuracy_score(y_test, y_pred)


# Both fits share the same seed, hyperparameters and row split, so the only
# difference between the two runs is the scaling of the features.

# Train RF on original data
rf_orig, y_pred_orig, acc_orig = _fit_and_score(
    X_train_orig, X_test_orig, y_train, y_test
)

# Train RF on transformed data
rf_trans, y_pred_trans, acc_trans = _fit_and_score(
    X_train_trans, X_test_trans, y_train, y_test
)

# Report results
print(f"Accuracy on original data:   {acc_orig:.3f}")
print(f"Accuracy on transformed data: {acc_trans:.3f}")


Accuracy on original data:   0.983
Accuracy on transformed data: 0.983
