In [107]:
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.ensemble import BaseEnsemble
from scipy.stats import mode
import numpy as np

In [108]:
from rolling_lookahead_dt_pulp import rollo_oct_pulp
from sklearn.utils.validation import check_is_fitted

# was hiermit eben nicht geht ist, dass man auf Trainingsdaten trainiert (was einem das reine Modell geben sollte). Dabei werden aber leider gleichzeitig
# die Testdaten auf diesen Modell predicted
# Das Resultat ist also, dass man nicht andere Testdaten auf dem fertigen modell testen kann



class CustomTreeWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, depth=None, criterion='gini', test_data=None, target_label=None, features=None):
        self.depth = depth
        self.criterion = criterion
        self.test_data = test_data
        self.target_label = target_label
        self.features = features
    
    def fit(self, X, y):
        if self.test_data is None or self.target_label is None or self.features is None:
            raise ValueError("Test data, target label, and features must be set before calling fit")
        
        # Construct train DataFrame or array compatible with rollo_oct_pulp.run interface
        train_df = pd.concat([y, X], axis=1, ignore_index=False)
        #print(train_df)
        # Call your custom run with both train and stored test data
        self.result_dict_ = rollo_oct_pulp.run(
            train=train_df,
            test=self.test_data,
            target_label=self.target_label,
            features=self.features,
            depth=self.depth,
            criterion=self.criterion
        )
        self.is_fitted_ = True
        return self
    
    def predict(self, X):
        check_is_fitted(self, 'is_fitted_')

        res = self.result_dict_['tree'][self.depth]['test']
        #check  = res.iloc[:, :1]
        #check_series = check.squeeze("columns") #convert to series

        #test = pd.DataFrame.equals(X, check_series)

        #if test:
        preds = res['prediction']
        if preds is None:
            raise RuntimeError("No stored predictions found. Run fit first.")
        return preds

In [109]:
"""
class CustomRandomForest(BaseEnsemble, ClassifierMixin):
    def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, random_state=None):
        super().__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            random_state=random_state,
        )
        self.max_depth = max_depth

    def _validate_estimators(self):
        # This ensures base_estimator is cloned n_estimators times
        super()._validate_estimators()
        if self.base_estimator is None:
            self.base_estimator = CustomTreeClassifier(max_depth=self.max_depth)

    def fit(self, X, y):
        self._validate_estimators()
        self.estimators_ = []
        n_samples = X.shape[0]
        rng = np.random.RandomState(self.random_state)

        for i in range(self.n_estimators):
            # Bootstrap sample indices
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            X_sample, y_sample = X[indices], y[indices]
            estimator = clone(self.base_estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators_.append(estimator)
        return self

    def predict(self, X):
        # Gather predictions from all trees
        predictions = np.array([tree.predict(X) for tree in self.estimators_])
        maj_vote, _ = mode(predictions, axis=0)
        return maj_vote.ravel()
        """

'\nclass CustomRandomForest(BaseEnsemble, ClassifierMixin):\n    def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, random_state=None):\n        super().__init__(\n            base_estimator=base_estimator,\n            n_estimators=n_estimators,\n            random_state=random_state,\n        )\n        self.max_depth = max_depth\n\n    def _validate_estimators(self):\n        # This ensures base_estimator is cloned n_estimators times\n        super()._validate_estimators()\n        if self.base_estimator is None:\n            self.base_estimator = CustomTreeClassifier(max_depth=self.max_depth)\n\n    def fit(self, X, y):\n        self._validate_estimators()\n        self.estimators_ = []\n        n_samples = X.shape[0]\n        rng = np.random.RandomState(self.random_state)\n\n        for i in range(self.n_estimators):\n            # Bootstrap sample indices\n            indices = rng.choice(n_samples, size=n_samples, replace=True)\n            X_sample, y_samp

In [110]:
class CustomRandomForest(BaseEnsemble, ClassifierMixin):
    def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, 
                 random_state=None, test_data=None, target_label=None, features=None, criterion='gini'):
        super().__init__(n_estimators=n_estimators)
        self.random_state = random_state  # we store this ourselves
        self.base_estimator = base_estimator
        self.max_depth = max_depth
        self.test_data = test_data
        self.target_label = target_label
        self.features = features
        self.criterion = criterion

    def _validate_estimators(self):
        super()._validate_estimators()
        # Provide default base_estimator if none given
        if self.base_estimator is None:
            self.base_estimator = CustomTreeWrapper(
                depth=self.max_depth,
                criterion=self.criterion,
                test_data=self.test_data,
                target_label=self.target_label,
                features=self.features,
            )

    def fit(self, X, y):
        self._validate_estimators()
        self.estimators_ = []
        rng = np.random.RandomState(self.random_state)
        n_samples = X.shape[0]
        
        for i in range(self.n_estimators):
            # Bootstrap sample indices
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            X_sample, y_sample = X.iloc[indices], y.iloc[indices]

            # Clone base estimator to get fresh copy
            estimator = clone(self.base_estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators_.append(estimator)
        return self
    
    def predict(self, X):
        check_is_fitted(self, 'estimators_')
        
        # Collect predictions from each tree
        predictions = np.array([tree.predict(X) for tree in self.estimators_])
        
        # Majority vote along first axis (trees)
        maj_vote, _ = mode(predictions, axis=0)
        return maj_vote.ravel()

In [111]:
from sklearn.model_selection import train_test_split

data_seismic = pd.read_csv("datasets/seismic/seismic_bin.csv")


X = data_seismic.drop(columns=['y'])  # All columns except the target
y = data_seismic['y']                 # Only the target column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, stratify=y, random_state=42)

stacked_train = pd.concat([y_train, X_train], axis=1, ignore_index=False)
stacked_test = pd.concat([y_test, X_test],axis=1, ignore_index=False)

train_data = stacked_train
test_data = stacked_test

#feature_columns = train_data.columns[1:]

feature_columns = X_train.columns
#print(feature_columns)

In [112]:
"""
# Usage outside:
wrapper = CustomTreeWrapper(
    depth=3,
    criterion='gini',
    test_data=test_data,
    target_label='y',
    features=feature_columns
)
wrapper.fit(X_train, y_train); #semicolon for suppressing when interactive environment (Jupyter) tries to display the returned object, suppresses repr(self)
"""

"\n# Usage outside:\nwrapper = CustomTreeWrapper(\n    depth=3,\n    criterion='gini',\n    test_data=test_data,\n    target_label='y',\n    features=feature_columns\n)\nwrapper.fit(X_train, y_train); #semicolon for suppressing when interactive environment (Jupyter) tries to display the returned object, suppresses repr(self)\n"

In [113]:
#predictions = wrapper.predict(X_test) #X_test must be the same that was used to create test_data; otherwise the logic of the wrapper breaks and will not give correct result
#print(predictions)

In [114]:
# Instantiate custom random forest using same test data fixed at construction
forest = CustomRandomForest(
    n_estimators=5,
    max_depth=3,
    random_state=42,
    test_data=test_data,
    target_label='y',
    features=feature_columns,
    criterion='gini',
)
# Fit on train data only - internally, each base estimator will use complete test data from constructor
forest.fit(X_train, y_train)

# Predict on test data (must be the same test_data-frame used)
forest_preds = forest.predict(test_data[feature_columns])
print("Random forest predictions:", forest_preds)


AttributeError: 'super' object has no attribute '_validate_estimators'