In [175]:
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.ensemble import BaseEnsemble
from scipy.stats import mode
import numpy as np

In [None]:
from rolling_lookahead_dt_pulp import rollo_oct_pulp
from sklearn.utils.validation import check_is_fitted
#from rolling_lookahead_dt_pulp.oct.optimal_tree_pulp import predict_model_pulp

from rolling_lookahead_dt_pulp.rolling_tree.rolling_optimize_pulp import rolling_optimize_pulp
from rolling_lookahead_dt_pulp.oct.tree import *
from rolling_lookahead_dt_pulp.oct.optimal_tree_pulp import *
from helpers.helpers import preprocess_dataframes

# Limitation of this approach: training on the training data (which should
# yield just the model) cannot be done in isolation, because the test data are
# unfortunately predicted with that model in the same step.
# As a result, the finished model cannot be evaluated on other test data.

class CustomTreeWrapper(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around the PuLP rolling-lookahead decision tree.

    A depth-2 tree is always trained first. When ``depth`` is larger than 2 the
    tree is extended up to ``depth`` with ``rolling_optimize_pulp``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames handed to ``preprocess_dataframes``. ``train_data`` is replaced
        by the data passed to ``fit``. ``test_data`` is evaluated during
        ``fit`` and its results are stored in ``result_dict``.
    depth : int or None, default=None
        Target depth of the tree. ``None`` selects the base depth of 2.
        Values below 2 are rejected by ``fit``.
    criterion : str, default='gini'
        Forwarded to ``generate_model_pulp`` and ``rolling_optimize_pulp``.
    target_label, features
        Forwarded unchanged to ``preprocess_dataframes``.
    time_limit : int, default=1800
        Forwarded to ``rolling_optimize_pulp``.
    big_m : int, default=99
        Forwarded to ``generate_model_pulp``.

    Attributes
    ----------
    P : list of int
        Names of all non-``'y'`` columns of the preprocessed training frame,
        converted to ``int``.
    K : list
        Sorted distinct labels found in the preprocessed train and test frames.
    main_model
        Model dictionary returned by ``generate_model_pulp`` and, after
        ``fit``, by ``train_model_pulp``.
    result_dict : dict
        Per-depth results: ``result_dict['tree'][d]`` holds the trained model
        and the train/test predictions, ``result_dict[d]`` the accuracies.
    is_fitted_ : bool
        Set to ``True`` at the end of ``fit``.
    """

    # Depth of the tree that is always trained before any rolling extension.
    _BASE_DEPTH = 2

    def __init__(self, train_data, test_data, depth=None, criterion='gini', target_label=None, features=None, time_limit = 1800, big_m = 99):
        self.depth = depth
        self.criterion = criterion
        self.test_data = test_data
        self.train_data = train_data
        self.target_label = target_label
        self.features = features
        self.time_limit = time_limit
        self.big_m = big_m
        # NOTE(review): scikit-learn recommends keeping __init__ free of logic.
        # The model is still built here so that P, K, main_model and
        # result_dict remain available right after construction.
        self.construct()

    def _resolve_depth(self):
        """Return the effective target depth.

        Raises
        ------
        ValueError
            If ``depth`` is smaller than the base depth of 2.
        """
        if self.depth is None:
            return self._BASE_DEPTH
        if self.depth < self._BASE_DEPTH:
            raise ValueError(
                f"depth must be None or an integer >= {self._BASE_DEPTH}, got {self.depth!r}"
            )
        return self.depth

    def _preprocess(self):
        """Run ``preprocess_dataframes`` on the current train and test data."""
        return preprocess_dataframes( #./rollo_oct/utils/helpers.py
            train_df = self.train_data,
            test_df = self.test_data,
            target_label = self.target_label,
            features = self.features)

    @staticmethod
    def _feature_ids(train):
        """Return the names of all non-``'y'`` columns of ``train`` as ints."""
        return [int(col) for col in train.columns[train.columns != 'y']]

    @staticmethod
    def _accuracy(result):
        """Return the fraction of rows whose ``prediction`` equals ``y``."""
        hits = result.loc[result["prediction"] == result["y"]]
        return len(hits) / len(result["y"])

    def construct(self):
        """Build a fresh, untrained base model from ``train_data``/``test_data``.

        Resets ``P``, ``K``, ``result_dict`` and ``main_model``.
        """
        train, test = self._preprocess()

        # Labels are collected before the columns are renamed below.
        df = pd.concat([train, test])
        self.P = self._feature_ids(train)
        train.columns = ["y", *self.P]
        test.columns = ["y", *self.P]
        self.K = sorted(list(set(df.y)))

        # One entry per tree depth; the base depth is always present.
        self.result_dict = {'tree': {self._BASE_DEPTH: {}}}

        # generate model
        self.main_model = generate_model_pulp(P=self.P, K=self.K, data=train, y_idx=0, big_m=self.big_m, criterion=self.criterion)

    def fit(self, X, y):
        """Train the tree on ``X``/``y`` and evaluate it on ``test_data``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns.
        y : pandas.Series
            Target column; it is concatenated in front of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``depth`` is smaller than 2 (checked before any solver run).
        """
        # Validate up front so a bad depth does not surface only after the
        # solver has already run.
        target_depth = self._resolve_depth()

        self.train_data = pd.concat([y, X], axis=1, ignore_index=False)

        # Rebuild the base model from the data actually being fitted. The one
        # created in __init__ was generated from the constructor's train_data,
        # and a previous fit() would have left a trained model plus stale
        # results behind.
        self.construct()

        train, test = self._preprocess()
        self.P = self._feature_ids(train)

        base = self._BASE_DEPTH
        self.main_model = train_model_pulp(model_dict=self.main_model, data=train, P=self.P)
        self.result_dict['tree'][base]['trained_dict'] = self.main_model

        # predict model
        result_train = predict_model_pulp(data=train, model_dict=self.main_model, P=self.P)
        misclassified_leafs = find_misclassification(df=result_train)
        result_test = predict_model_pulp(data=test, model_dict=self.main_model, P=self.P)

        self.result_dict['tree'][base]['train'] = result_train[['y', 'prediction', 'leaf']]
        self.result_dict['tree'][base]['test'] = result_test[['y', 'prediction', 'leaf']]
        self.result_dict[base] = {
            "training_accuracy": self._accuracy(result_train),
            "test_accuracy": self._accuracy(result_test),
        }

        # Presumably predict_model_pulp adds these columns to the frames it is
        # given; they are removed again before the frames are reused.
        train = train.drop(["prediction", "leaf"], axis=1)
        test = test.drop(["prediction", "leaf"], axis=1)

        if target_depth > base:
            self.result_dict = rolling_optimize_pulp(predefined_model=self.main_model,
                                            train_data=train,
                                            test_data=test,
                                            main_depth=base,
                                            target_depth=target_depth,
                                            features=self.P,
                                            time_limit=self.time_limit,
                                            to_go_deep_nodes=misclassified_leafs,
                                            result_dict=self.result_dict,
                                            criterion=self.criterion)

        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Predict labels for ``X`` with the tree stored for the target depth.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns, laid out like the ``X`` passed to ``fit``.

        Returns
        -------
        pandas.Series
            The ``prediction`` column of the frame returned by
            ``predict_model_pulp``.
        """
        check_is_fitted(self, 'is_fitted_')

        model_dict = self.result_dict['tree'][self._resolve_depth()]['trained_dict']

        # A placeholder 'y' column is put in front of the features because the
        # true labels are unknown at prediction time.
        placeholder = pd.DataFrame({'y': [None]*len(X)}, index=X.index)
        frame = pd.concat([placeholder, X], axis=1)

        res = predict_model_pulp(data=frame, model_dict=model_dict, P=self.P)
        return res['prediction']

In [177]:
# Archived first draft of a bagging ensemble. It is kept only as an inert
# string literal, so nothing inside it is executed or defined.
# NOTE(review): the draft refers to CustomTreeClassifier, which is not defined
# in the visible cells, and selects bootstrap rows with X[indices].
"""
class CustomRandomForest(BaseEnsemble, ClassifierMixin):
    def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, random_state=None):
        super().__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            random_state=random_state,
        )
        self.max_depth = max_depth

    def _validate_estimators(self):
        # This ensures base_estimator is cloned n_estimators times
        super()._validate_estimators()
        if self.base_estimator is None:
            self.base_estimator = CustomTreeClassifier(max_depth=self.max_depth)

    def fit(self, X, y):
        self._validate_estimators()
        self.estimators_ = []
        n_samples = X.shape[0]
        rng = np.random.RandomState(self.random_state)

        for i in range(self.n_estimators):
            # Bootstrap sample indices
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            X_sample, y_sample = X[indices], y[indices]
            estimator = clone(self.base_estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators_.append(estimator)
        return self

    def predict(self, X):
        # Gather predictions from all trees
        predictions = np.array([tree.predict(X) for tree in self.estimators_])
        maj_vote, _ = mode(predictions, axis=0)
        return maj_vote.ravel()
        """

'\nclass CustomRandomForest(BaseEnsemble, ClassifierMixin):\n    def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, random_state=None):\n        super().__init__(\n            base_estimator=base_estimator,\n            n_estimators=n_estimators,\n            random_state=random_state,\n        )\n        self.max_depth = max_depth\n\n    def _validate_estimators(self):\n        # This ensures base_estimator is cloned n_estimators times\n        super()._validate_estimators()\n        if self.base_estimator is None:\n            self.base_estimator = CustomTreeClassifier(max_depth=self.max_depth)\n\n    def fit(self, X, y):\n        self._validate_estimators()\n        self.estimators_ = []\n        n_samples = X.shape[0]\n        rng = np.random.RandomState(self.random_state)\n\n        for i in range(self.n_estimators):\n            # Bootstrap sample indices\n            indices = rng.choice(n_samples, size=n_samples, replace=True)\n            X_sample, y_samp

In [None]:
# Second archived draft of the bagging ensemble, again stored as an inert
# string literal (not executed).
# NOTE(review): its default base estimator is created without train_data,
# which CustomTreeWrapper.__init__ takes as a required argument.
"""
# just archived
class CustomRandomForest(BaseEnsemble, ClassifierMixin):
    def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, 
                 random_state=None, test_data=None, target_label=None, features=None, criterion='gini'):
        super().__init__(n_estimators=n_estimators)
        self.random_state = random_state  # we store this ourselves
        self.base_estimator = base_estimator
        self.max_depth = max_depth
        self.test_data = test_data
        self.target_label = target_label
        self.features = features
        self.criterion = criterion

    def _validate_estimators(self):
        super()._validate_estimators()
        # Provide default base_estimator if none given
        if self.base_estimator is None:
            self.base_estimator = CustomTreeWrapper(
                depth=self.max_depth,
                criterion=self.criterion,
                test_data=self.test_data,
                target_label=self.target_label,
                features=self.features,
            )

    def fit(self, X, y):
        self._validate_estimators()
        self.estimators_ = []
        rng = np.random.RandomState(self.random_state)
        n_samples = X.shape[0]
        
        for i in range(self.n_estimators):
            # Bootstrap sample indices
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            X_sample, y_sample = X.iloc[indices], y.iloc[indices]

            # Clone base estimator to get fresh copy
            estimator = clone(self.base_estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators_.append(estimator)
        return self
    
    def predict(self, X):
        check_is_fitted(self, 'estimators_')
        
        # Collect predictions from each tree
        predictions = np.array([tree.predict(X) for tree in self.estimators_])
        
        
        # Majority vote along first axis (trees)
        maj_vote, _ = mode(predictions, axis=0)
        return maj_vote.ravel()
"""

In [179]:
from sklearn.model_selection import train_test_split

# Read the seismic data set; the target lives in column 'y'.
data_seismic = pd.read_csv("datasets/seismic/seismic_bin.csv")

y = data_seismic['y']               # target column only
X = data_seismic.drop('y', axis=1)  # every remaining column is a feature

# Stratified hold-out split with 100 test rows and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=100, stratify=y, random_state=42
)

# Target-first frames (label column followed by the features), the same
# layout the wrapper builds internally from X and y.
train_data = stacked_train = pd.concat([y_train, X_train], axis=1)
test_data = stacked_test = pd.concat([y_test, X_test], axis=1)

feature_columns = X_train.columns

In [180]:
# Build the wrapper for a depth-3 tree and train it on the training split.
wrapper = CustomTreeWrapper(
    train_data=train_data,
    test_data=test_data,
    target_label='y',
    features=feature_columns,
    criterion='gini',
    depth=3,
)
# The trailing semicolon keeps Jupyter from echoing repr(self), which fit() returns.
wrapper.fit(X_train, y_train);


{'leaf_nodes': [4, 5, 6, 7], 'leaf_nodes_path': {4: [1, 1], 5: [1, 0], 6: [0, 1], 7: [0, 0]}}
Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/drood/Obsidian/Files/Bachelorarbeit/rlrf_my_try/.venv/lib/python3.12/site-packages/pulp/apis/../solverdir/cbc/linux/i64/cbc /tmp/7f538600bcdc49f2bce6e6706ceda76a-pulp.mps -sec 1800 -timeMode elapsed -branch -printingOptions all -solution /tmp/7f538600bcdc49f2bce6e6706ceda76a-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 54 COLUMNS
At line 22109 RHS
At line 22159 BOUNDS
At line 26578 ENDATA
Problem MODEL has 49 rows, 4418 columns and 8836 elements
Coin0008I MODEL read with 0 errors
seconds was changed from 1e+100 to 1800
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0.112518 - 0.00 seconds
Cgl0004I processed model has 0 rows, 0 columns (0 integer (0 of which binary)) and 0 elements
Cbc3007W No integer variables - nothing to do
Cuts 

  x = np.array(i[P]) #array of all feature values \in {0,1} of that row
  x = np.array(i[P]) #array of all feature values \in {0,1} of that row


{'leaf_nodes': [4, 5, 6, 7], 'leaf_nodes_path': {4: [1, 1], 5: [1, 0], 6: [0, 1], 7: [0, 0]}}
Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/drood/Obsidian/Files/Bachelorarbeit/rlrf_my_try/.venv/lib/python3.12/site-packages/pulp/apis/../solverdir/cbc/linux/i64/cbc /tmp/a9612d2826f74a32babbd626729d7ce2-pulp.mps -sec 1800 -timeMode elapsed -branch -printingOptions all -solution /tmp/a9612d2826f74a32babbd626729d7ce2-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 54 COLUMNS
At line 22119 RHS
At line 22169 BOUNDS
At line 26588 ENDATA
Problem MODEL has 49 rows, 4418 columns and 8836 elements
Coin0008I MODEL read with 0 errors
seconds was changed from 1e+100 to 1800
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0.240947 - 0.00 seconds
Cgl0004I processed model has 0 rows, 0 columns (0 integer (0 of which binary)) and 0 elements
Cbc3007W No integer variables - nothing to do
Cuts 

  x = np.array(i[P]) #array of all feature values \in {0,1} of that row
  x = np.array(i[P]) #array of all feature values \in {0,1} of that row


In [181]:
# Predict labels for the held-out feature rows with the fitted wrapper.
predictions = wrapper.predict(X_test)
print(predictions)

1660    1
1216    1
2483    1
248     1
455     1
       ..
21      1
1073    1
665     1
1192    1
1250    1
Name: prediction, Length: 100, dtype: int64


  x = np.array(i[P]) #array of all feature values \in {0,1} of that row


In [182]:
# Archived usage example for CustomRandomForest, stored as an inert string
# literal (not executed). That class itself only exists inside the archived
# string cells of this notebook.
"""
# Instantiate custom random forest using same test data fixed at construction
forest = CustomRandomForest(
    n_estimators=5,
    max_depth=3,
    random_state=42,
    test_data=test_data,
    target_label='y',
    features=feature_columns,
    criterion='gini',
)
# Fit on train data only - internally, each base estimator will use complete test data from constructor
forest.fit(X_train, y_train)

# Predict on test data (must be the same test_data-frame used)
forest_preds = forest.predict(test_data[feature_columns])
print("Random forest predictions:", forest_preds)
"""


'\n# Instantiate custom random forest using same test data fixed at construction\nforest = CustomRandomForest(\n    n_estimators=5,\n    max_depth=3,\n    random_state=42,\n    test_data=test_data,\n    target_label=\'y\',\n    features=feature_columns,\n    criterion=\'gini\',\n)\n# Fit on train data only - internally, each base estimator will use complete test data from constructor\nforest.fit(X_train, y_train)\n\n# Predict on test data (must be the same test_data-frame used)\nforest_preds = forest.predict(test_data[feature_columns])\nprint("Random forest predictions:", forest_preds)\n'