In [290]:
import pandas as pd
import time
import logging

import sys
from pathlib import Path

# Go up one directory to get to master/
project_root = str(Path.cwd().parent)
sys.path.append(project_root)

from rolling_lookahead_dt_pulp import rollo_oct_pulp

there are options, e.g.:


# Binarizer: Converts numeric features to binary (0/1) based on a threshold.

from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=0.5)

X_binary = binarizer.fit_transform(X)


# OneHotEncoder: Converts categorical features to binary (one-hot encoding).


from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()

X_binary = enc.fit_transform(X_categorical)



# for which kind of data doesnt make_data_binary work?

The make_data_binary function as written is not suitable for all types of data. Here are cases where it may fail or be inappropriate:

Text Data with Multiple Tokens or Lists:
If your DataFrame contains columns with lists of strings (e.g., ['one', 'two', 'three']), or multi-term entries, the function will not properly transform them into binary features. It expects each cell to be a single value, not a list or multi-valued string.

Ordinal Categorical Data:
The function treats all non-binary categorical columns as nominal (no order), and applies one-hot encoding. If your data has ordinal categories (e.g., 'low', 'medium', 'high'), this encoding discards the order information, which may not be optimal for modeling.

High-Cardinality Categorical Data:
If a categorical column has a very large number of unique values, one-hot encoding will create many new columns, leading to high dimensionality and computational inefficiency.

Continuous Numerical Data:
The function does not handle continuous numerical features (like age, income) well, as it will one-hot encode them if they have more than two unique values, which is almost never desired for continuous variables.

Missing Values in Target Column:
The function does not explicitly handle missing values in the 'y' column, only in features. If 'y' has missing values, the function may fail or produce unexpected results.

Non-Standard Data Types:
If your DataFrame contains columns with complex data types (e.g., dictionaries, custom objects), the function will not process them correctly.

Columns with Non-Binary Unique Values but Not Categorical:
If a column has exactly two unique values but is not meant to be treated as binary (e.g., a continuous variable with only two unique values due to sampling), the function will still treat it as binary, which may not be appropriate.

In summary:
The function works best for DataFrames with simple categorical and binary features, with a single, well-defined target column. It is not suitable for text with multiple terms, ordinal data, high-cardinality categoricals, continuous numerical features, complex data types, or missing values in the target column.

# Example Workflow

Input: DataFrame with mixed types, some missing values, and a target column 'y'.

Output: DataFrame with all features binary/one-hot encoded, 'y' as the first column, and columns renamed.

## Key Points

Missing values are handled by mode imputation.

Binary columns are ensured to be 0 and 1.

Non-binary columns are one-hot encoded.

The target column 'y' is made numerical and moved to the front.

Columns are renamed for consistency.

Logging is used throughout for transparency.



# Was man beachten muss:

needs target label column to be named 'y' => aus dem Datensatz immer erstmal dieses Spaltenlabel umbenennen

Es wird kompliziert, wenn die Target labels (jetzt 'y') im Datensatz nicht in der ersten Spalte stehen.
make_data_binary platziert y in der ersten Spalte und benennt alle features in 1,2,3... um. 
Das macht spätere zuordnung schwieriger bzw. man muss halt darauf achten

=> es ist also vll. schlau jeden Datensatz vor make_data_binary erstmal so zu prozessieren, dass Target label in erster Spalte steht. Zusätzlich kann man hier dann auch das umbenennen der Target label Spalte direkt machen => also evtl einfach das als helper Funktion implementieren
=> dann ergibt auch das spätere

#get features
feature_columns = train_data.columns[1:] #assuming labels are in first column

entsprechend Sinn

das ist dann zwar redundanter code, aber kostet keine Zeit
Generell gibts von denen auch einigen redundanten code. Zum Teil doppelt sich das was das preprocessing tut/prüft mit dem was make_data_binary tut

In [291]:
# muss dann in helpers.py kopiert werden

# moves cloumn with target labels to first column and renames it to 'y' and return reordered dataframe
def move_targets_to_front_and_rename(data: pd.DataFrame, target_label='y') -> pd.DataFrame:
    data.rename(columns={target_label: 'y'}, inplace=True)
    if data.columns[0] != 'y': # Checks if 'y' is not the first column; hier wurde vorher aus irgendeinem Grund data.columns[-1] != "y" geprüft, also ob 'y' in letzter Spalte war
        logging.info("Reordering y column at the beginning of data.")
        cols_ = list(data.columns)
        cols_.remove('y')
        cols_.insert(0, 'y')
        data = data[cols_]
    return data

In [292]:
# The function transforms input data (a pandas DataFrame) into a binary/one-hot encoded format suitable for certain machine learning tasks.
# It handles missing values, binary columns, categorical columns, and the target column separately.
# needs target label column to be named 'y'

def make_data_binary(data: pd.DataFrame) -> pd.DataFrame:
    """

    :param data: input data
    :return: data with binary columns
    """
    cols_with_missing = [col for col in data.columns #Identify Columns with Missing Values
                         if data[col].isnull().any()]
    if cols_with_missing:
        for col in cols_with_missing:
            data[col].fillna(data[col].mode()[0], inplace=True) # Replace Missing Values with Mode: For each column with missing values, fill them with the most frequent value (mode).
        logging.info("""There are columns with missing
            values.\nColumns are: {0}\n Replacing with mode. 
            """.format(cols_with_missing))
    
    

    binary_cols = [cname for cname in data.columns if # Find Columns with Exactly 2 Unique Values (excluding 'y');  list of column names that have exactly two unique values (excluding 'y')
                   data[cname].nunique() == 2 and cname != 'y']

    for col in binary_cols: # Convert Non-Integer Binary Columns to Integer: If a binary column is not already an integer type, convert it using category codes.
        if data[col].dtype not in ['int8', 'int16']: # check data type
            logging.info(f"Column {col} is not int type. Transforming it into "
                         f"integer.")
            data[col] = data[col].astype('category').cat.codes
            # astype('category'): Converts the column to a pandas "category" type for categorical variables (integer codes; cat.codes) (limited, fixed set of possible values). This is useful for columns with a small number of unique values.

    # Ensure Binary Columns are 0 and 1: If the unique values do not sum to 1 (i.e., not already 0 and 1), remap them to 0 and 1.
    if binary_cols: 
        for col in binary_cols:
            # if sum of unique entries is not equal to 1
            if sum(data[col].unique()) != 1:
                replace = {data[col].unique()[0]: 0, # Create a mapping dictionary where the first unique value maps to 0 and the second to 1. Apply this mapping to the entire column, converting all values to 0 or 1.
                           data[col].unique()[1]: 1} # Example: If the column has values ['A', 'B'], it will be mapped to {'A': 0, 'B': 1}
                data[col] = [replace[item] for item in data[col]]
        logging.info("There are {0} binary columns. \nColumns are: {1}".format(
            len(binary_cols), binary_cols))
    else:
        logging.info("No binary columns.")
        # so now it is ensured that binary columns contain strict binary (0/1) encoding


    total_col = 0 #expected total number of columns after all transformations; total number of columns after one-hot encoding and other transformations


    # Log the Number of Unique Values for Each Non-Target Column
    # non-binary columns (excluding 'y'), sum the number of unique values
    # This is used to check if one-hot encoding later produces the expected number of columns.
    for col in data.columns:
        if col != "y":
            logging.info("Column: {0} - Unique Values: {1}".format( #For each non-target column, log its name and the number of unique values it contains.
                col, data[col].nunique()))
            if data[col].nunique() != 2:
                total_col += data[col].nunique() #if a column does NOT have exactly 2 unique values (i.e., it’s not a binary column), add its number of unique values to total_col.
    total_col += len(binary_cols) + 1
    # This is preparation for one-hot encoding:
    # For categorical columns, one-hot encoding will create as many new columns as there are unique values.
    # For binary columns, no extra columns are needed beyond the original (since they are already handled separately).


    #the one hot encoding
    for col in data.columns:
        if col not in binary_cols and col != "y": #for Each Non-Binary, Non-Target Column:
            dummy_col = pd.get_dummies(data[col], prefix=col,dtype=int) #Creates a new DataFrame (dummy_col) where the original column is split into multiple binary (0/1) columns, one for each unique value in the original column
            data = pd.concat([data, dummy_col], axis=1) # adds new dummy columns to the original DataFrame
            data = data.drop(col, axis=1) # Removes the original column from the DataFrame, since it got replaced by one-hot encoding

    if total_col != data.shape[1]: # If the expected number of columns does not match the actual, log an error and return None
        logging.error("# of expected column is not equal to actual.")
        return None

    # Convert Target Column 'y' to Categorical Codes
    if data.y.dtype == "O": # checks if the column 'y' is of type object (usually strings or mixed types)
        logging.info("Converting y values into numerical.")
        data['y'] = data['y'].astype('category') # converts the column to a categorical type
        data['y'] = data['y'].cat.codes # replaces each unique value in 'y' with a numerical code (e.g., "cat" → 0, "dog" → 1, etc.)

    # ensure Target Values Are At Least 1, if not shift all values up by 1
    if data.y.min() < 1:
        data['y'] = data['y'] + 1
    # Reason: Some algorithms or libraries expect labels to start at 1 rather than 0

    # move Target Column 'y' to the Front
    if data.columns[0] != "y": # Checks if 'y' is not the first column; hier wurde vorher aus irgendeinem Grund data.columns[-1] != "y" geprüft, also ob 'y' in letzter Spalte war
        logging.info("Reordering y column at the beginning of data.")
        cols_ = list(data.columns)
        cols_.remove('y')
        cols_.insert(0, "y")
        data = data[cols_]

    # Rename All Columns Except 'y' to 1, 2, 3, ..., based on their position
    logging.info("Renaming columns..")
    column_indices = [i for i in range(1, len(data.columns))]
    new_names = column_indices
    old_names = data.columns[column_indices]
    data.rename(columns=dict(zip(old_names, new_names)), inplace=True)
    logging.info("Data is binarized.")
    return data

# Implementierung für preprocessing numerical features
following approach in paper:
"For numerical features, we follow a two-step approach. If the number of unique values observed for a feature is less than 7, then it is treated as categorical. Otherwise, it is discretized into equal-sized groups based on sample quantiles."

After this one hot encoding is used on all features

In [293]:
# implementation of preprocessing for "For numerical features, we follow a two-step approach. If the number of unique values observed for a feature is less than 7, then it is treated as categorical. Otherwise, it is discretized into equal-sized groups based on sample quantiles."

def preprocess_numerical(data: pd.DataFrame) -> pd.DataFrame:
    binary_cols = [cname for cname in data.columns if data[cname].nunique() == 2] # Find Columns with Exactly 2 Unique Values; list of column names that have exactly two unique values
    int_cols_non_binary = [col for col in data.select_dtypes(include=['int']).columns 
                      if col not in binary_cols]
    for col in int_cols_non_binary:
        unique_vals = data[col].nunique()
        if unique_vals < 7:
            data[col] = data[col].astype('category')
        else:
            # Discretize into 4 equal-sized groups (adjust as needed)
            data[col] = pd.cut(data[col], bins=4, labels=False, duplicates='drop') #pd.cut does work for quantiles for us
            # Duplicates handling: The duplicates='drop' argument means that if the bin edges are not unique (which can happen if your data is very sparse or has repeated values), duplicate bin edges will be dropped, resulting in fewer bins than specified if necessary
            # Result: Each value in col is replaced by the bin number (as an integer) corresponding to its interval, or possibly fewer bins than 4 if duplicates are dropped.
    return data


In [294]:
# dataset has do be target labels and features together

# Load your training and test datasets
# train_data = pd.read_csv("data/train.csv")
# test_data = pd.read_csv("data/test.csv")




# train_data = move_targets_to_front_and_rename(data=train_data, target_label='y')
# train_data =make_data_binary(data=train_data)

# print(train_data)



features sollten am besten konsistent von 1 an nummeriert sein, das macht die spätere Zuordnung einfacher. Im Preprocessing wird das sowieso gemacht (Variable P im Code). So wäre also am Ende die Zuordnung um 1 verschoben.

Also eigentlich ist es kein Problem wenn man das weiß. Man muss nur wissen, dass das PCT preprocessing 1,2,3,... nummeriert. Dies korrespondiert eh dann zum Datensatz an dem in Spalte 0 die target labels stehen. Da sind die features dann auch Spalte 1,2,3,...

Ist aber eigentlich auch egal, weil one hot encoding eh die struktur zerstört

# Ausprobieren und zusätzliche Prozessierung von "load_breast_cancer" dataset
unsinnig für nur one hot encoding

In [295]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X, y = pd.DataFrame(data.data), pd.DataFrame(data.target)
y.rename(columns={0: 'y'}, inplace=True) #rename targets to 'y'

print(y)
#print(y['y'].unique())

X.columns += 1 #works only if column Names are 

data = pd.concat([X, y], axis=1)

print(data)

     y
0    0
1    0
2    0
3    0
4    0
..  ..
564  0
565  0
566  0
567  0
568  1

[569 rows x 1 columns]
         1      2       3       4        5        6        7        8       9  \
0    17.99  10.38  122.80  1001.0  0.11840  0.27760  0.30010  0.14710  0.2419   
1    20.57  17.77  132.90  1326.0  0.08474  0.07864  0.08690  0.07017  0.1812   
2    19.69  21.25  130.00  1203.0  0.10960  0.15990  0.19740  0.12790  0.2069   
3    11.42  20.38   77.58   386.1  0.14250  0.28390  0.24140  0.10520  0.2597   
4    20.29  14.34  135.10  1297.0  0.10030  0.13280  0.19800  0.10430  0.1809   
..     ...    ...     ...     ...      ...      ...      ...      ...     ...   
564  21.56  22.39  142.00  1479.0  0.11100  0.11590  0.24390  0.13890  0.1726   
565  20.13  28.25  131.20  1261.0  0.09780  0.10340  0.14400  0.09791  0.1752   
566  16.60  28.08  108.30   858.1  0.08455  0.10230  0.09251  0.05302  0.1590   
567  20.60  29.33  140.10  1265.0  0.11780  0.27700  0.35140  0.15200  0.2397   
5

In [296]:
data = move_targets_to_front_and_rename(data=data, target_label='y')

print(data)

     y      1      2       3       4        5        6        7        8  \
0    0  17.99  10.38  122.80  1001.0  0.11840  0.27760  0.30010  0.14710   
1    0  20.57  17.77  132.90  1326.0  0.08474  0.07864  0.08690  0.07017   
2    0  19.69  21.25  130.00  1203.0  0.10960  0.15990  0.19740  0.12790   
3    0  11.42  20.38   77.58   386.1  0.14250  0.28390  0.24140  0.10520   
4    0  20.29  14.34  135.10  1297.0  0.10030  0.13280  0.19800  0.10430   
..  ..    ...    ...     ...     ...      ...      ...      ...      ...   
564  0  21.56  22.39  142.00  1479.0  0.11100  0.11590  0.24390  0.13890   
565  0  20.13  28.25  131.20  1261.0  0.09780  0.10340  0.14400  0.09791   
566  0  16.60  28.08  108.30   858.1  0.08455  0.10230  0.09251  0.05302   
567  0  20.60  29.33  140.10  1265.0  0.11780  0.27700  0.35140  0.15200   
568  1   7.76  24.54   47.92   181.0  0.05263  0.04362  0.00000  0.00000   

          9  ...      21     22      23      24       25       26      27  \
0    0.241

In [297]:
data = make_data_binary(data)

print(data)

     y  1  2  3  4  5  6  7  8  9  ...  15331  15332  15333  15334  15335  \
0    1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
1    1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
2    1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
3    1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
4    1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
..  .. .. .. .. .. .. .. .. .. ..  ...    ...    ...    ...    ...    ...   
564  1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
565  1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
566  1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
567  1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
568  2  0  0  0  1  0  0  0  0  0  ...      0      0      0      0      0   

     15336  15337  15338  15339  15340  
0        0      0      0      0   

In [298]:
# from sklearn.model_selection import train_test_split

# # Suppose your DataFrame is called 'df'
# train, test = train_test_split(df, test_size=0.2, random_state=42)

train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

#print(train_data) #455 rows
print(test_data) #114 rows

     y  1  2  3  4  5  6  7  8  9  ...  15331  15332  15333  15334  15335  \
1    1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
8    1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
13   1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
14   1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
20   2  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
..  .. .. .. .. .. .. .. .. .. ..  ...    ...    ...    ...    ...    ...   
552  2  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
554  2  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
560  2  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
563  1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   
566  1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0      0   

     15336  15337  15338  15339  15340  
1        0      0      0      0   

In [299]:
# dataset has do be target labels and features together

# Load your training and test datasets
train_data = pd.read_csv("data/adult/adult.data", sep=',', skipinitialspace=True, header=None)
test_data = pd.read_csv("data/adult/adult.test", sep=',', skipinitialspace=True, header=None)

print(test_data)

       0             1       2             3   4                   5   \
0      25       Private  226802          11th   7       Never-married   
1      38       Private   89814       HS-grad   9  Married-civ-spouse   
2      28     Local-gov  336951    Assoc-acdm  12  Married-civ-spouse   
3      44       Private  160323  Some-college  10  Married-civ-spouse   
4      18             ?  103497  Some-college  10       Never-married   
...    ..           ...     ...           ...  ..                 ...   
16276  39       Private  215419     Bachelors  13            Divorced   
16277  64             ?  321403       HS-grad   9             Widowed   
16278  38       Private  374983     Bachelors  13  Married-civ-spouse   
16279  44       Private   83891     Bachelors  13            Divorced   
16280  35  Self-emp-inc  182148     Bachelors  13  Married-civ-spouse   

                      6               7                   8       9     10  \
0      Machine-op-inspct       Own-child     

In [300]:
test_data = preprocess_numerical(test_data)
print(test_data)

       0             1   2             3   4                   5   \
0       0       Private   0          11th   1       Never-married   
1       1       Private   0       HS-grad   2  Married-civ-spouse   
2       0     Local-gov   0    Assoc-acdm   2  Married-civ-spouse   
3       1       Private   0  Some-college   2  Married-civ-spouse   
4       0             ?   0  Some-college   2       Never-married   
...    ..           ...  ..           ...  ..                 ...   
16276   1       Private   0     Bachelors   3            Divorced   
16277   2             ?   0       HS-grad   2             Widowed   
16278   1       Private   0     Bachelors   3  Married-civ-spouse   
16279   1       Private   0     Bachelors   3            Divorced   
16280   0  Self-emp-inc   0     Bachelors   3  Married-civ-spouse   

                      6               7                   8       9   10  11  \
0      Machine-op-inspct       Own-child               Black    Male   0   0   
1        Fa

In [301]:
test_data = move_targets_to_front_and_rename(data= test_data , target_label=14)
print(test_data)

            y  0             1  2             3  4                   5  \
0      <=50K.  0       Private  0          11th  1       Never-married   
1      <=50K.  1       Private  0       HS-grad  2  Married-civ-spouse   
2       >50K.  0     Local-gov  0    Assoc-acdm  2  Married-civ-spouse   
3       >50K.  1       Private  0  Some-college  2  Married-civ-spouse   
4      <=50K.  0             ?  0  Some-college  2       Never-married   
...       ... ..           ... ..           ... ..                 ...   
16276  <=50K.  1       Private  0     Bachelors  3            Divorced   
16277  <=50K.  2             ?  0       HS-grad  2             Widowed   
16278  <=50K.  1       Private  0     Bachelors  3  Married-civ-spouse   
16279  <=50K.  1       Private  0     Bachelors  3            Divorced   
16280   >50K.  0  Self-emp-inc  0     Bachelors  3  Married-civ-spouse   

                       6               7                   8       9  10  11  \
0      Machine-op-inspct       

In [302]:
test_data = make_data_binary(test_data) # dass hier nur 124 Spalten rauskommen bedeutet, dass die Testdaten anders bootstrapped/preprocessed werden, als die Traningsdaten
print(test_data)



       y  1  2  3  4  5  6  7  8  9  ...  114  115  116  117  118  119  120  \
0      1  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
1      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
2      2  1  1  0  0  0  0  0  1  0  ...    0    0    0    0    0    0    0   
3      2  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
4      1  0  1  0  0  0  1  0  0  0  ...    0    0    0    0    0    0    0   
...   .. .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...  ...  ...   
16276  1  0  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16277  1  1  0  0  1  0  1  0  0  0  ...    0    0    0    0    0    0    0   
16278  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16279  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16280  2  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   

       121  122  123  
0        1    0    0  
1    

In [303]:
train_data = preprocess_numerical(train_data)
train_data = move_targets_to_front_and_rename(data= train_data , target_label=14)
train_data = make_data_binary(train_data)
print(train_data)

       y  1  2  3  4  5  6  7  8  9  ...  115  116  117  118  119  120  121  \
0      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
1      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
2      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
3      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
4      1  0  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
...   .. .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...  ...  ...   
32556  1  0  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32557  2  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32558  1  0  0  0  1  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32559  1  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32560  2  0  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   

       122  123  124  
0        1    0    0  
1    

In [304]:
# wenn man adult dataset training und testdaten unabhängig voneinander preprozessiert, dann sorgt das dafür, dass in den trainingsdaten eine spalte da ist,
# welche beim preprozessieren der testdaten nicht auftaucht
# => beide datensätze zusammenfügen, preprozessieren und dann die letzten 16281 rows (testdaten) wieder aus dem datensatz holen
# das sollte auch nichts ändern, da es als binär categorical erkannt wird, also 

# dataset has do be target labels and features together

# Load your training and test datasets
train_data = pd.read_csv("data/adult/adult.data", sep=',', skipinitialspace=True, header=None) #32561 rows
test_data = pd.read_csv("data/adult/adult.test", sep=',', skipinitialspace=True, header=None) #16281 rows

# Remove dots from the 'target' column
test_data[14] = test_data[14].astype(str).str.replace('.', '', regex=False)

len_train_data = len(train_data)
#print(len_train_data)
#len_test_data = len(test_data)


stacked = pd.concat([train_data, test_data ], ignore_index=False)
#print(stacked)


stacked = preprocess_numerical(stacked)
stacked = move_targets_to_front_and_rename(data= stacked, target_label=14)

unique_values = stacked ['y'].unique()
print(unique_values) # hier sieht man dann, dass es einen zusätzlichen punkt bei den target labels in adult.test gibt; also ohne löschen des punktes



stacked = make_data_binary(stacked)
print(stacked)




train_data_bin = stacked.iloc[:len_train_data] #[32561 rows x 125 columns]
test_data_bin = stacked.iloc[len_train_data:] #[16281 rows x 125 columns]








['<=50K' '>50K']
       y  1  2  3  4  5  6  7  8  9  ...  115  116  117  118  119  120  121  \
0      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
1      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
2      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
3      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
4      1  0  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
...   .. .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...  ...  ...   
16276  1  0  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16277  1  1  0  0  1  0  1  0  0  0  ...    0    0    0    0    0    0    0   
16278  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16279  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16280  2  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   

       122  123  124  
0        1 

In [305]:
print(train_data_bin)

       y  1  2  3  4  5  6  7  8  9  ...  115  116  117  118  119  120  121  \
0      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
1      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
2      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
3      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
4      1  0  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
...   .. .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...  ...  ...   
32556  1  0  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32557  2  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32558  1  0  0  0  1  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32559  1  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
32560  2  0  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   

       122  123  124  
0        1    0    0  
1    

In [306]:
print(test_data_bin)

       y  1  2  3  4  5  6  7  8  9  ...  115  116  117  118  119  120  121  \
0      1  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
1      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
2      2  1  1  0  0  0  0  0  1  0  ...    0    0    0    0    0    0    0   
3      2  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
4      1  0  1  0  0  0  1  0  0  0  ...    0    0    0    0    0    0    0   
...   .. .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...  ...  ...   
16276  1  0  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16277  1  1  0  0  1  0  1  0  0  0  ...    0    0    0    0    0    0    0   
16278  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16279  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16280  2  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   

       122  123  124  
0        1    0    0  
1    

In [307]:
train_data=train_data_bin
test_data=test_data_bin

In [308]:
target_label = "y"
depth_rolling_tree = 3
criterion_loss = "gini"

In [309]:
# get features
feature_columns = train_data.columns[1:] #assuming labels are in first column, ensured trough move_targets_to_front_and_rename()
print(feature_columns)

Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       115, 116, 117, 118, 119, 120, 121, 122, 123, 124],
      dtype='object', length=124)


In [None]:
# solving with pulp

start_time_pulp = time.time()

# Run the classifier using pulp
result_dict_pulp, result_df_test_data_pulp, result_df_training_data_pulp = rollo_oct_pulp.run(
                                                                train=train_data,
                                                                test=test_data,
                                                                target_label="y",
                                                                features=feature_columns,
                                                                depth=depth_rolling_tree,
                                                                criterion=criterion_loss
)
end_time_pulp = time.time()
print(f"Pulp execution time for depth {depth_rolling_tree} : {end_time_pulp - start_time_pulp} seconds")

SyntaxError: incomplete input (600210870.py, line 3)

In [None]:
print(result_dict_pulp)

{3: {'training_accuracy': 0.8215349651423483, 'test_accuracy': 0.0, 'time': 172.7234263420105}, 2: {'training_accuracy': 0.8193544424311293, 'test_accuracy': 0.0, 'time': 342.99163269996643}}


In [None]:
print(result_df_test_data_pulp)

       y  prediction  leaf
0      2           1    13
1      2           1     9
2      4           1     8
3      4           1     9
4      2           1    13
...   ..         ...   ...
16276  2           1    12
16277  2           1    13
16278  2           3    10
16279  2           1    12
16280  4           3    10

[16281 rows x 3 columns]


In [None]:
print(result_df_training_data_pulp)

       y  prediction  leaf
0      1           1    12
1      1           3    10
2      1           1    13
3      1           1    11
4      1           3    10
...   ..         ...   ...
32556  1           1     8
32557  3           1     9
32558  1           1    13
32559  1           1    13
32560  3           1     9

[32561 rows x 3 columns]
