In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import pandas as pd
import json
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt


In [2]:
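# Commented-out list of feature indices; no cell shown in this notebook uses it.
# NOTE: index 2 is absent from the list.
# best_features_indexed = [
#     0, 1, 3, 4, 5, 6, 7, 8,
#     9, 10, 11, 12, 13, 14,
#     15, 16, 17, 18, 19, 20,
#     21, 22, 23, 24, 25, 26,
#     27, 28, 29, 30, 31, 32,
#     33, 34, 35
# ]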

In [3]:
# Load the comma-separated process data into `df`.
# NOTE(review): the location is an absolute path on one machine; anyone else
# running this notebook has to edit it before this cell will work.
df = pd.read_csv(
    filepath_or_buffer=r"C:/Users/ennio/PycharmProjects\dtc-dr/models/feature-selection/continuous_factory_process.csv",
    sep=",",
)

In [4]:
# Column-name prefixes that decide which columns of `df` are kept.
prefixes_to_match = ["Machine1", "Machine2", "Machine3", "time_stamp"]

# str.startswith accepts a tuple of prefixes and is true if any of them matches,
# so a single call per column replaces an explicit any(...) over the prefixes.
filtered_columns = [
    col for col in df.columns if col.startswith(tuple(prefixes_to_match))
]

In [5]:
# Feature matrix: the prefix-filtered columns minus the time stamp column.
X = df[filtered_columns].drop(columns="time_stamp")

# Target columns: the fifteen Stage1 output measurements (Measurement0..14)
# followed by the three FirstStage combiner temperature columns.
y_columns = [
    *(f"Stage1.Output.Measurement{i}.U.Actual" for i in range(15)),
    "FirstStage.CombinerOperation.Temperature1.U.Actual",
    "FirstStage.CombinerOperation.Temperature2.U.Actual",
    "FirstStage.CombinerOperation.Temperature3.C.Actual",
]

# Target matrix, one column per entry of y_columns (same order).
y = df[y_columns]

In [6]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# No `stratify` argument: `y` holds multi-column regression targets, and
# stratification is only defined for class labels. Passing `y` makes
# scikit-learn treat every distinct target row as its own class and raise a
# ValueError as soon as a "class" has fewer than two members.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# Hyper-parameter grid for the grid search: 4 * 3 * 3 = 36 combinations.
# "max_features" and "criterion" are intentionally not part of the grid,
# so the estimator's defaults are used for them.
param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [15]:
def calculate_best_regression_model():
    """Sweep the number of selected features for every target column.

    For each column of the notebook-level ``y`` and each ``k`` from 1 to the
    number of feature columns, the ``k`` best features according to
    ``f_regression`` are selected and a ``RandomForestRegressor`` is tuned with
    a 3-fold ``GridSearchCV`` over the notebook-level ``param_grid``.

    The selector and the forest are wrapped in a single ``Pipeline`` so that
    feature selection is re-fitted inside every cross-validation fold. Fitting
    the selector on the whole training set before the grid search would let
    the validation folds influence which features are chosen (data leakage)
    and bias the hyper-parameter choice. The final refit still selects
    features on the full training set, so the reported feature indices are the
    ones chosen from ``X_train``.

    Reads the notebook-level names ``y``, ``X_train``, ``X_test``,
    ``y_train``, ``y_test`` and ``param_grid``; prints progress to stdout.

    Returns
    -------
    dict
        Maps each target column name to a dict with three parallel lists, one
        entry per ``k``:

        * ``"r2_scores_train"`` - R^2 of the tuned model on the training set
        * ``"r2_scores_test"`` - R^2 of the tuned model on the test set
        * ``"selected_feature_indices"`` - positional indices (into the
          feature columns) of the ``k`` selected features
    """
    # Local import so this cell does not depend on edits to the import cell.
    from sklearn.pipeline import Pipeline

    results_dict = {}
    total_columns = len(y.columns)
    n_features = X_train.shape[1]

    # GridSearchCV addresses parameters of a pipeline step as "<step>__<param>".
    pipeline_grid = {f"model__{name}": values for name, values in param_grid.items()}

    for col_index, col in enumerate(y.columns, start=1):
        col_results = {"r2_scores_train": [], "r2_scores_test": [], "selected_feature_indices": []}
        print(f"Processing column {col_index}/{total_columns}: {col}")

        target_train = y_train[col]
        target_test = y_test[col]

        for index in range(1, n_features + 1):
            print(f"\t-Processing index: {index}/{n_features}")

            # Top-`index` features by f_regression score, followed by a random
            # forest regressor; both are fitted together inside each CV fold.
            pipeline = Pipeline(
                steps=[
                    ("select", SelectKBest(score_func=f_regression, k=index)),
                    ("model", RandomForestRegressor(random_state=42, n_jobs=-1)),
                ]
            )
            gcv = GridSearchCV(estimator=pipeline, param_grid=pipeline_grid, cv=3, n_jobs=-1)
            gcv.fit(X_train, target_train)

            # Predict with the best pipeline (refitted on the full training set)
            y_train_pred = gcv.predict(X_train)
            y_test_pred = gcv.predict(X_test)

            # Evaluate the model using R-squared for both sets
            col_results["r2_scores_train"].append(r2_score(target_train, y_train_pred))
            col_results["r2_scores_test"].append(r2_score(target_test, y_test_pred))

            # Record which feature positions the refitted selector kept
            selector = gcv.best_estimator_.named_steps["select"]
            selected_feature_indices = np.where(selector.get_support())[0]
            col_results["selected_feature_indices"].append(selected_feature_indices.tolist())

        results_dict[col] = col_results
    return results_dict

# Run the sweep: one grid search per (target column, number of selected features)
# pair. The nested result dict is kept in `results` and written to JSON further down.
results = calculate_best_regression_model()

Processing column 1/18: Stage1.Output.Measurement0.U.Actual
	-Processing index: 1/36
	-Processing index: 2/36
	-Processing index: 3/36
	-Processing index: 4/36
	-Processing index: 5/36
	-Processing index: 6/36
	-Processing index: 7/36
	-Processing index: 8/36
	-Processing index: 9/36
	-Processing index: 10/36
	-Processing index: 11/36
	-Processing index: 12/36
	-Processing index: 13/36
	-Processing index: 14/36
	-Processing index: 15/36
	-Processing index: 16/36
	-Processing index: 17/36
	-Processing index: 18/36
	-Processing index: 19/36
	-Processing index: 20/36
	-Processing index: 21/36
	-Processing index: 22/36
	-Processing index: 23/36
	-Processing index: 24/36
	-Processing index: 25/36
	-Processing index: 26/36
	-Processing index: 27/36
	-Processing index: 28/36
	-Processing index: 29/36
	-Processing index: 30/36
	-Processing index: 31/36
	-Processing index: 32/36
	-Processing index: 33/36
	-Processing index: 34/36
	-Processing index: 35/36
	-Processing index: 36/36
Processing co

In [16]:
# Save the results to a JSON file. The file name is kept in one variable so the
# confirmation message always reports the file that was actually written
# (the message previously named a different file).
results_path = 'Random_Forest_tree_results.json'
with open(results_path, 'w') as json_file:
    json.dump(results, json_file, indent=4)

print(f"Results saved to {results_path}")

Results saved to test_decision_tree_results.json
