In [48]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import pandas as pd
import json

# Load the raw factory-process export. NOTE(review): the path is absolute and
# machine-specific, so this cell only runs where that file exists.
df = pd.read_csv(
    r"D:\dtc-dr\data-analyse\continuous_factory_process.csv", delimiter=","
)

# Column-name prefixes that mark candidate input columns.
prefixes_to_match = ["Machine1", "Machine2", "Machine3", "time_stamp"]

# Positions of the retained features, counted within the prefix-matched
# columns *after* "time_stamp" has been removed from that list.
best_features = [2, 8, 10, 11, 12, 13, 14, 15, 19, 21, 22, 24, 25, 26, 27, 32, 34]

# str.startswith accepts a tuple, so a single call tests every prefix.
machine_columns = [
    name for name in df.columns if name.startswith(tuple(prefixes_to_match))
]

# "time_stamp" matches a prefix but is not used as a model input.
# list.remove raises ValueError if the column is missing.
machine_columns.remove("time_stamp")

# Keep only the selected positions, in the order given by best_features.
filtered_columns = [machine_columns[position] for position in best_features]

# Feature matrix.
X = df[filtered_columns]

# Candidate target columns. Only Measurement14 is modelled below; this list is
# not referenced by the code visible in this section.
y_columns = [
    "Stage1.Output.Measurement0.U.Actual",
    "Stage1.Output.Measurement1.U.Actual",
    "Stage1.Output.Measurement2.U.Actual",
    "Stage1.Output.Measurement3.U.Actual",
    "Stage1.Output.Measurement4.U.Actual",
    "Stage1.Output.Measurement5.U.Actual",
    "Stage1.Output.Measurement6.U.Actual",
    "Stage1.Output.Measurement7.U.Actual",
    "Stage1.Output.Measurement8.U.Actual",
    "Stage1.Output.Measurement9.U.Actual",
    "Stage1.Output.Measurement10.U.Actual",
    "Stage1.Output.Measurement11.U.Actual",
    "Stage1.Output.Measurement12.U.Actual",
    "Stage1.Output.Measurement13.U.Actual",
    "Stage1.Output.Measurement14.U.Actual",
    "FirstStage.CombinerOperation.Temperature1.U.Actual",
    "FirstStage.CombinerOperation.Temperature2.U.Actual",
    "FirstStage.CombinerOperation.Temperature3.C.Actual",
]

# Regression target.
y = df["Stage1.Output.Measurement14.U.Actual"]

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


# Seed NumPy's global RNG; the DecisionTreeRegressor below is created without
# its own random_state.
np.random.seed(42)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Hyperparameter grid: 4 x 3 x 3 = 36 combinations.
# max_features and criterion were considered earlier but are not searched.
param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with GridSearchCV's default cv and scoring settings.
dt_model = DecisionTreeRegressor()
gcv = GridSearchCV(dt_model, param_grid)
gcv.fit(X_train, y_train)

best_params, best_model = gcv.best_params_, gcv.best_estimator_
print("Best Parameters:", best_params)

# --- Training-set metrics for the selected estimator ---
y_train_pred = best_model.predict(X_train)
train_r2_score = best_model.score(X_train, y_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

print(f"Training set - R-squared: {train_r2_score}")
print(f"Training set - Mean Squared Error: {train_mse}")
print(f"Training set - Root Mean Squared Error: {train_rmse}")

# --- Held-out test-set metrics for the same estimator ---
y_test_pred = best_model.predict(X_test)
test_r2_score = best_model.score(X_test, y_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

print(f"\nTest set - R-squared: {test_r2_score}")
print(f"Test set - Mean Squared Error: {test_mse}")
print(f"Test set - Root Mean Squared Error: {test_rmse}")

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Training set - R-squared: 0.5885713594805201
Training set - Mean Squared Error: 22.629495322704432
Training set - Root Mean Squared Error: 4.757046911972219

Test set - R-squared: 0.46371122838519985
Test set - Mean Squared Error: 29.492715591913353
Test set - Root Mean Squared Error: 5.430719620079217


In [49]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate

# Seed the global RNG (dt_reg has no random_state) and rebuild the 80/20 split.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): the recorded grid-search output above reports max_depth=10 and
# min_samples_split=10 as best; the values here differ — confirm this is intended.
dt_reg = DecisionTreeRegressor(max_depth=20, min_samples_leaf=4, min_samples_split=5)

# Both metrics are collected in ONE cross-validation pass per split. Calling
# cross_val_score once per metric refits every fold twice, and because dt_reg
# has no random_state the R-squared and MSE arrays could then come from
# different fitted trees.
cv_scoring = ("r2", "neg_mean_squared_error")

# 5-fold, shuffled, seeded cross-validation on the training split.
cv_train = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results_train = cross_validate(dt_reg, X_train, y_train, cv=cv_train, scoring=cv_scoring)
r2_scores_train = cv_results_train["test_r2"]
# The scorer returns negated MSE (higher is better); flip the sign back.
mse_scores_train = -cv_results_train["test_neg_mean_squared_error"]
rmse_scores_train = np.sqrt(mse_scores_train)

# Print the cross-validation results on the training set
print(f'Training set - R-squared scores: {r2_scores_train}')
print(f'Training set - Mean R-squared: {np.mean(r2_scores_train)}')
print(f'Training set - Mean Squared Error scores: {mse_scores_train}')
print(f'Training set - Mean MSE: {np.mean(mse_scores_train)}')
# Mean of the per-fold RMSE values (not the square root of the mean MSE).
print(f'Training set - RMSE: {np.mean(rmse_scores_train)}')

# Same 5-fold strategy applied to the held-out split.
# NOTE(review): this refits dt_reg on folds of X_test, so the numbers describe
# models trained on held-out data, not a model trained on X_train and scored on
# X_test — confirm this is the intended evaluation.
cv_test = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results_test = cross_validate(dt_reg, X_test, y_test, cv=cv_test, scoring=cv_scoring)
r2_scores_test = cv_results_test["test_r2"]
mse_scores_test = -cv_results_test["test_neg_mean_squared_error"]
rmse_scores_test = np.sqrt(mse_scores_test)

# Print the cross-validation results on the test set
print(f'Test set - R-squared scores: {r2_scores_test}')
print(f'Test set - Mean R-squared: {np.mean(r2_scores_test)}')
print(f'Test set - Mean Squared Error scores: {mse_scores_test}')
print(f'Test set - Mean MSE: {np.mean(mse_scores_test)}')
print(f'Test set - RMSE: {np.mean(rmse_scores_test)}')

# `rmse` previously held the train array and was then overwritten with the test
# array; keep the name bound to its former final value for any later reader.
rmse = rmse_scores_test

Training set - R-squared scores: [0.35698523 0.30331615 0.2947421  0.35014031 0.31160588]
Training set - Mean R-squared: 0.3233579322203418
Training set - Mean Squared Error scores: [35.59735843 38.45801833 38.47671782 35.4389035  37.79842024]
Training set - Mean MSE: 37.15388366363044
Training set - RMSE: 6.094373166448103
Test set - R-squared scores: [0.21637966 0.27370096 0.30409056 0.31761198 0.18339668]
Test set - Mean R-squared: 0.25903596941331797
Test set - Mean Squared Error scores: [43.04382393 40.9286849  36.86202954 38.68964793 43.88902044]
Test set - Mean MSE: 40.68264134910591
Test set - RMSE: 6.374944461663224


In [50]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
import numpy as np

from sklearn.metrics import r2_score

# Re-seed and rebuild the 80/20 split with the same arguments as the cells above.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit one tree on the full training split. dt_fit is whatever dt.fit returns
# (scikit-learn estimators return themselves from fit).
dt = DecisionTreeRegressor(min_samples_split=5, min_samples_leaf=4, max_depth=20)
dt_fit = dt.fit(X_train, y_train)

# 5-fold cross-validation on the training split using the estimator's default scorer.
dt_scores = cross_val_score(dt_fit, X_train, y_train, cv=5)
dt_cv_mean = dt_scores.mean()
dt_train_score = dt_fit.score(X_train, y_train)
print("mean cross validation score: {}".format(dt_cv_mean))
print("score without cv: {}".format(dt_train_score))

# Held-out R-squared computed two ways: explicitly from the predictions and via
# the estimator's own score method.
dt_test_pred = dt_fit.predict(X_test)
print(r2_score(y_test, dt_test_pred))
print(dt_fit.score(X_test, y_test))




mean cross validation score: 0.3200035548139752
score without cv: 0.7542236610649029
0.34341309057117064
0.34341309057117064
