In [16]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Purpose of this cell: predict one Stage 1 output measurement from the
# Machine1-3 columns, using univariate feature selection (SelectKBest with
# f_regression) followed by an ordinary least-squares linear regression.

# Load the raw process log. The path is absolute and machine-specific.
df = pd.read_csv(
    r"D:\dtc-dr\data-analyse\continuous_factory_process.csv", delimiter=","
)
df = df.drop("time_stamp", axis=1)

# Keep only the columns whose name starts with one of these prefixes.
# NOTE: the "time_stamp" column itself was dropped just above, so that prefix
# can only match other columns that happen to begin with it (if any exist).
prefixes_to_match = ["Machine1", "Machine2", "Machine3", "time_stamp"]
filtered_columns = [
    col
    for col in df.columns
    if any(col.startswith(prefix) for prefix in prefixes_to_match)
]
X = df[filtered_columns]

# The target is kept as a one-column DataFrame so `y.columns` can be iterated
# when fitting and predicting below.
y = df[
    [
        "Stage1.Output.Measurement2.U.Actual",
    ]
]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Specify the number of features to select (k)
k_best_features = 5

# Create the SelectKBest object with the f_regression score function
selector = SelectKBest(score_func=f_regression, k=k_best_features)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Fit the selector on the training data and transform both splits.
# The target is flattened to 1-D here: handing the selector the one-column
# DataFrame makes scikit-learn coerce it via column_or_1d and emit a
# DataConversionWarning.
X_train_selected = selector.fit_transform(X_train_std, y_train.values.ravel())
X_test_selected = selector.transform(X_test_std)

# 2-D (n_samples, 1) array views of the targets. They are not used anywhere
# below in this cell; the names are kept in case later cells read them
# (TODO confirm, otherwise they can be deleted).
y_train_reshaped = y_train.values.reshape(-1, 1)
y_test_reshaped = y_test.values.reshape(-1, 1)

# Create one linear regression model per target column
models = []
for col in y.columns:
    model = LinearRegression()
    model.fit(X_train_selected, y_train[col])
    models.append(model)

# Make predictions on the testing set. The frame is built on y_test's index so
# that any label-aligned comparison between y_test and y_pred lines up row for
# row; train_test_split shuffles, so y_test does not carry a 0..n-1 index.
y_pred = pd.DataFrame(
    {
        col: model.predict(X_test_selected).reshape(-1)
        for col, model in zip(y.columns, models)
    },
    index=y_test.index,
)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.9336073501106869
R-squared: 0.057133771853288606


  y = column_or_1d(y, warn=True)
