In [1]:
#TURBINE PERFORMANCE

import pandas as pd

#Lift pandas' display limits so every row and every column is rendered
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

#Location of the merged dataset (CSV file)
file_path = '/Users/katerinadenyakina/Desktop/Individual Project/datasets/df_merged.csv'
df_merged = pd.read_csv(file_path)

#Per-column overview: name, dtype, number of non-null values, number of distinct values
column_summary = {
    "columns": df_merged.columns,
    "data Type": df_merged.dtypes.to_numpy(),
    "Total Count": df_merged.count().to_numpy(),
    "Unique Count": df_merged.nunique().to_numpy(),
}
df_mergedColumnAnalysis = pd.DataFrame(column_summary)

#Last expression of the cell, so the notebook renders the summary table
df_mergedColumnAnalysis

Unnamed: 0,columns,data Type,Total Count,Unique Count
0,Turbine_ID,float64,125277,4
1,Timestamp,object,125277,33461
2,Gen_RPM_Max,float64,125277,9368
3,Gen_RPM_Min,float64,125277,8734
4,Gen_RPM_Avg,float64,125277,12981
5,Gen_RPM_Std,float64,125277,4707
6,Gen_Bear_Temp_Avg,float64,125277,76
7,Gen_Phase1_Temp_Avg,float64,125277,127
8,Gen_Phase2_Temp_Avg,float64,125277,120
9,Gen_Phase3_Temp_Avg,float64,125277,118


In [2]:
#Parse the 'Timestamp' column into pandas datetimes
df_merged['Timestamp'] = pd.to_datetime(df_merged['Timestamp'])

#Split the timestamp into separate numeric columns (one per calendar/clock component)
#so the time information can be used as ML features
timestamp_parts = df_merged['Timestamp'].dt
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df_merged[part] = getattr(timestamp_parts, part)

In [4]:
#Remove the 'Timestamp' column from df_merged in place
df_merged.drop(columns=['Timestamp'], inplace=True)

In [5]:
from itertools import product

from sklearn.model_selection import train_test_split, learning_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt


#Target variables for prediction
target_columns = ['Gen_RPM_Max', 'Gen_RPM_Min', 'Gen_RPM_Avg','Rtr_RPM_Max', 'Rtr_RPM_Min', 'Rtr_RPM_Avg']

#Every column that is not a target is used as a model input
features = df_merged.drop(columns=target_columns)

targets = df_merged[target_columns]

#Training and testing sets: 20% of the rows are held out, the seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

#Hyperparameters to tune
n_estimators = [10, 50, 100, 200]
max_depth = [None, 10, 20, 30]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

#Lists to store performance metrics (one entry per hyperparameter combination, in loop order)
train_errors = []
validation_errors = []
train_r2_scores = []
validation_r2_scores = []

#NOTE(review): the held-out split (X_test / y_test) is used below to compare hyperparameter
#settings, so the "validation" numbers are optimistic as an estimate of performance on
#unseen data. Consider a separate validation split or cross-validation for model selection.

#Hyperparameter tuning loop.
#product() varies its last argument fastest, i.e. it visits the combinations in exactly the
#order of four nested for-loops. The reshape further down relies on this: all results for
#one n_estimators value are stored contiguously.
for estimator, depth, split, leaf in product(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    #Initialise the RandomForest Regressor with the hyperparameters.
    #n_jobs=-1 builds the trees on all available cores; with a fixed random_state the
    #fitted forest does not depend on the number of jobs, so only the runtime changes.
    rf_regressor = RandomForestRegressor(n_estimators=estimator,
                                         max_depth=depth,
                                         min_samples_split=split,
                                         min_samples_leaf=leaf,
                                         random_state=42,
                                         n_jobs=-1)

    rf_regressor.fit(X_train, y_train)

    train_predictions = rf_regressor.predict(X_train)
    val_predictions = rf_regressor.predict(X_test)

    #Calculate MSE for both train and validation sets
    #(multi-output: scikit-learn's default averages the score uniformly over the targets)
    train_mse = mean_squared_error(y_train, train_predictions)
    val_mse = mean_squared_error(y_test, val_predictions)

    #Calculate R-squared for both train and validation sets
    train_r2 = r2_score(y_train, train_predictions)
    val_r2 = r2_score(y_test, val_predictions)

    #Record the metrics for this combination
    train_errors.append(train_mse)
    validation_errors.append(val_mse)
    train_r2_scores.append(train_r2)
    validation_r2_scores.append(val_r2)

    #Output current hyperparameters and their corresponding MSE and R-squared
    print(f"Estimators: {estimator}, Max Depth: {depth}, Min Samples Split: {split}, Min Samples Leaf: {leaf}")
    print(f"Train MSE: {train_mse}, Validation MSE: {val_mse}")
    print(f"Train R2: {train_r2}, Validation R2: {val_r2}\n")

#Plot results

#Convert to numpy arrays and reshape into matrices with one row per n_estimators value;
#the columns are the remaining hyperparameter combinations for that value
train_errors = np.array(train_errors).reshape(len(n_estimators), -1)
validation_errors = np.array(validation_errors).reshape(len(n_estimators), -1)
train_r2_scores = np.array(train_r2_scores).reshape(len(n_estimators), -1)
validation_r2_scores = np.array(validation_r2_scores).reshape(len(n_estimators), -1)

#Use the mean performance at each level of n_estimators
mean_train_errors = train_errors.mean(axis=1)
mean_validation_errors = validation_errors.mean(axis=1)
mean_train_r2_scores = train_r2_scores.mean(axis=1)
mean_validation_r2_scores = validation_r2_scores.mean(axis=1)


def _plot_metric_vs_trees(train_values, validation_values, train_label, validation_label, ylabel):
    """Plot a training and a validation metric against the n_estimators grid.

    Parameters:
        train_values: metric on the training split, one value per entry of n_estimators.
        validation_values: metric on the held-out split, one value per entry of n_estimators.
        train_label: legend label for the training line.
        validation_label: legend label for the validation line.
        ylabel: label of the y axis.
    """
    #NOTE(review): the x axis is the number of trees, not the training-set size, so this is
    #a curve over a hyperparameter rather than a learning curve in the usual sense.
    plt.figure(figsize=(12, 6))
    plt.plot(n_estimators, train_values, label=train_label)
    plt.plot(n_estimators, validation_values, label=validation_label)
    plt.xlabel('Number of Trees')
    plt.ylabel(ylabel)
    plt.title('Learning Curve for RandomForestRegressor')
    plt.legend()
    plt.show()


#Create the learning curve figure for MSE
_plot_metric_vs_trees(mean_train_errors, mean_validation_errors,
                      'Training error', 'Validation error', 'MSE')

#Create the learning curve figure for R-Squared
_plot_metric_vs_trees(mean_train_r2_scores, mean_validation_r2_scores,
                      'Training R2 Score', 'Validation R2 Score', 'R2 Score')


Estimators: 10, Max Depth: None, Min Samples Split: 2, Min Samples Leaf: 1
Train MSE: 33.52812351287016, Validation MSE: 178.00547723698898
Train R2: 0.9997891701878711, Validation R2: 0.9988877507758946

Estimators: 10, Max Depth: None, Min Samples Split: 2, Min Samples Leaf: 2
Train MSE: 46.350945565929294, Validation MSE: 175.20996340621045
Train R2: 0.9996880565575891, Validation R2: 0.9989038995979332

Estimators: 10, Max Depth: None, Min Samples Split: 2, Min Samples Leaf: 4
Train MSE: 71.09305296281049, Validation MSE: 176.1408315134082
Train R2: 0.9995214565118785, Validation R2: 0.9988993081630687

Estimators: 10, Max Depth: None, Min Samples Split: 5, Min Samples Leaf: 1
Train MSE: 43.991814604791905, Validation MSE: 177.00712952903487
Train R2: 0.9997003489259114, Validation R2: 0.9988954159448206

Estimators: 10, Max Depth: None, Min Samples Split: 5, Min Samples Leaf: 2
Train MSE: 48.922841971439645, Validation MSE: 174.76940353888122
Train R2: 0.9996682092634886, Validati

Estimators: 50, Max Depth: None, Min Samples Split: 5, Min Samples Leaf: 4
Train MSE: 62.96727028819965, Validation MSE: 165.94961493443438
Train R2: 0.9995710864728653, Validation R2: 0.9989649139684015

Estimators: 50, Max Depth: None, Min Samples Split: 10, Min Samples Leaf: 1
Train MSE: 52.66883735596793, Validation MSE: 162.7840523077539
Train R2: 0.9996355472483112, Validation R2: 0.9989871655167814

Estimators: 50, Max Depth: None, Min Samples Split: 10, Min Samples Leaf: 2
Train MSE: 57.3773130852637, Validation MSE: 162.93221439093682
Train R2: 0.9996054826867898, Validation R2: 0.9989841593024501

Estimators: 50, Max Depth: None, Min Samples Split: 10, Min Samples Leaf: 4
Train MSE: 68.86273440877028, Validation MSE: 166.60124299372777
Train R2: 0.9995328398810277, Validation R2: 0.9989608066075707

Estimators: 50, Max Depth: 10, Min Samples Split: 2, Min Samples Leaf: 1
Train MSE: 161.2258376101265, Validation MSE: 204.52148317296738
Train R2: 0.99896949422754, Validation R2

Estimators: 100, Max Depth: 10, Min Samples Split: 2, Min Samples Leaf: 2
Train MSE: 161.98239100884723, Validation MSE: 202.6380000045135
Train R2: 0.9989668054926505, Validation R2: 0.9987324541204673

Estimators: 100, Max Depth: 10, Min Samples Split: 2, Min Samples Leaf: 4
Train MSE: 167.51251095696657, Validation MSE: 205.17796474160738
Train R2: 0.998934773610209, Validation R2: 0.99871786324464

Estimators: 100, Max Depth: 10, Min Samples Split: 5, Min Samples Leaf: 1
Train MSE: 161.41545081643997, Validation MSE: 202.89918952427902
Train R2: 0.9989698251755019, Validation R2: 0.9987311326966811

Estimators: 100, Max Depth: 10, Min Samples Split: 5, Min Samples Leaf: 2
Train MSE: 162.62113997911175, Validation MSE: 202.58675484118166
Train R2: 0.9989629963146198, Validation R2: 0.9987327717525297

Estimators: 100, Max Depth: 10, Min Samples Split: 5, Min Samples Leaf: 4
Train MSE: 167.51251095696657, Validation MSE: 205.17796474160738
Train R2: 0.998934773610209, Validation R2: 

KeyboardInterrupt: 

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold

#Model outputs are the target columns; model inputs are all remaining columns
targets = df_merged[target_columns]

features = df_merged.drop(columns=target_columns)

#Shuffled 5-fold splitter with a fixed seed, reused for every target
kf = KFold(n_splits=5, shuffle=True, random_state=42)

#Random forest with 100 trees and a fixed seed
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

#Maps each target column name to its cross-validation summary
cv_results = {}

#Cross-validate one target column at a time, scoring each fold with R2
for target in target_columns:
    scores = cross_val_score(rf_regressor, features, targets[target], cv=kf, scoring='r2', n_jobs=-1)

    #Summarise the per-fold scores by their mean and standard deviation
    cv_results[target] = {
        'Mean R2 Score': scores.mean(),
        'Standard Deviation R2 Score': scores.std()
    }

#Report the summary for every target, rounded to four decimals
for target in cv_results:
    result = cv_results[target]
    print(f"Results for {target}:")
    print(f"Mean R2 Score: {result['Mean R2 Score']:.4f}")
    print(f"Standard Deviation R2 Score: {result['Standard Deviation R2 Score']:.4f}\n")


Results for Gen_RPM_Max:
Mean R2 Score: 0.9989
Standard Deviation R2 Score: 0.0001

Results for Gen_RPM_Min:
Mean R2 Score: 0.9989
Standard Deviation R2 Score: 0.0000

Results for Gen_RPM_Avg:
Mean R2 Score: 0.9996
Standard Deviation R2 Score: 0.0000

Results for Rtr_RPM_Max:
Mean R2 Score: 0.9990
Standard Deviation R2 Score: 0.0000

Results for Rtr_RPM_Min:
Mean R2 Score: 0.9985
Standard Deviation R2 Score: 0.0000

Results for Rtr_RPM_Avg:
Mean R2 Score: 0.9994
Standard Deviation R2 Score: 0.0000

