In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
%store -r df_train_3
%store -r df_test_3
%store -r df_RUL_3

In [3]:
df_train_3.head()

Unnamed: 0,ID,Cycle,operarional_setting_1,operarional_setting_2,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_11,sensor_measurement_12,sensor_measurement_13,sensor_measurement_15,sensor_measurement_17,sensor_measurement_20,sensor_measurement_21,RUL
0,1,1,-0.0005,0.0004,642.36,1583.23,1396.84,553.97,2387.96,9062.17,47.3,522.31,2388.01,8.4246,391,39.11,23.3537,258
1,1,2,0.0008,-0.0003,642.5,1584.69,1396.89,554.55,2388.0,9061.78,47.23,522.42,2388.03,8.4403,392,38.99,23.4491,257
2,1,3,-0.0014,-0.0002,642.18,1582.35,1405.61,554.43,2388.03,9070.23,47.22,522.03,2388.0,8.3901,391,38.85,23.3669,256
3,1,4,-0.002,0.0001,642.92,1585.61,1392.27,555.21,2388.0,9064.57,47.24,522.49,2388.08,8.3878,392,38.96,23.2951,255
4,1,5,0.0016,0.0,641.68,1588.63,1397.65,554.74,2388.04,9076.14,47.15,522.58,2388.03,8.3869,392,39.14,23.4583,254


In [4]:
df_test_3.head()

Unnamed: 0,ID,Cycle,operarional_setting_1,operarional_setting_2,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_11,sensor_measurement_12,sensor_measurement_13,sensor_measurement_15,sensor_measurement_17,sensor_measurement_20,sensor_measurement_21
0,1,1,-0.0017,-0.0004,641.94,1581.93,1396.93,554.56,2387.93,9048.65,47.09,521.89,2387.94,8.376,391,39.07,23.4468
1,1,2,0.0006,-0.0002,642.02,1584.86,1398.9,554.1,2387.94,9046.53,47.08,521.85,2388.01,8.4062,391,39.04,23.4807
2,1,3,0.0014,-0.0003,641.68,1581.78,1391.92,554.41,2387.97,9054.92,47.15,522.1,2387.94,8.3553,391,39.1,23.4244
3,1,4,0.0027,0.0001,642.2,1584.53,1395.34,554.58,2387.94,9055.04,47.26,522.45,2387.96,8.3709,392,38.97,23.4782
4,1,5,-0.0001,0.0001,642.46,1589.03,1395.86,554.16,2388.01,9048.59,46.94,521.91,2387.97,8.4146,391,39.09,23.395


In [5]:
df_RUL_3.head()

Unnamed: 0,RUL
0,44
1,51
2,27
3,120
4,101


# RandomForestRegressor

In [7]:
labels_to_drop = ["ID","Cycle"]
X_train3 = df_train_3.drop(labels_to_drop+["RUL"], axis=1)
y_train3 = df_train_3["RUL"]

X_test3 = df_test_3.groupby("ID").last().reset_index().drop(labels_to_drop, axis=1)
y_test3 = df_RUL_3
print("X_train columns: ", X_train3.columns, "\n",
     "X_test columns: ", X_test3.columns, "\n",
     y_train3.shape)

X_train columns:  Index(['operarional_setting_1', 'operarional_setting_2', 'sensor_measurement_2', 'sensor_measurement_3', 'sensor_measurement_4', 'sensor_measurement_7', 'sensor_measurement_8', 'sensor_measurement_9', 'sensor_measurement_11', 'sensor_measurement_12', 'sensor_measurement_13', 'sensor_measurement_15', 'sensor_measurement_17', 'sensor_measurement_20', 'sensor_measurement_21'], dtype='object') 
 X_test columns:  Index(['operarional_setting_1', 'operarional_setting_2', 'sensor_measurement_2', 'sensor_measurement_3', 'sensor_measurement_4', 'sensor_measurement_7', 'sensor_measurement_8', 'sensor_measurement_9', 'sensor_measurement_11', 'sensor_measurement_12', 'sensor_measurement_13', 'sensor_measurement_15', 'sensor_measurement_17', 'sensor_measurement_20', 'sensor_measurement_21'], dtype='object') 
 (24720,)


In [6]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth}
print(random_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]}


In [8]:
regr = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=regr, param_distributions=random_grid, random_state=42)
rf_random.fit(X_train3, y_train3)

RandomizedSearchCV(estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110],
                                        'n_estimators': [10, 20, 30, 40, 50, 60,
                                                         70, 80, 90, 100]},
                   random_state=42)

In [9]:
n_estimators = list(rf_random.best_params_.values())[0]
max_depth = list(rf_random.best_params_.values())[1]
hyper_regr = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
hyper_regr.fit(X_train3, y_train3)

RandomForestRegressor(max_depth=10, n_estimators=50)

In [10]:
preds_model = hyper_regr.predict(X_train3)
print("TRAIN METRICS for RF 3", "\n",
                  "MAE: ", median_absolute_error(y_train3, preds_model),
                  "MSE: ", mean_squared_error(y_train3, preds_model),
                  "RMSE: ",np.sqrt(mean_squared_error(y_train3, preds_model)),
                  "R2 score: ", r2_score(y_train3, preds_model))

TRAIN METRICS for RF 3 
 MAE:  23.868694533404955 MSE:  2706.095339045164 RMSE:  52.02014358924016 R2 score:  0.7230275784246996


In [11]:
preds_model_test = hyper_regr.predict(X_test3)
print("TRAIN METRICS for RF 3", "\n",
                  "MAE: ", median_absolute_error(y_test3, preds_model_test),
                  "MSE: ", mean_squared_error(y_test3, preds_model_test),
                  "RMSE: ",np.sqrt(mean_squared_error(y_test3, preds_model_test)),
                  "R2 score: ", r2_score(y_test3, preds_model_test))

TRAIN METRICS for RF 3 
 MAE:  25.697487520580445 MSE:  2377.601088391315 RMSE:  48.760651025097225 R2 score:  -0.3874909070783683


Addition of y_clip

In [14]:
clips = np.arange(110,160,10).tolist()
train_scores_rf = []
test_scores_rf = []

X_train = df_train_3.drop(columns=["ID","Cycle","RUL","operarional_setting_1","operarional_setting_2"], axis=1)
df_test_1_red = df_test_3.groupby("ID").last().reset_index()
X_test = df_test_3.drop(columns=["ID","Cycle","operarional_setting_1","operarional_setting_2"], axis=1)
y_test = df_RUL_3

for clip in clips:
    
    y_train_clip = df_train_3["RUL"].clip(upper=clip)


    regr = RandomForestRegressor()
    rf_random_clip = RandomizedSearchCV(estimator=regr, param_distributions=random_grid, random_state=42)
    rf_random_clip.fit(X_train, y_train_clip)
    n_estimators = list(rf_random_clip.best_params_.values())[0]
    max_depth = list(rf_random_clip.best_params_.values())[1]
    hyper_regr = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    hyper_regr.fit(X_train, y_train_clip)
    
    
    rul_train_pred_rf = hyper_regr.predict(X_train)
    r2_tr = r2_score(y_train_clip, rul_train_pred_rf)
    train_scores_rf.append(r2_tr)
       
    rul_test_pred_rf = hyper_regr.predict(X_test)
    r2_ts = r2_score(y_test, rul_test_pred_rf)
    test_scores_rf.append(r2_ts)
    
    print("TRAIN METRICS for {} upper clip".format(clip),"               ","TEST METRICS for {} upper clip".format(clip), "\n",
                  "MAE: ", median_absolute_error(y_train_clip, rul_train_pred_rf),"                       ","MAE: ", median_absolute_error(y_test, rul_test_pred_rf), "\n",
                  "MSE: ", mean_squared_error(y_train_clip, rul_train_pred_rf),"                       ","MSE: ", mean_squared_error(y_test, rul_test_pred_rf), "\n",
                  "RMSE: ",np.sqrt(mean_squared_error(y_train_clip, rul_train_pred_rf)),"                      ","RMSE: ",np.sqrt(mean_squared_error(y_test, rul_test_pred_rf)), "\n",
                  "R2 score: ", r2_score(y_train_clip, rul_train_pred_rf),"                 ","R2 score: ", r2_score(y_test, rul_test_pred_rf), "\n", "\n")

ValueError: Found input variables with inconsistent numbers of samples: [100, 16596]

In [None]:
plt.plot(range(0,len(clips)),train_scores_rf,label="train scores")
plt.plot(range(0,len(clips)), test_scores_rf, label="test scores")
plt.legend()
mx_test = max(test_scores_rf)
mx_idx = test_scores_rf.index(mx_test)
train_mx_test = train_scores_rf[mx_idx]
top_clip = clips[mx_idx]
print("Train Score from max Test Score: ", train_mx_test, "\n",
      "Max Test Score: ", mx_test, "\n",
     "Achieved with {} upper clipped RUL".format(top_clip))