In [2]:
import os
import pandas as pd

BASE_DIR = os.getcwd()  

raw_path = os.path.join(BASE_DIR, "data", "raw", "train_FD001.txt")
processed_path = os.path.join(BASE_DIR, "data", "processed", "fd001_processed.csv")
db_path = os.path.join(BASE_DIR, "database", "nasa_engines.db")
metrics_path = os.path.join(BASE_DIR, "artifacts", "metrics", "rf_performance.csv")

df = pd.read_csv(raw_path, sep=r"\s+", header=None)

cols = ["unit", "cycle"] + [f"op_setting_{i}" for i in range(1,4)] + [f"sensor_{i}" for i in range(1,22)]
df.columns = cols

df.head()



FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\kewtiiiii\\Desktop\\Projects\\engine-cycle-forecast\\scripts\\data\\raw\\train_FD001.txt'

In [None]:
print("Number of engines:", df["unit"].nunique())
print("\nCycles per engine (first 10):")
print(df.groupby("unit")["cycle"].max().head(10))

df.describe()


In [None]:
print("Missing values per column:")
print(df.isnull().sum())

constant_cols = [c for c in df.columns if df[c].nunique() == 1]
print("\nConstant columns:", constant_cols)


In [None]:
max_cycles = df.groupby("unit")["cycle"].max().reset_index()
max_cycles.columns = ["unit", "max_cycle"]

df = df.merge(max_cycles, on="unit")
df["rul"] = df["max_cycle"] - df["cycle"]
df.drop(columns=["max_cycle"], inplace=True)

df.to_csv(processed_path, index=False)

conn = sqlite3.connect(db_path)
df.to_sql("fd001_processed", conn, if_exists="replace", index=False)
conn.close()

print("Preprocessing complete.")


In [None]:
plt.figure(figsize=(8,5))
df["rul"].hist(bins=50)
plt.xlabel("Remaining Useful Life (cycles)")
plt.ylabel("Count")
plt.title("RUL Distribution")
plt.show()


In [None]:
engine1 = df[df["unit"] == 1]

plt.figure(figsize=(12,6))
for i in range(1,6):
    plt.plot(engine1["cycle"], engine1[f"sensor_{i}"], label=f"Sensor {i}")

plt.xlabel("Cycle")
plt.ylabel("Sensor Reading")
plt.title("Sensor Degradation Patterns â€“ Engine 1")
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(15,12))
sns.heatmap(df.corr(), cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
import sqlite3
conn = sqlite3.connect("nasa_engines.db")
df.to_sql("fd001_processed", conn, if_exists="replace", index=False)

query = "SELECT unit, AVG(sensor_1) AS avg_sensor1 FROM fd001_processed GROUP BY unit"
result = pd.read_sql(query, conn)
print(result.head())


In [None]:
features = ["cycle", "op_setting_1", "op_setting_2", "op_setting_3"] + \
           [f"sensor_{i}" for i in range(1,22) if f"sensor_{i}" not in constant_cols]

X = df[features]
y = df["rul"]


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}, R2: {r2:.2f}")

#MSE: 1282.02, R2: 0.72


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

rand_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,              # ONLY 12 models instead of 324
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=2
)

rand_search.fit(X_train, y_train)

best_rf = rand_search.best_estimator_
y_pred = best_rf.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Optimized RF MSE: {mse:.2f}")
print(f"Optimized RF R2: {r2:.2f}")

#Optimized RF MSE: 1257.60
#Optimized RF R2: 0.73


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

feat_imp = pd.Series(best_rf.feature_importances_, index=features).sort_values(ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title("Random Forest Feature Importances")
plt.show()


In [None]:
import joblib
joblib.dump(best_rf, "rf_rul_model.pkl")


In [None]:
residuals = y_test - y_pred

plt.figure(figsize=(10,6))
plt.scatter(y_test, residuals, alpha=0.5)
plt.axhline(0, linestyle='--')
plt.xlabel("Actual RUL")
plt.ylabel("Residual (Actual - Predicted)")
plt.title("Residual Analysis")
plt.show()
