In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib


In [2]:
# Read the health-score workbook into a DataFrame (openpyxl parses the .xlsm file)
# and preview its first rows as the cell output.
dataset_path = 'Health_Score_dataset.xlsm'
df = pd.read_excel(dataset_path, engine='openpyxl')
df.head()

Unnamed: 0,Flight ID,Airline Name,Aircraft Model,Engine Temperature (°C),Engine Vibration Levels (mm/s),Oil Pressure (Mpa),Hydraulic System Pressure (psi),Electrical System Voltage (V),Oil Temperature (°C),Health Score
0,FL0001,SpiceJet,A320,1022,5,3.460175,4912,26.67871,37.139954,95.714286
1,FL0002,GoAir,A321Neo,959,8,3.683843,5096,27.677461,53.67472,92.142857
2,FL0003,IndiGo,A321Neo,810,3,3.542163,4957,26.816504,33.303377,95.714286
3,FL0004,Air India,A321Neo,955,7,3.062497,4974,24.830444,33.156857,88.571429
4,FL0005,GoAir,A321Neo,938,7,3.526306,5981,26.982433,67.429764,92.857143


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,Engine Temperature (°C),Engine Vibration Levels (mm/s),Oil Pressure (Mpa),Hydraulic System Pressure (psi),Electrical System Voltage (V),Oil Temperature (°C),Health Score
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,926.773,4.291333,3.622693,4908.497,26.15865,68.768484,94.083333
std,109.184559,1.630136,0.582143,601.180433,1.406013,20.650583,4.538114
min,750.0,2.0,0.708185,3000.0,23.00037,30.050691,73.571429
25%,836.0,3.0,3.284412,4424.0,25.26538,54.04665,90.714286
50%,920.0,4.0,3.591612,4887.5,26.228801,69.939954,95.714286
75%,1008.0,5.0,3.895079,5378.25,27.131549,85.050241,99.285714
max,1199.0,9.0,4.996968,5999.0,31.993492,129.40832,99.285714


In [4]:
# Label encode the Aircraft Model column.
# The encoder is fitted on the raw string values and the integer codes are written into
# the derived feature frame instead of back into `df`. That keeps this cell idempotent:
# re-running it never re-fits the encoder on already-encoded integers, which would make
# the saved encoder unable to transform model names such as 'A320' at inference time.
label_encoder = LabelEncoder()
aircraft_model_codes = label_encoder.fit_transform(df['Aircraft Model'])

# Feature matrix: drop the target and the identifier columns (Flight ID, Airline Name),
# then replace 'Aircraft Model' with its integer codes (assign keeps the column position).
X = (
    df.drop(columns=['Health Score', 'Flight ID', 'Airline Name'])
      .assign(**{'Aircraft Model': aircraft_model_codes})
)

# Target variable
y = df['Health Score']

# Split the data into training and testing sets (80/20); the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [5]:
# Train Random Forest model (fixed random_state so the fitted forest is reproducible)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model on both splits
rf_train_predictions = rf_model.predict(X_train)
rf_test_predictions = rf_model.predict(X_test)

# RMSE is computed as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so taking
# the root explicitly works on both old and new releases.
rf_train_rmse = mean_squared_error(y_train, rf_train_predictions) ** 0.5
rf_test_rmse = mean_squared_error(y_test, rf_test_predictions) ** 0.5

print("Random Forest Model - Train RMSE:", rf_train_rmse)
print("Random Forest Model - Test RMSE:", rf_test_rmse)



Random Forest Model - Train RMSE: 0.3666775180057871
Random Forest Model - Test RMSE: 1.0836578660944136


In [6]:
from sklearn.metrics import accuracy_score

# Calculate the test-set R^2.
# For a regressor, .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so it is reported under its real name.
rf_test_r2 = rf_model.score(X_test, y_test)
rf_test_accuracy = rf_test_r2  # legacy alias, kept for any cell that still uses the old name
print("Random Forest Model - Test R^2:", rf_test_r2)


Random Forest Model - Test Accuracy: 0.9486317439044554


In [8]:
# Persist the fitted regressor and the aircraft-model label encoder with joblib.
# The encoder dump stays last so its return value remains the cell's displayed output.
joblib.dump(value=rf_model, filename='rf_model_health.pkl')
joblib.dump(value=label_encoder, filename='label_encoder_health.pkl')


['label_encoder_health.pkl']

In [16]:
import numpy as np
import joblib

# Load both persisted artifacts so the prediction uses exactly what was saved to disk
# rather than whatever model object happens to be left in kernel memory.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced or trust.
rf_model = joblib.load('rf_model_health.pkl')
label_encoder = joblib.load('label_encoder_health.pkl')

# Numeric sensor readings for the new observations, one row per aircraft, in the same
# order as the training columns that follow 'Aircraft Model'.
X_new = np.array([
    [774, 5, 3.02, 5778, 24.07, 75.23],  # A321Neo data point
    [905, 5, 4.83, 4453, 25.10, 73.83]   # A320 data point
])

# Convert aircraft model names to numerical labels (row i pairs with X_new[i])
aircraft_models = ['A321Neo', 'A320']
encoded_aircraft_models = label_encoder.transform(aircraft_models)

# Concatenate the encoded aircraft model labels with the numerical features;
# 'Aircraft Model' is the first feature column the model was trained on.
X_new_encoded = np.hstack((encoded_aircraft_models.reshape(-1, 1), X_new))

# The model was fitted on a DataFrame, so label the columns with the names it recorded
# at fit time. This avoids scikit-learn's "X does not have valid feature names" warning
# and lets it verify that the columns line up with the training data.
X_new_frame = pd.DataFrame(X_new_encoded, columns=rf_model.feature_names_in_)

# Predict using Random Forest model
rf_predictions = rf_model.predict(X_new_frame)

print("Predictions using Random Forest model:", rf_predictions)


Predictions using Random Forest model: [92.85714286 93.29285714]


