In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score




In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 1. Data Loading and Exploration
# Read the CSV from the mounted Drive folder into a DataFrame.
# NOTE(review): this is an absolute Colab/Drive path; adjust it when running elsewhere.
file_path = '/content/drive/MyDrive/code/Health.csv'
dataset = pd.read_csv(file_path)



In [4]:

# Preview the first rows to inspect the available columns.
print("First five rows of the dataset:")
print(dataset.head())


First five rows of the dataset:
   Hydrogen  Oxygen  Nitrogen  Methane   CO   CO2  Ethylene  Ethane  \
0      2845    5860     27842     7406   32  1344     16684    5467   
1     12886      61     25041      877   83   864         4     305   
2      2820   16400     56300      144  257  1080       206      11   
3      1099      70     37520      545  184  1402         6     230   
4      3210    3570     47900      160  360  2130         4      43   

   Acethylene  DBDS  Power factor  Interfacial V  Dielectric rigidity  \
0           7  19.0          1.00             45                   55   
1           0  45.0          1.00             45                   55   
2        2190   1.0          1.00             39                   52   
3           0  87.0          4.58             33                   49   
4           4   1.0          0.77             44                   55   

   Water content  Health index  Life expectation  
0              0          95.2              19.0  


In [5]:

# Count null entries per column to decide whether imputation is needed.
print("\nMissing values in each column:")
print(dataset.isnull().sum())


Missing values in each column:
Hydrogen               0
Oxygen                 0
Nitrogen               0
Methane                0
CO                     0
CO2                    0
Ethylene               0
Ethane                 0
Acethylene             0
DBDS                   0
Power factor           0
Interfacial V          0
Dielectric rigidity    0
Water content          0
Health index           0
Life expectation       0
dtype: int64


In [6]:
# 2. Data Preprocessing
# Pull out the two regression targets first; every remaining column is a feature.
y = dataset[["Life expectation", "Health index"]]
X = dataset.drop(columns=list(y.columns))


In [7]:

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
!pip install catboost

from catboost import CatBoostRegressor

catboost_model = MultiOutputRegressor(CatBoostRegressor(silent=True, random_state=42))

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [11]:

# Train the CatBoost wrapper on the scaled training features against both target columns.
catboost_model.fit(X_train_scaled, y_train)


In [12]:

# Predict on the held-out features. Output columns follow the target order used
# at fit time: column 0 is "Life expectation", column 1 is "Health index".
y_pred_catboost = catboost_model.predict(X_test_scaled)
y_pred_life_expectation_cat, y_pred_health_index_cat = (
    y_pred_catboost[:, 0],
    y_pred_catboost[:, 1],
)



In [13]:

# Score the CatBoost predictions: for each target compute MSE first, then R²,
# against the held-out labels.
mse_life_cat, r2_life_cat = (
    metric(y_test["Life expectation"], y_pred_life_expectation_cat)
    for metric in (mean_squared_error, r2_score)
)
mse_health_cat, r2_health_cat = (
    metric(y_test["Health index"], y_pred_health_index_cat)
    for metric in (mean_squared_error, r2_score)
)

print(f"CatBoost - Life Expectation: MSE: {mse_life_cat}, R²: {r2_life_cat}")
print(f"CatBoost - Health Index: MSE: {mse_health_cat}, R²: {r2_health_cat}")


CatBoost - Life Expectation: MSE: 49.75709535552215, R²: 0.8249352209770027
CatBoost - Health Index: MSE: 99.91889104549108, R²: 0.7082571888789283


In [14]:

from xgboost import XGBRegressor

# Wrap XGBoost (squared-error objective, fixed seed) so one regressor is fitted per target column.
xgboost_model = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', random_state=42))



In [15]:

# Train the XGBoost wrapper on the scaled training features against both target columns.
xgboost_model.fit(X_train_scaled, y_train)



In [16]:

# Predict both targets for the held-out rows.
y_pred_xgboost = xgboost_model.predict(X_test_scaled)

In [17]:

# Column 0 holds the "Life expectation" predictions, column 1 the "Health index" ones.
y_pred_life_expectation_xgb, y_pred_health_index_xgb = (
    y_pred_xgboost[:, 0],
    y_pred_xgboost[:, 1],
)



In [18]:
# Evaluate the XGBoost model
# For each target compute MSE first, then R², against the held-out labels.
mse_life_xgb, r2_life_xgb = (
    metric(y_test["Life expectation"], y_pred_life_expectation_xgb)
    for metric in (mean_squared_error, r2_score)
)
mse_health_xgb, r2_health_xgb = (
    metric(y_test["Health index"], y_pred_health_index_xgb)
    for metric in (mean_squared_error, r2_score)
)

print(f"XGBoost - Life Expectation: MSE: {mse_life_xgb}, R²: {r2_life_xgb}")
print(f"XGBoost - Health Index: MSE: {mse_health_xgb}, R²: {r2_health_xgb}")


XGBoost - Life Expectation: MSE: 54.886273158926265, R²: 0.8068887821262882
XGBoost - Health Index: MSE: 114.7469582031941, R²: 0.6649622528481552


In [19]:

# Random-forest baseline: one seeded forest per target column, trained on the scaled features.
multi_task_model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
multi_task_model.fit(X_train_scaled, y_train)


In [20]:

# Predict on the held-out features. Column 0 is "Life expectation" and
# column 1 is "Health index", matching the target order used at fit time.
y_pred_multi = multi_task_model.predict(X_test_scaled)
y_pred_life_expectation, y_pred_health_index = (
    y_pred_multi[:, 0],
    y_pred_multi[:, 1],
)



In [21]:

# Score the random-forest predictions: for each target compute MSE first,
# then R², against the held-out labels.
mse_life, r2_life = (
    metric(y_test["Life expectation"], y_pred_life_expectation)
    for metric in (mean_squared_error, r2_score)
)
mse_health, r2_health = (
    metric(y_test["Health index"], y_pred_health_index)
    for metric in (mean_squared_error, r2_score)
)

print(f"Life Expectation - Mean Squared Error (MSE): {mse_life}, R²: {r2_life}")
print(f"Health Index - Mean Squared Error (MSE): {mse_health}, R²: {r2_health}")


Life Expectation - Mean Squared Error (MSE): 48.70011363829786, R²: 0.8286540930179664
Health Index - Mean Squared Error (MSE): 88.23068603191491, R²: 0.7423843669524683


In [22]:
# 4. Prediction Function
def predict_health_and_lifespan(model, scaler, input_data, is_rnn=False):
    """
    Predict the health index and life expectation of a single transformer.

    Parameters:
    - model: Trained multi-output regressor exposing ``predict``. Column 0 of
      its output is read as life expectation and column 1 as health index.
    - scaler: Fitted scaler exposing ``transform``. If it carries
      ``feature_names_in_`` (i.e. it was fitted on a DataFrame), the input is
      labelled with those names before scaling.
    - input_data: 1-D array-like of feature values
      [Hydrogen, Oxygen, Nitrogen, ..., Water content], in training column order.
    - is_rnn: When True, the scaled row is reshaped to
      (samples, 1, features) before calling ``model.predict``.

    Returns:
    - Tuple ``(predicted_health_index, predicted_life_expectation)``.

    Raises:
    - ValueError: if ``input_data`` is not one-dimensional, or its length does
      not match the number of features the scaler was fitted on.
    """
    row = np.asarray(input_data, dtype=float)
    if row.ndim != 1:
        raise ValueError(
            f"input_data must be a 1-D sequence of feature values, got {row.ndim} dimensions"
        )

    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        if len(feature_names) != row.size:
            raise ValueError(
                f"input_data has {row.size} values but the scaler was fitted on "
                f"{len(feature_names)} features"
            )
        # Label the row with the fitted column names so the scaler does not
        # see an unnamed array after having been fitted on named columns.
        features = pd.DataFrame([row], columns=feature_names)
    else:
        features = row.reshape(1, -1)

    # Scale the input data
    input_data_scaled = np.asarray(scaler.transform(features))

    if is_rnn:
        input_data_scaled = input_data_scaled.reshape(
            (input_data_scaled.shape[0], 1, input_data_scaled.shape[1])
        )

    # Make predictions
    predictions = model.predict(input_data_scaled)

    predicted_life_expectation = predictions[0][0]
    predicted_health_index = predictions[0][1]

    return predicted_health_index, predicted_life_expectation




In [23]:
# 5. Test Case - Example input data (one value per feature, in the order the feature columns appear in the dataset)
# Feature order: [Hydrogen, Oxygen, Nitrogen, Methane, CO, CO2, Ethylene, Ethane, Acethylene, DBDS, Power factor, Interfacial V, Dielectric rigidity, Water content]
test_input = [3000, 5000, 28000, 7000, 35, 1300, 16000, 5000, 10, 20, 1.0, 45, 55, 5]


In [24]:
# Run the random-forest model on the example input and report both predicted targets.
predicted_health_index, predicted_life_expectation = predict_health_and_lifespan(
    model=multi_task_model,
    scaler=scaler,
    input_data=test_input,
)

print(f"Predicted Health Index: {predicted_health_index}")
print(f"Predicted Life Expectation: {predicted_life_expectation}")



Predicted Health Index: 64.91700000000003
Predicted Life Expectation: 16.799000000000007




In [25]:
# 6. Multiple Test Cases
# Each inner list is one hypothetical transformer: 14 feature values in the
# same order as the feature columns (Hydrogen ... Water content).
test_cases = [
    [3000, 5000, 28000, 7000, 35, 1300, 16000, 5000, 10, 20, 1.0, 45, 55, 5],
    [5000, 7000, 29000, 7500, 50, 1400, 17000, 5500, 15, 30, 1.5, 40, 50, 10],
    [1000, 2000, 25000, 5000, 25, 1100, 14000, 4500, 5, 10, 0.9, 50, 60, 3]
]


In [26]:
# Run every test case through the random-forest model and print a numbered report for each.
for i, test_input in enumerate(test_cases, start=1):
    predicted_health_index, predicted_life_expectation = predict_health_and_lifespan(
        model=multi_task_model,
        scaler=scaler,
        input_data=test_input,
    )
    # One print call emits the four report lines followed by a blank separator line.
    print(
        f"Test Case {i}:",
        f"Input Data: {test_input}",
        f"Predicted Health Index: {predicted_health_index}",
        f"Predicted Life Expectation: {predicted_life_expectation}",
        "",
        sep="\n",
    )

Test Case 1:
Input Data: [3000, 5000, 28000, 7000, 35, 1300, 16000, 5000, 10, 20, 1.0, 45, 55, 5]
Predicted Health Index: 64.91700000000003
Predicted Life Expectation: 16.799000000000007

Test Case 2:
Input Data: [5000, 7000, 29000, 7500, 50, 1400, 17000, 5500, 15, 30, 1.5, 40, 50, 10]
Predicted Health Index: 66.69700000000003
Predicted Life Expectation: 18.073999999999987

Test Case 3:
Input Data: [1000, 2000, 25000, 5000, 25, 1100, 14000, 4500, 5, 10, 0.9, 50, 60, 3]
Predicted Health Index: 60.33900000000003
Predicted Life Expectation: 21.95000000000001





In [27]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense


In [28]:

# Insert a length-1 timestep axis so each row becomes a (1, n_features) sequence,
# giving arrays of shape (samples, 1, features) for the recurrent layer.
X_train_rnn = np.expand_dims(X_train_scaled, axis=1)
X_test_rnn = np.expand_dims(X_test_scaled, axis=1)


In [29]:
# RNN Model
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow generators before any weights are
# created, in line with the random_state=42 used for the other models.
# (Some GPU kernels may still introduce small run-to-run differences.)
set_random_seed(42)

rnn_model = Sequential()
# Declare the input shape with an explicit Input layer rather than passing
# input_shape= to the first layer, which Keras warns against for Sequential models.
rnn_model.add(Input(shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
rnn_model.add(SimpleRNN(50, activation='relu'))
rnn_model.add(Dense(2))  # Two outputs (Life Expectation, Health Index)


  super().__init__(**kwargs)


In [30]:

# Train with the Adam optimizer, minimising mean squared error over both outputs.
rnn_model.compile(optimizer='adam', loss='mse')


In [31]:

# Fit for 50 epochs in mini-batches of 16; verbose=0 silences the per-epoch log.
rnn_model.fit(X_train_rnn, y_train, epochs=50, batch_size=16, verbose=0)

# Predict both targets for the held-out sequences.
y_pred_rnn = rnn_model.predict(X_test_rnn)



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step


In [32]:

# Column 0 holds the "Life expectation" predictions, column 1 the "Health index" ones.
y_pred_life_expectation_rnn, y_pred_health_index_rnn = (
    y_pred_rnn[:, 0],
    y_pred_rnn[:, 1],
)


In [None]:
# Evaluate the RNN model
# For each target compute MSE first, then R², against the held-out labels.
mse_life_rnn, r2_life_rnn = (
    metric(y_test["Life expectation"], y_pred_life_expectation_rnn)
    for metric in (mean_squared_error, r2_score)
)
mse_health_rnn, r2_health_rnn = (
    metric(y_test["Health index"], y_pred_health_index_rnn)
    for metric in (mean_squared_error, r2_score)
)

print(f"RNN - Life Expectation: MSE: {mse_life_rnn}, R²: {r2_life_rnn}")
print(f"RNN - Health Index: MSE: {mse_health_rnn}, R²: {r2_health_rnn}")


RNN - Life Expectation: MSE: 137.57835356623738, R²: 0.5159459391001585
RNN - Health Index: MSE: 183.97684391927652, R²: 0.462825086782081


In [33]:
def predict_health_and_lifespan(model, scaler, input_data, is_rnn=False):
    """
    Predicts health index and life span of a transformer based on input data.

    Parameters:
    - model: Trained multitask regression model exposing ``predict``. Column 0
      of its output is read as life expectation and column 1 as health index.
    - scaler: Fitted scaler used for data normalization. If it carries
      ``feature_names_in_`` (i.e. it was fitted on a DataFrame), the input is
      labelled with those names before scaling.
    - input_data: 1-D array-like of feature values
      [Hydrogen, Oxygen, Nitrogen, ..., Water content], in training column order.
    - is_rnn: Boolean flag indicating if the model is an RNN; when True the
      scaled row is reshaped to (samples, 1, features) before prediction.

    Returns:
    - Tuple ``(predicted_health_index, predicted_life_expectation)``.

    Raises:
    - ValueError: if ``input_data`` is not one-dimensional, or its length does
      not match the number of features the scaler was fitted on.
    """
    row = np.asarray(input_data, dtype=float)
    if row.ndim != 1:
        raise ValueError(
            f"input_data must be a 1-D sequence of feature values, got {row.ndim} dimensions"
        )

    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        if len(feature_names) != row.size:
            raise ValueError(
                f"input_data has {row.size} values but the scaler was fitted on "
                f"{len(feature_names)} features"
            )
        # Label the row with the fitted column names so the scaler does not
        # see an unnamed array after having been fitted on named columns.
        features = pd.DataFrame([row], columns=feature_names)
    else:
        features = row.reshape(1, -1)

    input_data_scaled = np.asarray(scaler.transform(features))

    if is_rnn:
        input_data_scaled = input_data_scaled.reshape(
            (input_data_scaled.shape[0], 1, input_data_scaled.shape[1])
        )

    predictions = model.predict(input_data_scaled)

    predicted_life_expectation = predictions[0][0]
    predicted_health_index = predictions[0][1]

    return predicted_health_index, predicted_life_expectation


In [34]:

# Example input reused for the per-model comparisons below: 14 feature values in dataset column order.
test_input = [3000, 5000, 28000, 7000, 35, 1300, 16000, 5000, 10, 20, 1.0, 45, 55, 5]


In [35]:

# Score the example input with the CatBoost model.
predicted_health_index_cat, predicted_life_expectation_cat = predict_health_and_lifespan(
    model=catboost_model,
    scaler=scaler,
    input_data=test_input,
)
print(f"CatBoost - Predicted Health Index: {predicted_health_index_cat}, Predicted Life Expectation: {predicted_life_expectation_cat}")


CatBoost - Predicted Health Index: 51.56263227142033, Predicted Life Expectation: 15.466972666217863




In [36]:

# Score the example input with the XGBoost model.
predicted_health_index_xgb, predicted_life_expectation_xgb = predict_health_and_lifespan(
    model=xgboost_model,
    scaler=scaler,
    input_data=test_input,
)
print(f"XGBoost - Predicted Health Index: {predicted_health_index_xgb}, Predicted Life Expectation: {predicted_life_expectation_xgb}")



XGBoost - Predicted Health Index: 53.34952926635742, Predicted Life Expectation: 15.795161247253418




In [37]:

# Score the example input with the RNN; is_rnn=True adds the timestep axis the network was trained with.
predicted_health_index_rnn, predicted_life_expectation_rnn = predict_health_and_lifespan(
    model=rnn_model,
    scaler=scaler,
    input_data=test_input,
    is_rnn=True,
)
print(f"RNN - Predicted Health Index: {predicted_health_index_rnn}, Predicted Life Expectation: {predicted_life_expectation_rnn}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
RNN - Predicted Health Index: 96.3178939819336, Predicted Life Expectation: 28.551868438720703


In [38]:
# Score of the XGBoost model on the training data itself (not the held-out split).
xgboost_model.score(X_train_scaled, y_train)

0.9999983310699463

In [39]:
# Score of the CatBoost model on the training data itself (not the held-out split).
catboost_model.score(X_train_scaled, y_train)

0.9942144999255853

In [40]:
# Serialize the trained XGBoost multi-output model to a pickle file.
# Note: the object written is the model (xgboost_model), not the dataset.
import pickle
pickle_file_path = 'data_output.pkl'
with open(pickle_file_path, 'wb') as f:
    pickle.dump(xgboost_model, f)
print(f'Data saved as {pickle_file_path}')


Data saved as data_output.pkl


In [41]:
import pickle
from sklearn.preprocessing import StandardScaler  # or whatever scaler you used
from sklearn.utils.validation import check_is_fitted

# Persist the scaler that was already fitted on the training features so that
# inference code can apply exactly the same scaling.
# Do NOT create a fresh StandardScaler() here: a new instance is unfitted, it
# would replace the fitted `scaler` in the kernel, and the resulting pickle
# could not transform anything.

# Fail loudly rather than silently saving an unfitted scaler.
check_is_fitted(scaler)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [42]:
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# Save CatBoost model
with open('catboost_model.pkl', 'wb') as f:
    pickle.dump(catboost_model, f)

# Save XGBoost model
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgboost_model, f)

# Save RNN model
rnn_model.save('rnn_model.h5')

# Save scaler
# An unfitted scaler cannot transform inputs at inference time, so never
# persist one: if `scaler` has no fitted state, refit it on X_train (the data
# the models' inputs were scaled with) before writing it out.
try:
    check_is_fitted(scaler)
except NotFittedError:
    scaler.fit(X_train)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)




In [43]:
# Build a scaler whose statistics come from the training features only
scaler = StandardScaler().fit(X_train)

# Re-derive the scaled training and test matrices from that scaler
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)
