In [None]:
# Konstantinos Varakliotis
# Electrical and Computer Engineering, University of Thessaly
# kvarakliotis@uth.gr

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import shap
import matplotlib.pyplot as plt

# Read the cleaned exoplanet table from disk
file_path = "/content/sample_data/exoplanet_database_new.csv"
df = pd.read_csv(file_path)

# Discard the outlier: only rows strictly below the maximum temperature are
# kept, so every row tied with the maximum (and any row whose temperature is
# missing, since the comparison is then False) is removed as well
hottest_temperature = df['planet_temperature'].max()
df = df[df['planet_temperature'] < hottest_temperature]

# Feature matrix: every remaining column except the target, the planet name
# and the three measurements left out of this "basic" model
excluded_columns = ['planet_temperature', 'planet_name', 'insolation_flux', 'eccentricity', 'Inclination']
X = df.drop(columns=excluded_columns).values

# Regression target: the planet temperature
y = df['planet_temperature'].values

# Split the dataset into training (80%) and testing (20%) BEFORE scaling.
# Fitting the scaler on all rows would let test-set means/variances leak into
# the preprocessing and make the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features: learn mean/std from the training rows only, then apply
# that same transformation to the held-out rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled version of the full feature matrix, kept so that any code still
# referring to X_scaled continues to work
X_scaled = scaler.transform(X)

# Define the model with the best hyperparameters.
# The input width is declared with an explicit Input object; passing
# `input_shape=` to the first Dense layer of a Sequential model makes Keras
# emit a UserWarning recommending exactly this form.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
    Dense(192, activation='relu'),  # units=192
    Dropout(0.2),  # dropout_rate=0.2
    Dense(128, activation='relu'),  # units_2=128
    Dropout(0.2),  # dropout_rate_2=0.2
    Dense(1)  # Output layer: a single linear unit for the regression target
])

# Compile the model with the best learning rate; MSE is the training loss and
# MAE is tracked as an additional metric
model.compile(optimizer=Adam(learning_rate=0.0009), loss='mean_squared_error', metrics=['mae'])

# Display the model summary
model.summary()

# Train for a fixed 2000 epochs in mini-batches of 32, recording per-epoch
# loss/MAE on the held-out split.
# NOTE(review): the test split doubles as validation data here; it is only
# monitored (no callbacks act on it), but a separate validation split would
# keep the test set untouched until the final evaluation.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=2000,
)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Score the held-out predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Loss curves: one line per recorded series, training first, validation second
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title("Model Training Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Scatter of predicted against actual temperatures on the held-out rows
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7)

# Dashed red diagonal marks where prediction equals the actual value
lowest_actual = min(y_test)
highest_actual = max(y_test)
diagonal = [lowest_actual, highest_actual]
plt.plot(diagonal, diagonal, color='red', linestyle='dashed')

plt.title("Actual vs. Predicted Planet Temperatures")
plt.xlabel("Actual Temperature")
plt.ylabel("Predicted Temperature")
plt.show()

import joblib

# Recompute the names of the model's input columns (same exclusions as the
# feature matrix) and persist them so later runs can select and order the
# same columns
non_feature_columns = ['planet_temperature', 'planet_name', 'insolation_flux', 'eccentricity', 'Inclination']
feature_frame = df.drop(columns=non_feature_columns)
feature_columns = feature_frame.columns
joblib.dump(feature_columns, "feature_columns_basic.pkl")

# Persist the trained network in HDF5 format
model.save("exoplanet_model_basic.h5")

# Persist the fitted scaler so new data can be normalized the same way
joblib.dump(scaler, "scalerbasic.pkl")

# Load the new dataset (semicolon-delimited CSV)
new_data = pd.read_csv("/content/sample_data/exoplanet_database_complete.csv", delimiter=';')

# Drop rows with missing values in any column the model needs as input
new_data = new_data.dropna(subset=feature_columns)

# Extract only the required features, in the same column order as in training
X_new = new_data[feature_columns]

# Transform using the pre-trained scaler. The scaler was fitted on a plain
# NumPy array, so a plain array is passed here too; handing it a DataFrame
# triggers scikit-learn's "X has feature names" mismatch warning. Column
# order is already fixed by the selection above.
X_new_scaled = scaler.transform(X_new.to_numpy())

# Predict temperatures; flatten the (n, 1) network output to shape (n,)
y_pred = model.predict(X_new_scaled).flatten()

# Add predictions next to the original columns (including any actual
# temperature column) for comparison
new_data["Predicted Temperature"] = y_pred

# Save results
new_data.to_csv("predicted_exoplanet_temperatures.csv", index=False)

# --- Optional: score the predictions wherever the new data also carries an actual temperature ---
if 'planet_temperature' not in new_data.columns:
    print("Warning: 'planet_temperature' column not found in the dataset.")
else:
    # Boolean mask of the rows whose actual temperature is present
    valid_indices = new_data['planet_temperature'].notna()

    if not valid_indices.any():
        print("Warning: No valid actual planet temperature values for comparison.")
    else:
        from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

        # Actual and predicted temperatures restricted to the labelled rows
        labelled_rows = new_data.loc[valid_indices]
        y_actual = labelled_rows['planet_temperature'].values
        y_pred_filtered = labelled_rows['Predicted Temperature'].values

        # Error metrics on the labelled rows
        mae = mean_absolute_error(y_actual, y_pred_filtered)
        rmse = np.sqrt(mean_squared_error(y_actual, y_pred_filtered))
        r2 = r2_score(y_actual, y_pred_filtered)

        print(f"Mean Absolute Error (MAE): {mae:.2f}")
        print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
        print(f"R² Score: {r2:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/2000
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 878689.8750 - mae: 848.9766 - val_loss: 841024.0000 - val_mae: 833.5737
Epoch 2/2000
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 909450.0625 - mae: 863.0944 - val_loss: 778555.0000 - val_mae: 797.7267
Epoch 3/2000
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 745114.4375 - mae: 770.9628 - val_loss: 604332.2500 - val_mae: 692.8940
Epoch 4/2000
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 550766.8125 - mae: 655.0004 - val_loss: 335191.5000 - val_mae: 499.7420
Epoch 5/2000
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 289543.9062 - mae: 458.3610 - val_loss: 140715.9062 - val_mae: 295.4670
Epoch 6/2000
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 117786.9531 - mae: 268.0225 - val_loss: 94860.8594 - val_mae: 206.1139
Epoch

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import joblib

# Load the trained model. compile=False skips restoring the compile state
# stored in the HDF5 file, so the model is compiled exactly once, by the
# explicit call below, instead of being compiled on load and then recompiled.
model = load_model("exoplanet_model_basic.h5", compile=False)

# Compile so that evaluate()/metrics remain available. 'adam' here is a
# freshly constructed optimizer with default settings; it does not carry over
# any optimizer state from training. predict() itself does not depend on it.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# were produced by a trusted run of the training cell.

# Load the trained scaler
scaler = joblib.load("scalerbasic.pkl")

# Load the feature columns used during training
feature_columns = joblib.load("feature_columns_basic.pkl")

# Load the new dataset (the one you want to make predictions on)
new_data = pd.read_csv("/content/solar_system.csv", delimiter=';')

# Show the available columns so a mismatch with the training features is
# easy to spot
print("Columns in the new dataset:")
print(new_data.columns)

# Drop rows with missing values in the required columns (those used in training)
new_data = new_data.dropna(subset=feature_columns)

# Extract the relevant features, in the same column order as in training
X_new = new_data[feature_columns]

# Normalize using the pre-trained scaler. A plain NumPy array is passed so
# that a scaler fitted without feature names does not emit scikit-learn's
# feature-name mismatch warning; column order is fixed by the selection above.
X_new_scaled = scaler.transform(X_new.to_numpy())

# Predict the planet temperatures; flatten the (n, 1) output to shape (n,)
y_pred = model.predict(X_new_scaled).flatten()

# Add the predictions to the original dataframe for comparison
new_data["Predicted Temperature"] = y_pred

# Save the results to a CSV file.
# NOTE(review): this is the same file name the training cell writes its
# predictions to, so running both overwrites the earlier results. Confirm
# whether a separate file name is wanted.
new_data.to_csv("predicted_exoplanet_temperatures.csv", index=False)

# Display the first few rows. The actual temperature is shown only when the
# dataset has that column; indexing it unconditionally would raise KeyError
# before the "column not found" handling further down could run.
preview_columns = [
    column
    for column in ('planet_temperature', 'Predicted Temperature')
    if column in new_data.columns
]
print(new_data[preview_columns].head())

# --- Optional: score the predictions wherever the new data also carries an actual temperature ---
if 'planet_temperature' not in new_data.columns:
    print("Warning: 'planet_temperature' column not found in the dataset.")
else:
    # Boolean mask of the rows whose actual temperature is present
    valid_indices = new_data['planet_temperature'].notna()

    if not valid_indices.any():
        print("Warning: No valid actual planet temperature values for comparison.")
    else:
        from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

        # Actual and predicted temperatures restricted to the labelled rows
        rows_with_actuals = new_data.loc[valid_indices]
        y_actual = rows_with_actuals['planet_temperature'].values
        y_pred_filtered = rows_with_actuals['Predicted Temperature'].values

        # Error metrics on the labelled rows
        mae = mean_absolute_error(y_actual, y_pred_filtered)
        rmse = np.sqrt(mean_squared_error(y_actual, y_pred_filtered))
        r2 = r2_score(y_actual, y_pred_filtered)

        print(f"Mean Absolute Error (MAE): {mae:.2f}")
        print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
        print(f"R² Score: {r2:.2f}")




Columns in the new dataset:
Index(['planet_name', 'number_of_stars', 'orbital_period',
       'orbit_semi_major_axis', 'planet_radius', 'Inclination', 'eccentricity',
       'insolation_flux', 'planet_temperature', 'star_temperature',
       'star_radius', 'star_mass'],
      dtype='object')
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
   planet_temperature  Predicted Temperature
0               440.0             415.941254
1                 NaN             297.500916
2               288.0             253.098907
3               208.0             198.735107
4               163.0             367.831421
Mean Absolute Error (MAE): 6032.57
Root Mean Squared Error (RMSE): 13317.48
R² Score: -12112.53


