In [None]:
import pandas as pd
import numpy as np
from kan import KAN, create_dataset
import torch
import matplotlib.pyplot as plt
import optuna

# File paths
galaxy_properties_path = "/Users/itamargoshen/Desktop/galactic_properties_data.txt"
omega_matter_path = "/Users/itamargoshen/Desktop/omega_matter_data.txt"
offsets_lengths_path = "/Users/itamargoshen/Desktop/offsets_lengths.txt"

# All three inputs are whitespace-delimited text files. The separator is written
# as a raw string (r'\s+') so the backslash reaches the regex engine untouched;
# the non-raw '\s' is an invalid string escape that newer Python versions warn about.

# Load galactic properties data (one row per galaxy, lines starting with '#' are skipped)
galactic_properties_cols = [
    "gas_mass", "stellar_mass", "black_hole_mass", "total_mass", "Vmax",
    "velocity_dispersion", "gas_metallicity", "stars_metallicity",
    "star_formation_rate", "spin", "peculiar_velocity", "stellar_radius",
    "total_radius", "Vmax_radius", "U", "K", "g"
]
galactic_properties_data = pd.read_csv(galaxy_properties_path, sep=r'\s+', comment='#', names=galactic_properties_cols)

# Load omega matter data (no header row; only the first column is used further down)
omega_matter_columns = ["Omega_Matter_of_Simulation", "Doesn't_matter_rn1", "Doesn't_matter_rn2", "Doesn't_matter_rn3", "Doesn't_matter_rn4", "Doesn't_matter_rn5"]
omega_matter_data = pd.read_csv(omega_matter_path, sep=r'\s+', header=None, names=omega_matter_columns)

# Load offsets and lengths data: each row gives the first galaxy row and the number
# of galaxy rows of one contiguous block in galactic_properties_data
offsets_lengths_data = pd.read_csv(offsets_lengths_path, sep=r'\s+', comment='#', header=None)
offsets_lengths_data.columns = ["offset", "length"]

# Build one table with a row per galaxy:
#   column 0   -> row index of the galaxy in galactic_properties_data
#   column 1   -> row index of its (offset, length) pair in offsets_lengths_data
#   column 2   -> Omega_Matter_of_Simulation value looked up with that same index
#   columns 3+ -> the galaxy properties, in the order of galactic_properties_cols

# Per-block arrays, stacked into one array after the loop
combined_data = []

# Total rows in galactic_properties_data, used to validate every (offset, length) pair
total_rows = len(galactic_properties_data)

# Iterate through each offset and length pair
for offsets_index, row in offsets_lengths_data.iterrows():
    # iterrows() hands back each row as a Series with one common dtype, so the
    # values may arrive as floats; slicing and array shapes need plain ints.
    offset = int(row['offset'])
    length = int(row['length'])

    # iloc silently truncates an out-of-range slice, which would only show up
    # later as an opaque shape mismatch in np.hstack. Fail early and clearly.
    if offset < 0 or length < 0 or offset + length > total_rows:
        raise ValueError(
            f"Row {offsets_index} of offsets_lengths_data selects rows "
            f"[{offset}, {offset + length}) but galactic_properties_data "
            f"has {total_rows} rows"
        )

    # Extract the relevant rows from galactic_properties_data
    data_slice = galactic_properties_data.iloc[offset:offset + length].to_numpy()

    # Get the Omega_Matter_of_Simulation value for the current offsets_index
    omega_matter_value = omega_matter_data["Omega_Matter_of_Simulation"].iloc[offsets_index]

    # Create the first column: row indices from galactic_properties_data
    indices = np.arange(offset, offset + length).reshape(-1, 1)

    # Create the second column: the row index of offsets_lengths_data
    offsets_indices = np.full((length, 1), offsets_index)

    # Create the third column: Omega_Matter_of_Simulation value
    omega_matter_column = np.full((length, 1), omega_matter_value)

    # Combine the index columns and the data slice into one array
    data_with_indices = np.hstack((indices, offsets_indices, omega_matter_column, data_slice))

    # Append the data with indices to the combined_data list
    combined_data.append(data_with_indices)

# Convert the list of arrays into one big numpy array
combined_data_array = np.vstack(combined_data)

print(combined_data_array)  # Format: galaxy number | simulation number | omega matter | galaxy properties (see galactic_properties_cols)

In [None]:
from kan import KAN
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Target: omega matter (3rd column). Inputs: every column from the 4th onward.
labels = combined_data_array[:, 2]
features = combined_data_array[:, 3:]

# NOTE(review): the split below is per row, so rows that share a value in
# column 1 (and therefore the same label) can end up in different splits.
# Confirm that this is intended.

# First cut: 70 % for training, 30 % held back
first_cut = train_test_split(features, labels, test_size=0.3, random_state=42)
train_features, temp_features, train_labels, temp_labels = first_cut

# Second cut: the held-back 30 % is halved into validation and test
second_cut = train_test_split(temp_features, temp_labels, test_size=0.5, random_state=42)
validate_features, test_features, validate_labels, test_labels = second_cut

# Helper used to turn each split into a PyTorch tensor exactly once
def to_tensor(x):
    """Build a new float32 ``torch.Tensor`` from ``x`` and return it."""
    as_float32 = torch.tensor(x, dtype=torch.float32)
    return as_float32

# Convert every split to float32 tensors. The label tensors stay 1-D here
# because the evaluation code further down subtracts 1-D predictions from them.
train_features = to_tensor(train_features)
train_labels = to_tensor(train_labels)
validate_features = to_tensor(validate_features)
validate_labels = to_tensor(validate_labels)
test_features = to_tensor(test_features)
test_labels = to_tensor(test_labels)

# Dataset dictionary for KAN.
# Labels are handed over as (N, 1) column vectors so they line up element-wise
# with the single-output model's (N, 1) predictions. A 1-D (N,) label would
# broadcast against an (N, 1) prediction into an (N, N) matrix inside any
# element-wise loss, which gives a wrong loss value and quadratic memory use.
dataset = {
    'train_input': train_features,
    'train_label': train_labels.reshape(-1, 1),
    'validate_input': validate_features,
    'validate_label': validate_labels.reshape(-1, 1),
    'test_input': test_features,
    'test_label': test_labels.reshape(-1, 1)
}

# Layer widths: one input per feature column, then 50, then a single output
kan_width = [features.shape[1], 50, 1]

# grid and k are passed straight through to pykan (see its docs for their meaning)
kan_model = KAN(width=kan_width, grid=15, k=15, seed=42)
print('works')

# Fit on the dataset dictionary with the LBFGS optimiser for 100 steps.
# NOTE(review): newer pykan releases appear to expose this method as `fit`;
# confirm against the installed version.
kan_model.train(dataset, opt='LBFGS', steps=100, log=100)

print('works')

# Predict on the validation split without tracking gradients
with torch.no_grad():
    validation_output = kan_model.forward(validate_features)
    y_predicted = validation_output.squeeze().cpu().numpy()

# NumPy view of the true values, reused by every plot and metric below
y_true = validate_labels.numpy()

# Residual = actual - predicted
residuals = y_true - y_predicted

# Figure 1: predicted vs. actual values
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_true, y_predicted, label='Predicted vs Actual')
ax.set_xlabel('Actual Omega Matter')
ax.set_ylabel('Predicted Omega Matter')
ax.set_title('Prediction using KAN')
ax.grid(True)
ax.legend()
plt.show()

# Figure 2: residuals against the actual value, with a zero reference line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_true, residuals, label='Residuals')
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel('Actual Omega Matter')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
ax.grid(True)
ax.legend()
plt.show()

# Figure 3: histogram (with KDE overlay) of the residuals
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.set_title('Error Distribution')
ax.grid(True)
plt.show()

# Summary metrics on the validation split
mae = mean_absolute_error(y_true, y_predicted)
mse = mean_squared_error(y_true, y_predicted)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")

works


description:   0%|                                                          | 0/100 [00:00<?, ?it/s]