# In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Root folder of the supernova dataset. load_supernova_data() looks inside it
# for one sub-folder per supernova type, each holding a 'CSV Files' directory.
# NOTE(review): this is a machine-specific absolute Windows path — adjust it
# before running on another machine.
base_path = r'C:\Users\pazol\Programms\Supernovae'

def load_supernova_data(base_path, supernova_types=None):
    """Load per-type supernova CSV files and combine them into one DataFrame.

    For every supernova type, the folder ``<base_path>/<type>/CSV Files`` is
    scanned for CSV files. Each file is read with ``pd.read_csv`` and tagged
    with an ``SN_Type`` column holding the type's folder name.

    Args:
        base_path: Root directory containing one sub-folder per supernova type.
        supernova_types: Optional iterable of type folder names to load.
            Defaults to ``'Type II'``, ``'Type IIP'``, ``'Type Ia'``,
            ``'Type Ib'`` and ``'Type Ic'``.

    Returns:
        A single DataFrame with the rows of all readable files (fresh
        0..n-1 index) and an added ``SN_Type`` label column.

    Raises:
        ValueError: If no CSV file could be loaded.
    """
    if supernova_types is None:
        supernova_types = ('Type II', 'Type IIP', 'Type Ia', 'Type Ib', 'Type Ic')

    data_frames = []

    for supernova_type in supernova_types:
        folder_path = os.path.join(base_path, supernova_type, 'CSV Files')

        # isdir rather than exists: a stray *file* with this name must be
        # skipped, otherwise os.listdir would raise.
        if not os.path.isdir(folder_path):
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order of the combined frame reproducible across runs and machines.
        for filename in sorted(os.listdir(folder_path)):
            # Case-insensitive match so e.g. 'DATA.CSV' is picked up as well.
            if not filename.lower().endswith('.csv'):
                continue

            file_path = os.path.join(folder_path, filename)
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading {filename}: {e}")
                continue

            df['SN_Type'] = supernova_type  # Add type label
            data_frames.append(df)

    if not data_frames:
        raise ValueError("No data files found in the specified directories")

    return pd.concat(data_frames, ignore_index=True)

# Load the data; if loading fails for any reason, fall back to a tiny
# hand-written sample so the rest of the script can still be demonstrated.
try:
    sn_data = load_supernova_data(base_path)
except Exception as load_error:
    print(f"Error loading data: {load_error}")
    # For demonstration, we'll create a sample dataset with the expected columns
    print("Creating sample data for demonstration purposes...")
    sample_columns = [
        'Gname', 'Morph_Type', 'T_Type', 'Bmag', 'RA_Offset',
        'Dec_Offset', 'Maxmag', 'Band_Maxmag', 'Radial_Velocity', 'SN_Type',
    ]
    # One tuple per supernova, values in the order of sample_columns.
    sample_rows = [
        ('NGC1234', 'E', -5, 12.5, 15.2, 10.5, 14.2, 'B', 2500, 'Type Ia'),
        ('IC5678', 'S0', -2, 13.2, 8.7, 6.2, 15.8, 'V', 1800, 'Type Ia'),
        ('UGC9012', 'Sc', 5, 14.8, 22.1, 18.9, 16.5, 'R', 3200, 'Type II'),
        ('NGC3456', 'Sb', 3, 15.1, 5.3, 4.1, 17.1, 'I', 4200, 'Type IIP'),
        ('MCG7890', 'Irr', 10, 16.3, 30.5, 25.7, 18.0, 'B', 1500, 'Type Ib'),
    ]
    sn_data = pd.DataFrame(sample_rows, columns=sample_columns)

# Data preprocessing
def preprocess_data(df):
    """Build the modelling table from the raw supernova rows.

    Keeps the columns used downstream plus the ``SN_Type`` label, discards
    every row that has a missing value in any of them, and appends
    ``Total_Offset``: the Euclidean norm of ``RA_Offset`` and ``Dec_Offset``.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    selected_columns = [
        'Gname', 'Morph_Type', 'T_Type', 'Bmag',
        'RA_Offset', 'Dec_Offset', 'Maxmag',
        'Band_Maxmag', 'Radial_Velocity', 'SN_Type',
    ]

    # Column subset with incomplete rows removed (dropna returns a new frame).
    cleaned = df[selected_columns].dropna()

    # SN types are kept as they are. They could be merged into broader groups
    # at this point, e.g. by mapping 'Type IIP' onto 'Type II'.

    # Derived feature: combined offset from the two offset components.
    squared_offset = cleaned['RA_Offset'] ** 2 + cleaned['Dec_Offset'] ** 2
    return cleaned.assign(Total_Offset=np.sqrt(squared_offset))

processed_data = preprocess_data(sn_data)

# Prepare features and target
X = processed_data.drop('SN_Type', axis=1)
y = processed_data['SN_Type']

# Split data.
# A stratified split needs at least two rows of every class, and both the
# train and the test part must have room for at least one row per class;
# otherwise train_test_split raises ValueError. That happens with the 5-row
# demonstration sample and with any SN type that has a single usable row.
# In that case fall back to a plain shuffled split and say so.
test_fraction = 0.2
class_counts = y.value_counts()
n_test = int(np.ceil(test_fraction * len(y)))  # same rounding scikit-learn applies
can_stratify = (
    not class_counts.empty
    and class_counts.min() >= 2
    and n_test >= len(class_counts)
    and len(y) - n_test >= len(class_counts)
)
if not can_stratify:
    print("Warning: too few samples per class for a stratified split; "
          "using a non-stratified split instead.")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=42,
    stratify=y if can_stratify else None)

# Column groups fed to the preprocessing step
numeric_features = [
    'T_Type', 'Bmag', 'RA_Offset', 'Dec_Offset',
    'Maxmag', 'Radial_Velocity', 'Total_Offset',
]
# NOTE(review): 'Gname' looks like a per-galaxy identifier; one-hot encoding
# it may let the model memorise individual hosts — confirm it is intended as
# a feature.
categorical_features = ['Gname', 'Morph_Type', 'Band_Maxmag']

# Numeric columns are standardised, categorical columns one-hot encoded.
# handle_unknown='ignore' keeps categories that were not seen during fitting
# from raising an error at transform time.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Classifier: seeded random forest with balanced class weights
forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
)

# Full model: preprocessing followed by the classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Fit the whole pipeline (preprocessing + classifier) on the training split
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)

print("Classification Report:")
report = classification_report(y_test, y_pred)
print(report)

# Feature importance analysis.
# Names are assembled in the order the transformers are listed in
# `preprocessor`: the numeric columns first, then the one-hot columns.
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
encoded_names = fitted_encoder.get_feature_names_out(categorical_features)
feature_names = numeric_features + list(encoded_names)

importances = model.named_steps['classifier'].feature_importances_
sorted_idx = np.argsort(importances)[::-1]  # most important first

print("\nTop 10 Important Features:")
top_features = ((feature_names[i], importances[i]) for i in sorted_idx[:10])
for name, importance in top_features:
    print(f"{name}: {importance:.4f}")

# Visualization
def plot_confusion_matrix(y_true, y_pred, classes):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    ``classes`` is passed as the label list of the matrix and is also used
    for the tick labels on both axes. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=classes)

    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,   # write the count into every cell
        fmt='d',      # format annotations as integers
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

# Plot the test-split confusion matrix, using the class labels exposed by the
# fitted pipeline for both the matrix order and the tick labels.
plot_confusion_matrix(y_test, y_pred, model.classes_)