In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the galaxy catalogue array from the .npy file (path is relative to the
# notebook's working directory).
catalogue_path = 'Galaxy Catalogue.npy'
data = np.load(catalogue_path)

In [None]:
# Build the working DataFrame from the loaded catalogue array.
df = pd.DataFrame(data)

# The m4_* columns use -9999 as a "no measurement" sentinel (it shows up as the
# column minimum in describe()). Convert it to NaN so it is treated as missing
# data instead of a real, extreme value by the statistics, the plots and the
# scaler; the median imputer in the modelling pipeline then fills these gaps.
m4_cols = [col for col in df.columns if str(col).startswith('m4_')]
df[m4_cols] = df[m4_cols].replace(-9999, np.nan)

In [None]:
# Report the frame's dimensions, then the per-column dtype / non-null summary.
n_rows_cols = df.shape
print("Dataset shape:", n_rows_cols)
print("\nDataset info:")
df.info()

Dataset shape: (780, 17)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   u-g         780 non-null    float64
 1   g-r         780 non-null    float64
 2   r-i         780 non-null    float64
 3   i-z         780 non-null    float64
 4   ecc         780 non-null    float64
 5   m4_u        780 non-null    float64
 6   m4_g        780 non-null    float64
 7   m4_r        780 non-null    float64
 8   m4_i        780 non-null    float64
 9   m4_z        780 non-null    float64
 10  petroR50_u  780 non-null    float64
 11  petroR50_r  780 non-null    float64
 12  petroR50_z  780 non-null    float64
 13  petroR90_u  780 non-null    float64
 14  petroR90_r  780 non-null    float64
 15  petroR90_z  780 non-null    float64
 16  class       780 non-null    object 
dtypes: float64(16), object(1)
memory usage: 103.7+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
summary_stats = df.describe()
print("\nBasic statistics:", summary_stats, sep="\n")


Basic statistics:
              u-g         g-r         r-i         i-z         ecc  \
count  780.000000  780.000000  780.000000  780.000000  780.000000   
mean     1.633556    0.751285    0.390027    0.256630    0.713827   
std      0.680241    0.333986    0.094902    0.434369    0.171010   
min     -8.202350   -4.708600   -0.630590   -7.698340    0.226894   
25%      1.333080    0.620432    0.353687    0.227997    0.592427   
50%      1.701755    0.789610    0.400985    0.294780    0.744386   
75%      1.914082    0.905282    0.430363    0.326920    0.852673   
max     10.624040    1.539170    1.390840    4.879330    0.999812   

              m4_u         m4_g         m4_r         m4_i         m4_z  \
count   780.000000   780.000000   780.000000   780.000000   780.000000   
mean    -36.228336   -10.499354   -23.280043   -36.084795   -48.904677   
std     619.454396   358.104842   506.113635   619.462662   714.833597   
min   -9999.000000 -9999.000000 -9999.000000 -9999.000000 -9999

In [None]:
# Peek at the first five rows of the catalogue.
first_rows = df.head(5)
print(first_rows)

       u-g      g-r      r-i      i-z       ecc      m4_u      m4_g      m4_r  \
0  1.85765  0.67158  0.42310  0.30610  0.585428  2.251946  2.339849  2.380652   
1  1.74259  0.86085  0.44927  0.28851  0.749812  2.031566  2.084156  2.092272   
2  2.10697  0.96710  0.56810  0.33747  0.630089  1.992887  2.450746  2.473116   
3  1.22840  0.60446  0.39049  0.28913  0.668999  2.056115  2.182252  2.309510   
4  1.80792  0.78124  0.45528  0.35067  0.500170  2.259167  2.194723  2.264567   

       m4_i      m4_z  petroR50_u  petroR50_r  petroR50_z  petroR90_u  \
0  2.359738  2.395528    3.095123    3.818919    3.826230    5.174814   
1  2.124075  2.133154    1.430436    1.422533    1.385727    3.732712   
2  2.465324  2.438683    3.099957    3.623704    3.463300    9.110857   
3  2.391414  2.472397    3.627267    4.063471    3.717708    9.580675   
4  2.334713  2.352568    3.256751    3.153533    2.254511    9.206746   

   petroR90_r  petroR90_z   class  
0    8.263009   11.477340  merger  
1 

In [None]:
# Count NaN entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

u-g           0
g-r           0
r-i           0
i-z           0
ecc           0
m4_u          0
m4_g          0
m4_r          0
m4_i          0
m4_z          0
petroR50_u    0
petroR50_r    0
petroR50_z    0
petroR90_u    0
petroR90_r    0
petroR90_z    0
class         0
dtype: int64


# **Understanding the Colors**

In [None]:
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots


# Colour cycle reused by the figures below: the reversed Plasma scale.
colors = px.colors.sequential.Plasma_r

# Colour-colour diagram: u-g on the y-axis against g-r on the x-axis, one
# trace per galaxy class, with r-i and i-z added to the hover tooltip.
fig = px.scatter(
    df,
    x='g-r',
    y='u-g',
    color='class',
    hover_data=['r-i', 'i-z'],
    labels={'g-r': 'g-r color', 'u-g': 'u-g color'},
    title='Galactic Color-Color Diagram with u-g vs g-r',
    color_discrete_sequence=colors,
)

# Dark theme: near-black plot and paper backgrounds with white text.
dark_background = 'rgba(0, 0, 0, 0.95)'
legend_style = dict(
    title=dict(text='Galaxy Class', font=dict(color='white')),
    font=dict(color='white'),
)
fig.update_layout(
    plot_bgcolor=dark_background,
    paper_bgcolor=dark_background,
    font=dict(color='white'),
    title=dict(font=dict(size=24, color='white')),
    legend=legend_style,
)

# Same faint white grid on both axes.
grid_style = dict(showgrid=True, gridwidth=0.5, gridcolor='rgba(255, 255, 255, 0.1)')
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)

# Size-8 markers with a thin white outline on every trace.
marker_style = dict(size=8, line=dict(width=1, color='white'))
fig.update_traces(marker=marker_style)

# Caption box placed near the top-left corner, in paper coordinates.
caption = dict(
    text="Galaxy classification based on color indices",
    xref="paper",
    yref="paper",
    x=0.02,
    y=0.98,
    showarrow=False,
    font=dict(size=12, color="white"),
    align="left",
    bgcolor="rgba(0, 0, 0, 0.5)",
    bordercolor="white",
    borderwidth=1,
)
fig.add_annotation(**caption)

fig.show()

# **Galaxy Size vs Brightness**

In [None]:
# Scatter of r-band R90 (petroR90_r) against m4_r, one trace per galaxy class,
# with a rolling-mean trend line drawn on top.

# m4_r uses -9999 as a "no measurement" sentinel (see the describe() output).
# Rows carrying it (or NaN) are left out of this figure; otherwise the x-axis
# stretches to -9999 and every real point collapses onto the right-hand edge.
M4_SENTINEL = -9999
size_brightness = df[df['m4_r'] > M4_SENTINEL]

fig = px.scatter(size_brightness, x='m4_r', y='petroR90_r', color='class',
                 color_discrete_sequence=colors,
                 title='Cosmic Dimensions: Galaxy Size vs Brightness',
                 labels={'m4_r': 'Brightness (r-band)', 'petroR90_r': 'Size (R90 in r-band)'},
                 hover_data=['ecc'])

# Dark theme: near-black backgrounds with white text.
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0.95)',
    paper_bgcolor='rgba(0, 0, 0, 0.95)',
    font=dict(color='white'),
    title=dict(font=dict(size=24, color='white')),
    legend=dict(
        title=dict(text='Galaxy Class', font=dict(color='white')),
        font=dict(color='white')
    )
)

# Faint white grid and slightly larger axis titles.
fig.update_xaxes(
    showgrid=True, gridwidth=0.5, gridcolor='rgba(255, 255, 255, 0.1)',
    title_font=dict(size=14)
)
fig.update_yaxes(
    showgrid=True, gridwidth=0.5, gridcolor='rgba(255, 255, 255, 0.1)',
    title_font=dict(size=14)
)

# Style only the marker traces (the trend line is added afterwards).
fig.update_traces(
    marker=dict(size=8, opacity=0.7, line=dict(width=1, color='rgba(255, 255, 255, 0.5)')),
    selector=dict(mode='markers')
)

# Caption box near the top-left corner, in paper coordinates.
fig.add_annotation(
    text="Explore the relationship between galaxy size and brightness",
    xref="paper", yref="paper",
    x=0.02, y=0.98,
    showarrow=False,
    font=dict(size=12, color="white"),
    align="left",
    bgcolor="rgba(0, 0, 0, 0.5)",
    bordercolor="white",
    borderwidth=1
)

# Trend line: the rolling mean must be taken over rows ordered by the x value.
# Computing it in the catalogue's row order (as before) averages unrelated
# galaxies and draws a zig-zag that says nothing about size vs m4_r.
trend = size_brightness.sort_values('m4_r')
fig.add_trace(go.Scatter(
    x=trend['m4_r'],
    # The first 49 points are NaN (incomplete window) and are simply not drawn.
    y=trend['petroR90_r'].rolling(window=50).mean(),
    mode='lines',
    line=dict(color='rgba(255, 255, 255, 0.5)', width=2),
    name='Trend'
))

fig.show()

# **Eccentricity Distribution**

In [None]:
# Eccentricity histogram (50 bins) split by galaxy class, with a marginal box
# plot above the main panel.
fig3 = px.histogram(
    df,
    x='ecc',
    color='class',
    nbins=50,
    marginal='box',
    labels={'ecc': 'Eccentricity'},
    title='Cosmic Shapes: Distribution of Galaxy Eccentricity',
    color_discrete_sequence=colors,
)

# Dark theme: near-black backgrounds with white text.
dark_background = 'rgba(0, 0, 0, 0.95)'
fig3.update_layout(
    plot_bgcolor=dark_background,
    paper_bgcolor=dark_background,
    font=dict(color='white'),
    title=dict(font=dict(size=24, color='white')),
    legend=dict(
        title=dict(text='Galaxy Class', font=dict(color='white')),
        font=dict(color='white'),
    ),
)

# Same axis styling (title size and faint grid) on x and y.
axis_style = dict(
    title_font=dict(size=14),
    showgrid=True,
    gridwidth=0.5,
    gridcolor='rgba(255, 255, 255, 0.1)',
)
fig3.update_xaxes(**axis_style)
fig3.update_yaxes(**axis_style)

# Semi-transparent bars for the histogram traces only ...
fig3.update_traces(opacity=0.7, selector=dict(type='histogram'))
# ... and white lines for the marginal box traces only.
fig3.update_traces(line=dict(color='white'), selector=dict(type='box'))

fig3.show()

# **Relationship Between Size and Color**

In [None]:
# Concentration index: r-band R90 divided by r-band R50.
df['concentration'] = df['petroR90_r'].div(df['petroR50_r'])

In [None]:
# Concentration against g-r colour, one trace per galaxy class, with ecc and
# m4_r shown on hover.
fig4 = px.scatter(
    df,
    x='g-r',
    y='concentration',
    color='class',
    hover_data=['ecc', 'm4_r'],
    labels={'g-r': 'g-r Color', 'concentration': 'Concentration (R90/R50, r-band)'},
    title='Relationship of Galaxy Concentration and Color',
    color_discrete_sequence=colors,
)

# Dark theme: near-black backgrounds with white text.
dark_background = 'rgba(0, 0, 0, 0.95)'
fig4.update_layout(
    plot_bgcolor=dark_background,
    paper_bgcolor=dark_background,
    font=dict(color='white'),
    title=dict(font=dict(size=24, color='white')),
    legend=dict(
        title=dict(text='Galaxy Class', font=dict(color='white')),
        font=dict(color='white'),
    ),
)

# Same axis styling (title size and faint grid) on x and y.
axis_style = dict(
    title_font=dict(size=14),
    showgrid=True,
    gridwidth=0.5,
    gridcolor='rgba(255, 255, 255, 0.1)',
)
fig4.update_xaxes(**axis_style)
fig4.update_yaxes(**axis_style)

# Style the marker traces only; the fit lines are added afterwards.
point_style = dict(size=8, opacity=0.7, line=dict(width=1, color='rgba(255, 255, 255, 0.5)'))
fig4.update_traces(marker=point_style, selector=dict(mode='markers'))

# One dashed least-squares straight line per class (concentration ~ g-r),
# evaluated at that class's own g-r values.
for class_name in df['class'].unique():
    members = df.loc[df['class'] == class_name]
    colour_values = members['g-r']
    coeffs = np.polyfit(colour_values, members['concentration'], 1)
    fitted = np.polyval(coeffs, colour_values)
    fit_line = go.Scatter(
        x=colour_values,
        y=fitted,
        mode='lines',
        name=f'Trend ({class_name})',
        line=dict(dash='dash'),
    )
    fig4.add_trace(fit_line)

fig4.show()

In [None]:
# 3D scatter of the galaxies in (u-g, g-r, r-i) colour space, one trace per class.
# Fix: the title literal previously ended with a stray double quote that was
# rendered in the figure title.
fig5 = px.scatter_3d(df, x='u-g', y='g-r', z='r-i', color='class',
                     color_discrete_sequence=colors,
                     title='Distribution of galaxies in 3D color space',
                     labels={'u-g': 'u-g Color', 'g-r': 'g-r Color', 'r-i': 'r-i Color'})


# Dark theme: white axis-title fonts inside the 3D scene, near-black scene and
# paper backgrounds, and a semi-transparent legend box.
fig5.update_layout(
    scene=dict(
        xaxis_title=dict(font=dict(size=12, color='white')),
        yaxis_title=dict(font=dict(size=12, color='white')),
        zaxis_title=dict(font=dict(size=12, color='white')),
        bgcolor='rgba(0, 0, 0, 0.95)',
    ),
    paper_bgcolor='rgba(0, 0, 0, 0.95)',
    font=dict(color='white'),
    title=dict(font=dict(size=24, color='white')),
    legend=dict(
        title=dict(text='Galaxy Class', font=dict(color='white')),
        font=dict(color='white'),
        bgcolor='rgba(0, 0, 0, 0.5)'
    )
)

# Small, semi-transparent markers with a faint outline.
fig5.update_traces(
    marker=dict(size=4, opacity=0.7, line=dict(width=0.5, color='rgba(255, 255, 255, 0.5)')),
    selector=dict(mode='markers')
)


fig5.show()

In [None]:
# Box plot of r-band R90 for each galaxy class.
fig6 = px.box(
    df,
    x='class',
    y='petroR90_r',
    color='class',
    labels={'petroR90_r': 'Size (R90 in r-band)', 'class': 'Galaxy Class'},
    title='Distribution of Galaxy Sizes by Class',
    color_discrete_sequence=colors,
)

# plotly_dark template with a transparent plot area on a dark grey page.
fig6.update_layout(
    template='plotly_dark',
    font=dict(size=12, color='white'),
    title=dict(font=dict(size=18, color='white')),
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(17, 17, 17, 1)',
)

# White axis titles and tick labels; the y-axis also gets a faint grid.
white_axis_text = dict(title_font=dict(size=14, color='white'), tickfont=dict(color='white'))
fig6.update_xaxes(**white_axis_text)
fig6.update_yaxes(gridcolor='rgba(255, 255, 255, 0.1)', **white_axis_text)

# Two tiny, almost transparent star markers placed at (first class, smallest
# R90) and (last class, largest R90), with hover disabled.
class_extremes = [min(df['class']), max(df['class'])]
size_extremes = [min(df['petroR90_r']), max(df['petroR90_r'])]
faint_star = dict(color='rgba(255, 255, 255, 0.1)', size=2, symbol='star')
fig6.add_trace(go.Scatter(
    x=class_extremes,
    y=size_extremes,
    mode='markers',
    marker=faint_star,
    hoverinfo='none',
))

fig6.show()

In [None]:
# Derived columns used by the remaining figures and by the model.
# u-r colour is the sum of the two adjacent colour indices.
df['u-r'] = df['u-g'].add(df['g-r'])
# Ratio of r-band R90 to r-band R50.
df['concentration'] = df['petroR90_r'].div(df['petroR50_r'])
# Plain alias of the r-band R90 column.
df['size'] = df['petroR90_r']
# Sign-flipped m4_r.
df['brightness'] = np.negative(df['m4_r'])
# Ratio of the u-band to the r-band R50.
df['color_gradient'] = df['petroR50_u'].div(df['petroR50_r'])

In [None]:
# Split the galaxies into three equally populated bins of `size`.
size_labels = ['Small', 'Medium', 'Large']
df['size_category'] = pd.qcut(df['size'], q=3, labels=size_labels)

# Concentration against eccentricity, one facet column per size bin and one
# colour per galaxy class; brightness and u-r are shown on hover.
fig7 = px.scatter(
    df,
    x='ecc',
    y='concentration',
    color='class',
    facet_col='size_category',
    hover_data=['brightness', 'u-r'],
    labels={
        'ecc': 'Eccentricity',
        'concentration': 'Concentration (R90/R50)',
        'size_category': 'Galaxy Size Category',
    },
    title='Eccentricity vs. Concentration for Different Galaxy Sizes',
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig7.update_layout(template='plotly_dark')
fig7.show()

In [None]:
# 2x2 grid of colour-colour panels; every point is coloured by `brightness`
# and carries its class as hover text.
panel_titles = ('u-g vs g-r', 'g-r vs r-i', 'r-i vs i-z', 'u-g vs i-z')
axis_pairs = [('g-r', 'u-g'), ('r-i', 'g-r'), ('i-z', 'r-i'), ('i-z', 'u-g')]

fig8 = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)
for i, (x_col, y_col) in enumerate(axis_pairs):
    # Panels are filled row by row: index 0..3 -> (row, col) on a 2-wide grid.
    grid_row, grid_col = divmod(i, 2)
    panel_markers = dict(
        color=df['brightness'],
        colorscale='Viridis',
        showscale=(i == 3),  # only the last panel draws the shared colour bar
        size=5,
    )
    panel_points = go.Scatter(
        x=df[x_col],
        y=df[y_col],
        mode='markers',
        marker=panel_markers,
        text=df['class'],
        name='',
    )
    fig8.add_trace(panel_points, row=grid_row + 1, col=grid_col + 1)

fig8.update_layout(height=800, title_text="Spectral Index Relationships", template='plotly_dark')
fig8.show()

In [None]:
# Parallel-coordinates view of eight properties; each line is coloured by u-r.
dimensions = ['u-g', 'g-r', 'r-i', 'i-z', 'ecc', 'concentration', 'brightness', 'size']

# One axis per property, spanning that column's observed min..max.
parallel_axes = []
for dim in dimensions:
    column = df[dim]
    parallel_axes.append(dict(range=[column.min(), column.max()], label=dim, values=column))

line_colouring = dict(color=df['u-r'], colorscale='Jet', showscale=True)
fig9 = go.Figure(data=go.Parcoords(line=line_colouring, dimensions=parallel_axes))
fig9.update_layout(title='Multi dimensional Galaxy Properties', template='plotly_dark')
fig9.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from scipy.stats import randint, uniform
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Target is the galaxy class; every other column of df is a candidate feature.
y = df['class']
X = df.drop(columns='class')

In [None]:
# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
split_parts = train_test_split(X, y, random_state=32, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# **Advanced Stacking Classifier Model**



In [None]:
# Column names routed to the numeric and to the categorical preprocessing
# branch. Columns of any other dtype are in neither list.
numeric_part = X.select_dtypes(include=['int64', 'float64'])
object_part = X.select_dtypes(include=['object'])
num_f = numeric_part.columns
cat_f = object_part.columns

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode, ignoring categories not seen during fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)

# Apply each branch to its own column list.
branches = [
    ('num', num_transformer, num_f),
    ('cat', cat_transformer, cat_f),
]
preprocessor = ColumnTransformer(transformers=branches)

In [None]:
# Final estimator of the stack: a random forest with a fixed seed.
meta_seed = 32
meta_learner = RandomForestClassifier(random_state=meta_seed)

In [None]:
# Base estimators of the stack as (name, estimator) pairs; the names are the
# prefixes used in the hyper-parameter search space
# (classifier__rf__..., classifier__xgb__..., classifier__lgbm__...).
models = [
    ('rf', RandomForestClassifier(random_state=32)),
    ('xgb', XGBClassifier(random_state=32)),
    # verbose=-1 silences LightGBM's per-fit [Info]/[Warning] lines, which
    # otherwise flood the notebook with thousands of log lines during the
    # randomized search. It does not change the fitted model.
    ('lgbm', LGBMClassifier(random_state=32, verbose=-1)),
]


In [None]:
from sklearn.ensemble import StackingClassifier

# Stack the base models: their 5-fold cross-validated class probabilities are
# the inputs of the meta learner.
stacking_settings = dict(cv=5, stack_method='predict_proba')
classifier = StackingClassifier(
    estimators=models,
    final_estimator=meta_learner,
    **stacking_settings,
)

In [None]:
# End-to-end model: preprocessing -> feature selection driven by random-forest
# importances -> stacking classifier.
# Fix: the last step previously referenced `stacking_classifier`, a name that
# is never defined in this notebook (NameError on a fresh kernel); the stacking
# model built above is called `classifier`.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=32))),
    ('classifier', classifier)
])

In [None]:
# Search space for the randomized search, grouped by the estimator it tunes.
# scipy's randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
rf_space = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
}
xgb_space = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
}
lgbm_space = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.3),
    'num_leaves': randint(20, 3000),
    'max_depth': randint(3, 12),
}
final_estimator_space = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(3, 10),
}

# Prefix every hyper-parameter with the pipeline path of its estimator.
param_dist = {}
for prefix, space in [
    ('classifier__rf__', rf_space),
    ('classifier__xgb__', xgb_space),
    ('classifier__lgbm__', lgbm_space),
    ('classifier__final_estimator__', final_estimator_space),
]:
    for param_name, distribution in space.items():
        param_dist[prefix + param_name] = distribution

In [None]:
# Randomized hyper-parameter search over the whole pipeline: 100 sampled
# candidates, 5-fold CV each, ranked by weighted F1, using all CPU cores.
search_settings = dict(
    n_iter=100,
    cv=5,
    scoring='f1_weighted',
    random_state=32,
    n_jobs=-1,
    verbose=2,
)
random_search = RandomizedSearchCV(pipeline, param_dist, **search_settings)

In [None]:
# Run the search on the training split only (this is the expensive cell).
random_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.


invalid value encountered in cast



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1168
[LightGBM] [Info] Number of data points in the train set: 499, number of used features: 7
[LightGBM] [Info] Start training from score -1.059315
[LightGBM] [Info] Start training from score -1.143702
[LightGBM] [Info] Start training from score -1.094612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1167
[LightGBM] [Info] Number of data points in the train set: 499, number of used features: 7
[LightGBM] [Info] Start training from score -1.059315
[LightGBM] [Info] Start training from score -1.143702
[LightGBM] [Info] Start training from score -1.094612
[LightGBM] [Info] Auto-choosing col-wis

In [None]:
# Winning hyper-parameters and their mean cross-validated score.
best_params = random_search.best_params_
best_cv_score = random_search.best_score_
print("Best parameters\n", best_params)
print("Best cross-validation score\n", best_cv_score)

Best parameters
 {'classifier__final_estimator__max_depth': 3, 'classifier__final_estimator__n_estimators': 103, 'classifier__lgbm__learning_rate': 0.1977196362952926, 'classifier__lgbm__max_depth': 6, 'classifier__lgbm__n_estimators': 386, 'classifier__lgbm__num_leaves': 2497, 'classifier__rf__max_depth': 46, 'classifier__rf__min_samples_leaf': 7, 'classifier__rf__min_samples_split': 8, 'classifier__rf__n_estimators': 996, 'classifier__xgb__colsample_bytree': 0.902461254551477, 'classifier__xgb__learning_rate': 0.2738948199775719, 'classifier__xgb__max_depth': 3, 'classifier__xgb__n_estimators': 214, 'classifier__xgb__subsample': 0.8279236766601542}
Best cross-validation score
 0.8921140535446739


In [None]:
# Take the best pipeline found by the search and predict the held-out test split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report")
print(report_text)


Classification Report
              precision    recall  f1-score   support

  elliptical       0.90      0.98      0.93        44
      merger       0.81      0.64      0.72        61
      spiral       0.72      0.84      0.77        51

    accuracy                           0.80       156
   macro avg       0.81      0.82      0.81       156
weighted avg       0.80      0.80      0.80       156



In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("\nConfusion Matrix")
print(test_confusion)


Confusion Matrix
[[43  1  0]
 [ 5 39 17]
 [ 0  8 43]]
