In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Load the SDSS query exports "SDSS (1).csv" ... "SDSS (17).csv" from the
# working directory and stack them into a single frame.
file_pattern = "SDSS ({num}).csv"
file_numbers = range(1, 18)  # upper bound is exclusive: files 1..17
files = [file_pattern.format(num=num) for num in file_numbers]

# Read every file first and concatenate once. Calling pd.concat inside the
# loop re-copies the accumulated frame on each iteration (quadratic work).
sdss_frames = [pd.read_csv(file) for file in files]
sdss_combined = pd.concat(sdss_frames, ignore_index=True)

# NOTE: the number printed here is rows * columns, i.e. the total cell count.
print(f"Dimensionality: {np.shape(sdss_combined)[0] * np.shape(sdss_combined)[1]}")
sdss_combined

In [None]:
# Columns removed before feature engineering.
# NOTE(review): these look like survey bookkeeping / quality fields rather than
# physical measurements -- confirm against the SDSS schema.
dropped_columns = [
    'plate', 'mjd', 'fiberid', 'zErr', 'zWarning', 'bestObjID',
    'objType', 'run', 'rerun', 'camCol', 'field', 'obj',
]

# DataFrame.drop returns a new frame, so sdss_combined itself is left untouched.
sdss_cleaned = sdss_combined.drop(columns=dropped_columns)
sdss_cleaned

In [None]:
# Derive colour indices: each new column is the difference between two
# magnitude columns, computed for both the PSF and the Petrosian magnitudes.
# Each entry is (new column, minuend column, subtrahend column); the list order
# fixes the order in which the new columns are appended.
color_definitions = [
    ('psf_u-g', 'psfMag_u', 'psfMag_g'),
    ('psf_g-r', 'psfMag_g', 'psfMag_r'),
    ('psf_r-i', 'psfMag_r', 'psfMag_i'),
    ('psf_i-z', 'psfMag_i', 'psfMag_z'),
    ('petro_u-g', 'petroMag_u', 'petroMag_g'),
    ('petro_g-r', 'petroMag_g', 'petroMag_r'),
    ('petro_r-i', 'petroMag_r', 'petroMag_i'),
    ('petro_i-z', 'petroMag_i', 'petroMag_z'),
]

# Work on a copy so sdss_cleaned keeps its original set of columns.
sdss_processed = sdss_cleaned.copy()
for color_name, first_band, second_band in color_definitions:
    sdss_processed[color_name] = sdss_processed[first_band] - sdss_processed[second_band]

In [None]:
# Encode the 'class' label as an integer 'Target' column and drop the string label.
class_codes = {
    'STAR': 0,    # 0 stands for stars
    'GALAXY': 1,  # 1 stands for galaxies
    'QSO': 2,     # 2 stands for quasars
}

sdss_final = sdss_processed.copy()
encoded_target = sdss_final['class'].map(class_codes)

# Any label outside class_codes (or a missing label) maps to NaN. Fail here with
# a message naming the offending labels instead of letting astype('int') raise
# an opaque "cannot convert non-finite values" error.
unmapped_labels = sdss_final.loc[encoded_target.isna(), 'class'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected values in 'class' column: {list(unmapped_labels)}")

sdss_final['Target'] = encoded_target.astype('int')
sdss_final = sdss_final.drop(columns=['class'])
sdss_final

Feature Selection

In [None]:
# One-way ANOVA per feature: the rows are split by Target value and the
# per-class samples of each feature are compared with scipy.stats.f_oneway.
# Features with a low p-value are more likely to be related to the target variable.
X = sdss_final.drop('Target', axis=1)
Y = sdss_final['Target']

class_labels = Y.unique()
anova_results = {}
for column in X.columns:
    samples_by_class = [X.loc[Y == label, column] for label in class_labels]
    # Only the p-value is kept; the F statistic itself is not used.
    _, p_value = stats.f_oneway(*samples_by_class)
    anova_results[column] = p_value

# Report the p-values in the same order as the columns of X.
for feature in anova_results:
    print(f"Feature: {feature}, ANOVA p-value: {anova_results[feature]}")

In [None]:
# Mutual information (MI) between every feature in X and the class label Y.
MI_RANDOM_STATE = 0  # fixed seed so the scores and ranking are reproducible
MI_THRESHOLD = 0.1   # position of the visual reference line drawn on the plot

# mutual_info_classif adds small random noise to continuous features, so
# without a fixed random_state the scores change from run to run.
mi_scores = mutual_info_classif(X, Y, random_state=MI_RANDOM_STATE)
indices = np.argsort(mi_scores)[::-1]  # feature positions, highest score first

# Sorting feature names based on MI scores
sorted_feature_names = [X.columns[i] for i in indices]

# Creating horizontal bar plot
plt.figure(figsize=(10, 6))
plt.title("Mutual Information Scores")
plt.barh(range(X.shape[1]), mi_scores[indices], color="b", align="center")
plt.yticks(range(X.shape[1]), sorted_feature_names)
plt.xlabel("Mutual information with Target")
plt.gca().invert_yaxis()  # To display the highest score at the top

# Drawing a red dashed reference line at the threshold
plt.axvline(x=MI_THRESHOLD, color='r', linestyle='--')

plt.show()

In [None]:
# Feature pruning rationale:
#   * Based on the ANOVA analysis, the (r-i) and (i-z) colors are not contributing
#     much to the results, so both the PSF and the Petrosian versions are removed.
#   * The MI scores suggest that none of the Petrosian magnitudes contribute much
#     to the prediction either. They are kept anyway: on physical grounds the
#     Petrosian magnitudes should carry some information about the surface flux of
#     the galaxies, so the MI scores are deliberately disregarded for them.
# NOTE: this cell rebinds sdss_final, so running it a second time raises a
# KeyError because the columns are already gone.
low_signal_colors = ['psf_r-i', 'psf_i-z', 'petro_r-i', 'petro_i-z']
sdss_final = sdss_final.drop(columns=low_signal_colors)
sdss_final

In [None]:
# Report how many rows fall into each class (0 = stars, 1 = galaxies, 2 = quasars).
target_codes = sdss_final['Target']
n_stars = int((target_codes == 0).sum())
n_galaxies = int((target_codes == 1).sum())
n_quasars = int((target_codes == 2).sum())

print(f"Number of Stars: {n_stars}")
print(f"Number of Galaxies: {n_galaxies}")
print(f"Number of Quasars: {n_quasars}")

# Persist the processed table (without the index) to the working directory.
sdss_final.to_csv('SDSS_processed.csv', index=False)

Data Exploration

In [None]:
# Explore the data using the finalized dataframe
SDSS = sdss_final.copy()

# Color for each target code, plus the subsets of codes shown in the pairwise panels
target_colors = {0: '#377eb8', 1: '#4daf4a', 2: '#e41a1c'}  # blue, green, red
star_quasar = {0: '#377eb8', 2: '#e41a1c'}  # blue, red
galaxy_quasar = {1: '#4daf4a', 2: '#e41a1c'}  # green, red

# Per-row color for every object (not used by the panels below)
colors = SDSS['Target'].map(target_colors)

# Custom legend handles: one opaque marker per class, shared by all panels
legend_elements = [
    Line2D([0], [0], marker='o', color='w', label=class_label,
           markerfacecolor=target_colors[class_code], markersize=10)
    for class_code, class_label in ((0, 'Stars'), (1, 'Galaxies'), (2, 'Quasars'))
]
alphas = [0.1, 0.2, 0.2]  # scatter transparency, indexed by target code


def plot_color_color_panel(ax, data, targets, title):
    """Draw a (u-g) vs. (g-r) PSF color-color scatter for selected classes.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame with 'Target', 'psf_u-g' and 'psf_g-r' columns.
    targets : iterable of target codes to include, drawn in iteration order.
    title : title of the panel.

    Point color and transparency are looked up in the notebook-level
    ``target_colors`` and ``alphas``.
    """
    for target in targets:
        subset = data[data['Target'] == target]
        ax.scatter(subset['psf_u-g'], subset['psf_g-r'], c=target_colors[target],
                   alpha=alphas[target], label=target)
    ax.set_xlabel('u-g')
    ax.set_ylabel('g-r')
    # Every panel uses the same fixed limits so the panels are directly comparable.
    ax.set_xlim([-2, 6])
    ax.set_ylim([-1, 3])
    ax.set_title(title)
    ax.grid(True, alpha=0.3)


# Three panels side by side: star vs. quasar, galaxy vs. quasar, all classes.
panels = [
    (star_quasar, 'Star vs. Quasar Distribution'),
    (galaxy_quasar, 'Galaxy vs. Quasar Distribution'),
    (target_colors, 'Star vs. Galaxy vs. Quasar Distribution'),
]
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, (panel_targets, panel_title) in zip(axes, panels):
    plot_color_color_panel(ax, SDSS, panel_targets, panel_title)

# The legend is identical for all three panels, so it is created once per figure.
fig.legend(handles=legend_elements, loc='upper center', ncol=3)

# Show the three PSF color-color panels
plt.show()

In [None]:
# Strip plot of redshift per class: each class gets its own horizontal row
# (y = 1, 2, 3) and the points are jittered vertically to reduce overplotting.
category_numeric = {0: 1, 1: 2, 2: 3}
SDSS['Category_numeric'] = SDSS['Target'].map(category_numeric)

# Seed the global NumPy generator so the jitter is the same on every run
np.random.seed(0)

# Standard deviation of the vertical jitter
jitter_amount = 0.05

fig, ax = plt.subplots(figsize=(10, 6))

# One scatter call per class, in target_colors order (this also fixes the
# order in which the random jitter values are drawn).
for target, point_color in target_colors.items():
    subset = SDSS[SDSS['Target'] == target]
    jitter = np.random.normal(0, jitter_amount, size=len(subset))
    ax.scatter(subset['z'], subset['Category_numeric'] + jitter, c=point_color,
               alpha=alphas[target], label=target)

ax.set_yticks([1, 2, 3])
ax.set_yticklabels(['Stars', 'Galaxies', 'Quasars'])
ax.set_xlabel('Redshift (z)')
ax.set_ylabel('Category')
ax.set_title('Distribution of Objects by Distance')
ax.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.7)
# Reuse the opaque custom handles instead of the semi-transparent scatter artists
ax.legend(handles=legend_elements)
plt.show()