# **This is a test file**

In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
# Loading the test file

df = pd.read_csv('D:/Desktop/PROJECT/data/raw/Skyserver_CrossID2_3_2025 8_31_01 PM.csv')

# **SDSS Data Column Descriptions**

## **Column Names and Their Meanings**
| **Column Name** | **Description** |
|---------------|----------------|
| **objID** | Unique identifier assigned to each astronomical object in the SDSS catalog. |
| **ra** (Right Ascension) | The celestial equivalent of longitude, measuring the object's position along the celestial sphere in degrees. |
| **dec** (Declination) | The celestial equivalent of latitude, measuring the object's position above or below the celestial equator in degrees. |
| **z** (Redshift) | A measure of how much the object's light has been stretched due to the expansion of the universe. Higher **z** values indicate objects that are farther away. |
| **u, g, r, i, z1** | The magnitudes of the object in five different photometric bands of the SDSS system: |
| **u** | Ultraviolet (~355 nm wavelength). |
| **g** | Green (~475 nm wavelength). |
| **r** | Red (~622 nm wavelength). |
| **i** | Near-infrared (~763 nm wavelength). |
| **z1** | Deep infrared (~913 nm wavelength). |

## **Relevance to Dark Matter Research**
- **Redshift (z)** helps estimate the **distance** of objects and contributes to **large-scale structure studies**, which are crucial in dark matter research.
- **RA & Dec** allow for mapping object positions, useful in **clustering analyses** to identify gravitational lensing effects caused by dark matter.
- **Photometric Magnitudes (u, g, r, i, z1)** are used to classify objects (e.g., stars, galaxies, quasars) and can help determine **mass distributions** influenced by dark matter.



# Step 1: Data Loading & EDA

In [42]:
df.head(5)

Unnamed: 0,objID,ra,dec,z,u,g,r,i,z1
0,1.23767e+18,262.988865,2.802095,-9999,-9999.0,23.45081,24.22334,23.04416,-9999
1,1.23766e+18,264.897663,5.842619,-9999,14.62699,11.9029,11.38807,11.23265,-9999
2,1.23766e+18,283.270997,-2.016658,-9999,14.68502,13.70108,12.31082,13.91246,-9999
3,1.23766e+18,284.646119,-3.67526,-9999,14.64243,14.12167,13.01476,12.96271,-9999
4,1.23767e+18,262.82565,-13.749519,-9999,15.34654,14.53677,13.79858,9.610632,-9999


In [43]:
df.isnull().sum()

objID    0
ra       0
dec      0
z        0
u        0
g        0
r        0
i        0
z1       0
dtype: int64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   objID   500 non-null    float64
 1   ra      500 non-null    float64
 2   dec     500 non-null    float64
 3   z       500 non-null    int64  
 4   u       500 non-null    float64
 5   g       500 non-null    float64
 6   r       500 non-null    float64
 7   i       500 non-null    float64
 8   z1      500 non-null    int64  
dtypes: float64(7), int64(2)
memory usage: 35.3 KB


In [45]:
df.shape

(500, 9)

In [46]:
df.dropna(inplace=True)
df.shape

(500, 9)

In [47]:
df.columns

Index(['objID', 'ra', 'dec', 'z', 'u', 'g', 'r', 'i', 'z1'], dtype='object')

# Step 2: Data Preprocessing

In [48]:
if "objID" in df.columns:
    df = df.drop(columns=["objID"])

df.columns

Index(['ra', 'dec', 'z', 'u', 'g', 'r', 'i', 'z1'], dtype='object')

In [49]:
# Visualize outliers using a boxplot
# plt.figure(figsize=(10, 6))
# sns.boxplot(data=df[['u', 'g', 'r', 'i', 'z1', 'z']])
# plt.title("Boxplot of Features")
# plt.show()

In [50]:
# Normalize/Scale Features
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Apply scaling
df[['u', 'g', 'r', 'i', 'z1']] = scaler.fit_transform(df[['u', 'g', 'r', 'i', 'z1']])


In [58]:
# Splitting Data into Training & Testing Sets
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df[['u', 'g', 'r', 'i', 'z1']]
y = df['z']  # Predicting redshift

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm shapes
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (400, 5)
Testing data shape: (100, 5)


# Step 3: Model Training

In [57]:
# Training a Random Forest Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


Model Accuracy: 1.0000


# Troubleshooting

In [62]:
# Check correlation with target
correlation = df.corr()["z"].sort_values(ascending=False)
print(correlation)

ra    NaN
dec   NaN
z     NaN
u     NaN
g     NaN
r     NaN
i     NaN
z1    NaN
Name: z, dtype: float64


In [64]:
# 
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=50,  # Reduce number of trees
    max_depth=5,  # Limit tree depth
    min_samples_split=10,  # Require more samples to split
    min_samples_leaf=5,  # Avoid deep trees
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Updated Model Accuracy: {accuracy:.4f}")


Updated Model Accuracy: 1.0000


In [66]:
# cross-validation
from sklearn.model_selection import cross_val_score

rf_scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation
print(f"Cross-Validation Mean Accuracy: {rf_scores.mean():.4f}")


Cross-Validation Mean Accuracy: 1.0000
