In [42]:
import gzip

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [34]:
# Relative path of the gzip-compressed CSV that the loading cells read.
file_path = "./Data/gz2_hart16.csv.gz"

# Columns requested from the CSV: the three "debiased" answer columns of the
# "t01_smooth_or_features" question (built from their shared prefix/suffix),
# followed by the class string that the label is derived from.
columns_to_use = [
    f"t01_smooth_or_features_{answer}_debiased"
    for answer in (
        "a01_smooth",            # Smooth fraction
        "a02_features_or_disk",  # Features fraction
        "a03_star_or_artifact",  # Artifact fraction
    )
] + ["gz2_class"]  # Target

In [35]:
# Read only the header plus one data row so the full list of column names can
# be inspected cheaply before choosing which columns to load.
# pandas infers gzip compression from the ".gz" suffix and decodes the text as
# UTF-8, so no manual gzip.open() wrapper (which would decode with the
# platform's locale encoding) is needed.
data_preview = pd.read_csv(file_path, nrows=1)

print("Available columns:", data_preview.columns.tolist())

Available columns: ['dr7objid', 'ra', 'dec', 'rastring', 'decstring', 'sample', 'gz2_class', 'total_classifications', 'total_votes', 't01_smooth_or_features_a01_smooth_count', 't01_smooth_or_features_a01_smooth_weight', 't01_smooth_or_features_a01_smooth_fraction', 't01_smooth_or_features_a01_smooth_weighted_fraction', 't01_smooth_or_features_a01_smooth_debiased', 't01_smooth_or_features_a01_smooth_flag', 't01_smooth_or_features_a02_features_or_disk_count', 't01_smooth_or_features_a02_features_or_disk_weight', 't01_smooth_or_features_a02_features_or_disk_fraction', 't01_smooth_or_features_a02_features_or_disk_weighted_fraction', 't01_smooth_or_features_a02_features_or_disk_debiased', 't01_smooth_or_features_a02_features_or_disk_flag', 't01_smooth_or_features_a03_star_or_artifact_count', 't01_smooth_or_features_a03_star_or_artifact_weight', 't01_smooth_or_features_a03_star_or_artifact_fraction', 't01_smooth_or_features_a03_star_or_artifact_weighted_fraction', 't01_smooth_or_features_a03

In [36]:
# Load the selected columns for the first 50,000 data rows of the catalogue.
# pandas infers gzip compression from the ".gz" suffix and decodes the text as
# UTF-8, so the file path is passed directly instead of a manually opened
# gzip handle (whose text encoding would depend on the platform locale).
df = pd.read_csv(file_path, usecols=columns_to_use, nrows=50000)

In [37]:
# Show the first five rows; leaving the frame as the cell's last expression
# lets the notebook render it with its rich table display instead of plain text.
df.head()

  gz2_class  t01_smooth_or_features_a01_smooth_debiased  \
0      Sc+t                                       0.000   
1      Sb+t                                       0.024   
2        Ei                                       0.780   
3      Sc+t                                       0.036   
4        Er                                       0.767   

   t01_smooth_or_features_a02_features_or_disk_debiased  \
0                                              0.988      
1                                              0.976      
2                                              0.139      
3                                              0.964      
4                                              0.186      

   t01_smooth_or_features_a03_star_or_artifact_debiased  
0                                              0.012     
1                                              0.000     
2                                              0.081     
3                                              0.000     


In [39]:
# Binary target: 1 when the gz2_class string begins with "E", otherwise 0.
class_starts_with_e = df["gz2_class"].str.startswith("E")
df["label"] = class_starts_with_e.astype(int)

In [None]:
# Feature matrix: the three debiased "t01_smooth_or_features" columns, in this order.
feature_columns = [
    "t01_smooth_or_features_a01_smooth_debiased",
    "t01_smooth_or_features_a02_features_or_disk_debiased",
    "t01_smooth_or_features_a03_star_or_artifact_debiased",
]
X = df.loc[:, feature_columns]

# Target vector: the 0/1 "label" column.
y = df.loc[:, "label"]

In [41]:
# Replace missing feature values with the mean of their own column.
# NOTE(review): the means are computed over every row of X, including rows that
# the later train/test split places in the test set.
column_means = X.mean()
X = X.fillna(column_means)

In [43]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [44]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [45]:
# Predict labels for the held-out rows and report the fraction classified correctly.
# NOTE(review): a score this close to 1.0 may mean the target is largely
# determined by the feature columns themselves (gz2_class is presumably derived
# from the same vote fractions) - confirm before treating it as predictive skill.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.9984


In [None]:
# Persist the fitted classifier to "model.pkl" in the current working directory.
# joblib is imported with the other dependencies in the notebook's top import
# cell rather than here, so every dependency is declared in one place.
joblib.dump(model, "model.pkl")
print("Model saved as model.pkl")

Model saved as model.pkl
