In [8]:
# Helper packages
import numpy as np
import pandas as pd

# Modeling packages
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn import metrics


In [9]:
# Input CSV for this notebook. The location is an absolute path, so it is
# specific to the environment the notebook was authored in.
STELLAR_EDA_CSV = '/workspaces/all_in_one/team1proj/StellarDatasetML/data/stellar_eda.csv'
df = pd.read_csv(STELLAR_EDA_CSV)

In [10]:
# Summarise column names, dtypes and non-null counts to confirm the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79999 entries, 0 to 79998
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   obj_ID       79999 non-null  float64
 1   alpha        79999 non-null  float64
 2   delta        79999 non-null  float64
 3   u            79999 non-null  float64
 4   g            79999 non-null  float64
 5   r            79999 non-null  float64
 6   i            79999 non-null  float64
 7   z            79999 non-null  float64
 8   run_ID       79999 non-null  int64  
 9   rerun_ID     79999 non-null  int64  
 10  cam_col      79999 non-null  int64  
 11  field_ID     79999 non-null  int64  
 12  spec_obj_ID  79999 non-null  float64
 13  class        79999 non-null  object 
 14  redshift     79999 non-null  float64
 15  plate        79999 non-null  int64  
 16  MJD          79999 non-null  int64  
 17  fiber_ID     79999 non-null  int64  
dtypes: float64(10), int64(7), object(1)
memory usage: 11.0+ MB

# Logistic Regression

In [11]:
# Create the train/test data split.
# Numerical columns that are used as features in the modelling process.
column_list = ['u', 'g', 'r', 'i', 'z', 'redshift', 'alpha', 'delta', 'MJD']

# Define the input (feature matrix) and the output (target labels).
X = df.loc[:, column_list]
y = df.loc[:, "class"]

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)


In [12]:
# Z-score standardisation: the per-feature mean and standard deviation are
# learned from the training split only, then the same statistics are applied
# to both splits so no information from the test rows leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Sanity-check the shapes of the four split pieces: features and labels must
# have the same number of rows within each of the train and test splits.
print('X_train dimension= ', X_train.shape)
print('X_test dimension= ', X_test.shape)
print('y_train dimension= ', y_train.shape)
# Label fixed: this line reports y_test, but was previously printed as y_train.
print('y_test dimension= ', y_test.shape)

X_train dimension=  (55999, 9)
X_test dimension=  (24000, 9)
y_train dimension=  (55999,)
y_train dimension=  (24000,)


## Fitting The Model

In [14]:
# One-vs-rest logistic regression fitted with the liblinear solver on the
# standardised training features.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
lm = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
)
# Kept as the last expression so the cell still displays the fitted estimator.
lm.fit(X_train, y_train)


LogisticRegression(multi_class='ovr', solver='liblinear')

In [15]:
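# Optional spot check of a single test row (left disabled).
# X_test is a NumPy array after scaling, so X_test[52] is the 53rd row by
# position; y_test is still a pandas Series carrying the shuffled original
# index, so the matching label must be fetched by position with .iloc.
# print('Predicted value is =', lm.predict([X_test[52]]))

# print('Actual value from test data is %s' % (y_test.iloc[52]) )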


## Assessing model accuracy

###   - Model Score

In [16]:
# Mean accuracy on the held-out test split (the same quantity that
# `lm.score(X_test, y_test)` reports for a classifier).
metrics.accuracy_score(y_test, lm.predict(X_test))

0.9332083333333333

###   - Classification Report

In [17]:
# Per-class precision, recall and F1 on the held-out test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=lm.predict(X_test),
    )
)

              precision    recall  f1-score   support

      GALAXY       0.94      0.95      0.94     14303
         QSO       0.91      0.90      0.91      4542
        STAR       0.93      0.91      0.92      5155

    accuracy                           0.93     24000
   macro avg       0.93      0.92      0.93     24000
weighted avg       0.93      0.93      0.93     24000



### - Cross-validation performance

In [18]:
# LogisticRegressionCV selects the regularisation strength by 5-fold
# cross-validation on the training split and, with its default refit
# behaviour, refits the chosen model on the whole training split.
cv_lr = LogisticRegressionCV(
    cv=5,
    solver='liblinear',
    multi_class='ovr',
    random_state=123,
).fit(X_train, y_train)

# score() uses the estimator's default scoring, which is accuracy, and it is
# evaluated here on the same rows the model was fitted on. The number is
# therefore training accuracy -- not an AUC and not a held-out
# cross-validation estimate -- so it is named accordingly.
train_accuracy = cv_lr.score(X_train, y_train)

# Legacy alias: the value used to be stored under the misleading name `auc`;
# it is kept so any later cell that still reads `auc` keeps working.
auc = train_accuracy

round(train_accuracy, 3)


0.955