In [41]:
# Libraries
import pandas as pd
import numpy as np

In [42]:
# Create the dataframe (star_df) from csv data
star_df = pd.read_csv("star_type_.csv")

In [43]:
star_df.sample(10)

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type
53,3749,550000.0,1648.0,-8.05,Supergiant
88,13720,0.00018,0.00892,12.97,White Dwarf
52,3834,272000.0,1183.0,-9.2,Supergiant
98,12098,689.0,7.01,0.02,Main Sequence
111,3605,126000.0,1124.0,-10.81,Supergiant
197,3496,0.00125,0.336,14.94,Red Dwarf
26,8570,0.00081,0.0097,14.2,White Dwarf
180,2831,0.000231,0.0915,16.21,Brown Dwarf
49,33750,220000.0,26.0,-6.1,Hypergiant
28,11790,0.00015,0.011,12.59,White Dwarf


In [44]:
# Fetch generic info
star_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute magnitude(Mv)  240 non-null    float64
 4   Star type               240 non-null    object 
dtypes: float64(3), int64(1), object(1)
memory usage: 9.5+ KB


In [45]:
# Target count
star_df['Star type'].value_counts()

Star type
Brown Dwarf      40
Red Dwarf        40
White Dwarf      40
Main Sequence    40
Hypergiant       40
Supergiant       40
Name: count, dtype: int64

In [46]:
# Save input features in X and target output in y
X = star_df.iloc[:, :-1]
X

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv)
0,3068,0.002400,0.1700,16.12
1,3042,0.000500,0.1542,16.60
2,2600,0.000300,0.1020,18.70
3,2800,0.000200,0.1600,16.65
4,1939,0.000138,0.1030,20.06
...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93
236,30839,834042.000000,1194.0000,-10.63
237,8829,537493.000000,1423.0000,-10.73
238,9235,404940.000000,1112.0000,-11.23


In [47]:
y = star_df.iloc[:, -1]
y

0      Brown Dwarf
1      Brown Dwarf
2      Brown Dwarf
3      Brown Dwarf
4      Brown Dwarf
          ...     
235     Supergiant
236     Supergiant
237     Supergiant
238     Supergiant
239     Supergiant
Name: Star type, Length: 240, dtype: object

In [48]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

## Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
# Create the Pipeline with Scaler and ML Model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Create the pipeline with Standard Scaler and Logistic Regression Model
pipeline = Pipeline([
    ('scaler', StandardScaler()),          # Step 1
    ('classifier', LogisticRegression(solver='newton-cg', multi_class='multinomial'))   # Step 2
])


In [50]:
# Train the Logistic Regression using Pipeline
pipeline.fit(X_train, y_train)



0,1,2
,steps,"[('scaler', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'newton-cg'
,max_iter,100


In [51]:
# Make the predictions
preds = pipeline.predict(X_test)

In [52]:
# Print the results
actual = y_test.values
print(f'This is the actual output:-\n{actual}')
print(f'\nThis is the predicted output:-\n{preds}')


This is the actual output:-
['White Dwarf' 'Brown Dwarf' 'Main Sequence' 'Hypergiant' 'Hypergiant'
 'Supergiant' 'Supergiant' 'White Dwarf' 'Brown Dwarf' 'White Dwarf'
 'Hypergiant' 'White Dwarf' 'Supergiant' 'Hypergiant' 'Supergiant'
 'Supergiant' 'Brown Dwarf' 'Red Dwarf' 'Main Sequence' 'Brown Dwarf'
 'Brown Dwarf' 'Red Dwarf' 'Supergiant' 'Main Sequence' 'Supergiant'
 'Main Sequence' 'Red Dwarf' 'White Dwarf' 'Supergiant' 'Main Sequence'
 'Main Sequence' 'Hypergiant' 'White Dwarf' 'Brown Dwarf' 'Red Dwarf'
 'Brown Dwarf' 'Red Dwarf' 'Supergiant' 'Red Dwarf' 'Supergiant'
 'Hypergiant' 'Supergiant' 'Hypergiant' 'Red Dwarf' 'Main Sequence'
 'Brown Dwarf' 'Hypergiant' 'Main Sequence']

This is the predicted output:-
['White Dwarf' 'Brown Dwarf' 'Red Dwarf' 'Hypergiant' 'Hypergiant'
 'Supergiant' 'Supergiant' 'White Dwarf' 'Brown Dwarf' 'White Dwarf'
 'Hypergiant' 'White Dwarf' 'Supergiant' 'Hypergiant' 'Supergiant'
 'Supergiant' 'Brown Dwarf' 'Red Dwarf' 'Main Sequence' 'Brown Dwarf'
 

In [53]:
# Check on which index the prediction did not match the actual output
incorrect_indexes = np.where(actual != preds)[0]

# Print the actual and predicted label for the incorrect_indexes
for i, index in enumerate(incorrect_indexes):
    print(f'{i+1}) The acutal output was:- {actual[index]}')
    print(f'{i+1}) The predicted output by the model is:- {preds[index]}')
    print()



1) The acutal output was:- Main Sequence
1) The predicted output by the model is:- Red Dwarf

2) The acutal output was:- Main Sequence
2) The predicted output by the model is:- Hypergiant



In [54]:
# Get the accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(actual, preds))

0.9583333333333334


In [55]:
# Save the pipeline
from pickle import dump
with open('model.pkl', 'wb') as file:
    dump(pipeline, file)


In [56]:
# Load the pipeline and test it
from pickle import load
with open('model.pkl', 'rb') as file:
    pipeline_test = load(file)

In [57]:
# Get data from test set
X_test.iloc[1, :]

Temperature (K)           2637.00000
Luminosity(L/Lo)             0.00073
Radius(R/Ro)                 0.12700
Absolute magnitude(Mv)      17.22000
Name: 6, dtype: float64

In [58]:
# Feature list
features = X_test.columns.to_list()

In [59]:
# Prediction
test_data = pd.DataFrame([[2637, 0.00073, 0.127, 17.22]], columns=features)
output = pipeline_test.predict(test_data)
print(output)

['Brown Dwarf']
