# Importing dataframe

In [46]:
import pandas as pd
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix

datapath = r'C:\Users\selby\Documents\Jupyter\Build Week Project\trimmed.csv'

df = pd.read_csv(datapath)

df.head()

Unnamed: 0.1,Unnamed: 0,id64,bodyId,subType,radius,surfaceTemperature,rotationalPeriod,isMainStar,age,spectralClass,luminosity,absoluteMagnitude,solarMasses,solarRadius
0,1,3.602943e+16,1.0,K (Yellow-Orange) Star,,3715,2.519077,1.0,7344.0,K9,Va,7.578094,0.527344,0.682587
1,18,1.441158e+17,4.0,L (Brown dwarf) Star,,1766,1.221851,0.0,7344.0,L3,V,12.59494,0.152344,0.299896
2,19,1.08087e+17,3.0,M (Red dwarf) Star,,2882,1.542847,0.0,7344.0,M4,Va,9.608398,0.289063,0.445374
3,33,593375900000.0,0.0,M (Red dwarf) Star,,2557,2.024965,1.0,2346.0,M6,Va,10.13382,0.269531,0.444214
4,57,149579100000000.0,0.0,F (White) Star,,6594,2.192743,1.0,244.0,F6,Vab,3.810394,1.390625,1.227814


In [47]:
print(df.columns)

Index(['Unnamed: 0', 'id64', 'bodyId', 'subType', 'radius',
       'surfaceTemperature', 'rotationalPeriod', 'isMainStar', 'age',
       'spectralClass', 'luminosity', 'absoluteMagnitude', 'solarMasses',
       'solarRadius'],
      dtype='object')


## Dropping unnessesary columns and spectralClass, which would have caused leakage.

In [48]:
df.drop(df[['Unnamed: 0', 'id64', 'bodyId','radius','spectralClass']],axis=1,inplace=True)

In [49]:
df['subType'].unique()

array(['K (Yellow-Orange) Star', 'L (Brown dwarf) Star',
       'M (Red dwarf) Star', 'F (White) Star', 'A (Blue-White) Star',
       'Neutron Star', 'G (White-Yellow) Star', 'Y (Brown dwarf) Star',
       'T (Brown dwarf) Star', 'T Tauri Star', 'White Dwarf (DC) Star',
       'B (Blue-White) Star', 'K (Yellow-Orange giant) Star',
       'Herbig Ae/Be Star', 'A (Blue-White super giant) Star',
       'O (Blue-White) Star', 'M (Red giant) Star', 'Black Hole',
       'White Dwarf (DA) Star', 'White Dwarf (DAB) Star',
       'White Dwarf (DCV) Star', 'White Dwarf (DAV) Star', 'MS-type Star',
       'S-type Star', 'B (Blue-White super giant) Star', 'CN Star',
       'White Dwarf (DB) Star', 'Wolf-Rayet C Star',
       'White Dwarf (DBV) Star', 'Wolf-Rayet O Star', 'Wolf-Rayet N Star',
       'G (White-Yellow super giant) Star', 'CJ Star',
       'Wolf-Rayet NC Star', 'F (White super giant) Star',
       'White Dwarf (DAZ) Star', 'M (Red super giant) Star'], dtype=object)

# Writing a function to create a dataframe including only main sequence stars.

In [50]:
main = ['M (Red dwarf) Star',
        'O (Blue-White) Star',
        'A (Blue-White) Star',
        'B (Blue-White) Star',
        'F (White) Star',
        'K (Yellow-Orange) Star',
        'G (White-Yellow) Star',
        'A (Blue-White super giant) Star',
        'M (Red giant) Star',
        'B (Blue-White super giant) Star',
        'G (White-Yellow super giant) Star',
        'F (White super giant) Star',
        'M (Red super giant) Star']

def subType(x):
    if x in main:
        return 1
    else:
        return 0
    

df['category'] = df['subType'].apply(subType)

df = df[df['category'] == 1]
df.drop(columns='category',inplace=True)
df.dropna(inplace=True)

In [51]:
df.shape

(412786, 9)

In [52]:
df.head()

Unnamed: 0,subType,surfaceTemperature,rotationalPeriod,isMainStar,age,luminosity,absoluteMagnitude,solarMasses,solarRadius
0,K (Yellow-Orange) Star,3715,2.519077,1.0,7344.0,Va,7.578094,0.527344,0.682587
2,M (Red dwarf) Star,2882,1.542847,0.0,7344.0,Va,9.608398,0.289063,0.445374
3,M (Red dwarf) Star,2557,2.024965,1.0,2346.0,Va,10.13382,0.269531,0.444214
4,F (White) Star,6594,2.192743,1.0,244.0,Vab,3.810394,1.390625,1.227814
5,M (Red dwarf) Star,2457,1.938148,0.0,4290.0,Va,10.616226,0.25,0.385284


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 412786 entries, 0 to 549541
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   subType             412786 non-null  object 
 1   surfaceTemperature  412786 non-null  int64  
 2   rotationalPeriod    412786 non-null  float64
 3   isMainStar          412786 non-null  float64
 4   age                 412786 non-null  float64
 5   luminosity          412786 non-null  object 
 6   absoluteMagnitude   412786 non-null  float64
 7   solarMasses         412786 non-null  float64
 8   solarRadius         412786 non-null  float64
dtypes: float64(6), int64(1), object(2)
memory usage: 31.5+ MB


In [54]:
df['subType'].unique()

array(['K (Yellow-Orange) Star', 'M (Red dwarf) Star', 'F (White) Star',
       'A (Blue-White) Star', 'G (White-Yellow) Star',
       'B (Blue-White) Star', 'A (Blue-White super giant) Star',
       'O (Blue-White) Star', 'M (Red giant) Star',
       'B (Blue-White super giant) Star',
       'G (White-Yellow super giant) Star', 'F (White super giant) Star',
       'M (Red super giant) Star'], dtype=object)

### Creating a profile report for the dataset

In [55]:
from pandas_profiling import ProfileReport

profile = ProfileReport(df,title='Star Data Report')

In [56]:
# Uncomment to see DataFrame report

# profile.to_widgets()

In [57]:
df['subType'] = df['subType'].str.split(' ',1).str[0]

In [58]:
df.head()

Unnamed: 0,subType,surfaceTemperature,rotationalPeriod,isMainStar,age,luminosity,absoluteMagnitude,solarMasses,solarRadius
0,K,3715,2.519077,1.0,7344.0,Va,7.578094,0.527344,0.682587
2,M,2882,1.542847,0.0,7344.0,Va,9.608398,0.289063,0.445374
3,M,2557,2.024965,1.0,2346.0,Va,10.13382,0.269531,0.444214
4,F,6594,2.192743,1.0,244.0,Vab,3.810394,1.390625,1.227814
5,M,2457,1.938148,0.0,4290.0,Va,10.616226,0.25,0.385284


# Establishing Baseline

In [59]:
df['subType'].value_counts(normalize=True)

M    0.489174
K    0.250062
F    0.107082
G    0.083976
A    0.052691
B    0.015303
O    0.001713
Name: subType, dtype: float64

## Creating Target vector and feature matrix, then spliting matrix into training, validation, and testing sets

In [60]:
target = 'subType'

y = df[target]
X = df.drop(columns=target)

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [62]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.25,random_state=42)

In [63]:
X_train.head()

Unnamed: 0,surfaceTemperature,rotationalPeriod,isMainStar,age,luminosity,absoluteMagnitude,solarMasses,solarRadius
372124,6282,6.213729,1.0,2090.0,Vb,4.281921,1.121094,1.088806
43442,14527,1.000007,1.0,166.0,Vz,-1.098831,4.574219,2.424896
225781,5708,4.288684,1.0,12860.0,Vab,4.881897,0.894531,1.000488
465971,4402,3.296774,1.0,1480.0,Va,6.445297,0.683594,0.819
193257,4343,4.577268,0.0,1530.0,Va,6.744339,0.597656,0.733185


### Linear Model

In [64]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

model_l = make_pipeline(
    OrdinalEncoder(),
    OneVsRestClassifier(SGDClassifier(alpha=0.00001,
                                      n_jobs=-1,
                                      random_state=42,
                                     shuffle=True),
                        n_jobs=-1)
    )

model_l.fit(X_train,y_train);

In [65]:
print('OVR Training set score:',model_l.score(X_train,y_train))
print('OVR Validation set score:',model_l.score(X_val,y_val))

OVR Training set score: 0.6788521869738484
OVR Validation set score: 0.6758360890051722


### Gradient Boosting model

In [66]:
from xgboost import XGBClassifier

model_x = make_pipeline(
    OrdinalEncoder(),
    XGBClassifier()
    )

model_x.fit(X_train,y_train);





In [67]:
print('XGB Training set score:',model_x.score(X_train,y_train))
print('XGB Validation set score:',model_x.score(X_val,y_val))

XGB Training set score: 1.0
XGB Validation set score: 0.9999878871567524


# Final testing score

In [68]:
from sklearn.metrics import accuracy_score

y_pred_x = model_x.predict(X_test)

print('XGB model testing data score:',accuracy_score(y_pred_x,y_test))

XGB model testing data score: 0.999975774606943


# Just for fun, permutation importances

In [69]:
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt

perm_importance = permutation_importance(model_x,X_val,y_val)

d = {'Mean Importance': perm_importance['importances_mean'],'Standard Deviation': perm_importance['importances_std']}

permutation_importances = pd.DataFrame(data=d,index=X.columns)

permutation_importances

Unnamed: 0,Mean Importance,Standard Deviation
surfaceTemperature,0.678249,0.000967
rotationalPeriod,2.9e-05,1e-05
isMainStar,0.0,0.0
age,1.2e-05,0.0
luminosity,1.2e-05,0.0
absoluteMagnitude,0.000501,5.8e-05
solarMasses,0.0,0.0
solarRadius,0.0,0.0
