In [244]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [245]:
# Gonna read in our data
df = pd.read_csv("data/Stars.csv")

In [246]:
# Let's take a look
df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


In [247]:
# Data types are all as I'd expect them to be
df.dtypes

Temperature         int64
L                 float64
R                 float64
A_M               float64
Color              object
Spectral_Class     object
Type                int64
dtype: object

In [248]:
# Let's next check on our range of colors to make sure they are consistent.  Annnd they're not.
df.value_counts("Color", ascending = True)

Color
Blue-White              1
Orange-Red              1
Pale yellow orange      1
White-Yellow            1
Yellowish               1
yellowish               2
Orange                  2
Whitish                 2
Yellowish White         3
white                   3
Blue white              4
White                   7
yellow-white            8
Blue White             10
Blue-white             26
Blue                   56
Red                   112
dtype: int64

In [249]:
# Gonna go through our Colors and combine those that make sense to combine
colors = []
for item in df["Color"]:
    if (item.find('Blue-') != -1):
        item2 = "Blue-White"
    elif (item.find("Blue W") != -1):
        item2 = "Blue-White"
    elif (item.find("Blue w") != -1):
        item2 = "Blue-White"
    elif (item.find("ellowish") != -1):
        item2 = "Yellow"
    elif (item.find("white") != -1):
        item2 = "White"
    elif (item.find("White-Y") != -1):
        item2 = "Yellow"
    elif (item.find("Whitish") != -1):
        item2 = "White"
    elif (item.find("Orange") != -1):
        item2 = "Yellow"
    elif (item.find("-Red") != -1):
        item2 = "Red"
    elif (item.find("yellow") != -1):
        item2 = "Yellow"
    else:
        item2 = item
    colors.append(item2)

# Drop our old colors
df.drop(columns=["Color"], inplace = True)

# Add our new colors to ther dataframe
df = pd.concat([df, pd.Series(colors)], axis=1)
df.head()

Unnamed: 0,Temperature,L,R,A_M,Spectral_Class,Type,0
0,3068,0.0024,0.17,16.12,M,0,Red
1,3042,0.0005,0.1542,16.6,M,0,Red
2,2600,0.0003,0.102,18.7,M,0,Red
3,2800,0.0002,0.16,16.65,M,0,Red
4,1939,0.000138,0.103,20.06,M,0,Red


In [250]:
df.isnull().mean()

Temperature       0.0
L                 0.0
R                 0.0
A_M               0.0
Spectral_Class    0.0
Type              0.0
0                 0.0
dtype: float64

In [251]:
df.value_counts("Spectral_Class", ascending = True)

Spectral_Class
G      1
K      6
F     17
A     19
O     40
B     46
M    111
dtype: int64

In [252]:
y = df["Type"]

In [253]:
color = pd.get_dummies(data=df[0], drop_first = True)
spectral_class = pd.get_dummies(data=df['Spectral_Class'])
df.drop(columns=['Spectral_Class'], inplace = True)
df.drop(columns=[0], inplace = True)
df.drop(columns=['Type'], inplace = True)
df = pd.concat([df, color, spectral_class], axis=1)
df.drop(columns=['G'], inplace = True)
X = df
df.head()

Unnamed: 0,Temperature,L,R,A_M,Blue-White,Red,White,Yellow,A,B,F,K,M,O
0,3068,0.0024,0.17,16.12,0,1,0,0,0,0,0,0,1,0
1,3042,0.0005,0.1542,16.6,0,1,0,0,0,0,0,0,1,0
2,2600,0.0003,0.102,18.7,0,1,0,0,0,0,0,0,1,0
3,2800,0.0002,0.16,16.65,0,1,0,0,0,0,0,0,1,0
4,1939,0.000138,0.103,20.06,0,1,0,0,0,0,0,0,1,0


In [254]:
X.shape

(240, 14)

In [255]:
y.shape

(240,)

In [256]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [257]:
Model = LogisticRegression()
params = {
    'C' : [.00001, .0001, .001, .01, 1],
    'solver' : ['liblinear']
    }

In [258]:
model_gridsearch = GridSearchCV(Model, param_grid = params, cv=5) 
        
# Fit our model and print metrics
model_gridsearch.fit(X_train, y_train);
print(f'Best Train score for Gridsearched w/out scaling {np.round(model_gridsearch.best_score_, 3)}')

print(' ')
best = model_gridsearch.best_estimator_
print(f'Best Test score for Gridsearched w/out scaling {np.round(best.score(X_test, y_test), 3)}')


Best Train score for Gridsearched w/out scaling 0.85
 
Best Test score for Gridsearched w/out scaling 0.9


In [262]:
# Exponentiate our coef's and print them with their associated features
odds = list(np.exp(best.coef_))
odds = odds[0]
print('Exponentiate our coefs:')
print('')
for i in range(len(X.columns)):
    if odds[i] > 0:
        print(X.columns[i])
        print(np.round(odds[i], 3))
        print(' ')

Exponentiate our coefs:

Temperature
0.999
 
L
0.988
 
R
0.993
 
A_M
1.313
 
Blue-White
1.0
 
Red
1.01
 
White
0.998
 
Yellow
0.999
 
A
0.999
 
B
1.0
 
F
0.999
 
K
0.999
 
M
1.01
 
O
1.0
 


In [None]:
# Plot our odds and look at what features have the largest odds
pd.Series(odds, index=X_2.columns).plot.bar(figsize=(15, 7))
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.ylim(ymin=0.8, ymax=1.3);

In [242]:
Model = ExtraTreesClassifier()
params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}

In [243]:
model_gridsearch = GridSearchCV(Model, param_grid = params, cv=5) 
        
# Fit our model and print metrics
model_gridsearch.fit(X_train, y_train);
print(f'Best Train score for Gridsearched w/out scaling {np.round(model_gridsearch.best_score_, 3)}')

print(' ')
best = model_gridsearch.best_estimator_
print(f'Best Test score for Gridsearched w/out scaling {np.round(best.score(X_test, y_test), 3)}')


Best Train score for Gridsearched w/out scaling 0.994
 
Best Test score for Gridsearched w/out scaling 0.967
