In [231]:
import pandas as pd
import numpy as np

In [232]:
df = pd.read_csv('/Users/leona/OneDrive/Documentos/GitHub/Stellar-Classification/data/star_classification.csv')

In [233]:
df.drop(columns=['obj_ID', 'rerun_ID', 'run_ID', 'field_ID', 'spec_obj_ID', 'fiber_ID', 'plate', 'cam_col', 'MJD'], axis=1, inplace=True)

In [234]:
x = df.drop('class', axis=1)
y = df['class']
print(x.shape, y.shape)

(100000, 8) (100000,)


In [235]:
from sklearn.preprocessing import LabelEncoder

In [236]:
le = LabelEncoder()
y = le.fit_transform(y)

In [237]:
from sklearn.feature_selection import SelectKBest, f_classif

In [238]:
algorithm = SelectKBest(score_func=f_classif, k=3)
x_bests = algorithm.fit_transform(x, y)

print(f'Score: {algorithm.scores_}')
print(f'Result of this transformation: {x_bests}')

Score: [2.19488222e+01 2.17588357e+02 3.04453390e+01 2.59625226e+01
 4.58453336e+03 8.28234355e+03 3.23283079e+01 8.34294190e+04]
Result of this transformation: [[20.39501   19.16573    0.6347936]
 [22.58444   21.16812    0.779136 ]
 [20.60976   19.34857    0.6441945]
 ...
 [18.20428   17.69034    0.1433656]
 [19.91386   19.07254    0.4550396]
 [20.60115   20.00959    0.5429442]]


In [239]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, f1_score, precision_score
from sklearn.preprocessing import StandardScaler

In [240]:
norm = StandardScaler()
x_norm = norm.fit_transform(x_bests)

In [241]:
x_train, x_test, y_train, y_test = train_test_split(x_norm, y, test_size=0.30, shuffle=True, random_state=15)

In [242]:
logistic = LogisticRegression(max_iter=2000, tol=0.01)
logistic.fit(x_train, y_train)

accuracy_logistic = logistic.score(x_test, y_test)
print(accuracy_logistic)

0.9463666666666667


In [243]:
prediction_logistic = logistic.predict(x_test)
matrix_logistic = confusion_matrix(y_test, prediction_logistic)
recall_logistic = recall_score(y_test, prediction_logistic, average='macro')
precision_logistic = precision_score(y_test, prediction_logistic, average='macro')
f1_logistic = f1_score(y_test, prediction_logistic, average='macro')

print(matrix_logistic)

[[17102   273   591]
 [  725  4814     6]
 [    2    12  6475]]


In [244]:
neighbors = KNeighborsClassifier()
neighbors.fit(x_train, y_train)

accuracy_neighbors = neighbors.score(x_test, y_test)
print(accuracy_neighbors)

0.9633333333333334


In [245]:
prediction_neighbors = neighbors.predict(x_test)
matrix_neighbors = confusion_matrix(y_test, prediction_neighbors)
precision_neighbors = precision_score(y_test, prediction_neighbors, average='macro')
recall_neighbors = recall_score(y_test, prediction_neighbors, average='macro')
f1_neighbors = f1_score(y_test, prediction_neighbors, average='macro')

print(matrix_neighbors)

[[17377   274   315]
 [  479  5061     5]
 [   27     0  6462]]


In [246]:
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

accuracy_tree = tree.score(x_test, y_test)
print(accuracy_tree)

0.9594333333333334


In [247]:
prediction_tree = tree.predict(x_test)
matrix_tree = confusion_matrix(y_test, prediction_tree)
precision_tree = precision_score(y_test, prediction_tree, average='macro')
recall_tree = recall_score(y_test, prediction_tree, average='macro')
f1_tree = f1_score(y_test, prediction_tree, average='macro')

print(matrix_tree)

[[17340   603    23]
 [  566  4978     1]
 [   24     0  6465]]


In [248]:
random = RandomForestClassifier()
random.fit(x_train, y_train)

accuracy_random = random.score(x_test, y_test)
print(accuracy_random)

0.9749


In [249]:
prediction_random = random.predict(x_test)
matrix_random = confusion_matrix(y_test, prediction_random)
precision_random = precision_score(y_test, prediction_random, average='macro')
recall_random = recall_score(y_test, prediction_random, average='macro')
f1_random = f1_score(y_test, prediction_random, average='macro')

print(matrix_random)

[[17697   248    21]
 [  477  5067     1]
 [    6     0  6483]]


In [254]:
boosting = GradientBoostingClassifier()
boosting.fit(x_train, y_train)

accuracy_boosting = boosting.score(x_test, y_test)
print(accuracy_boosting)

0.9694


In [255]:
prediction_boosting = boosting.predict(x_test)
matrix_boosting = confusion_matrix(y_test, prediction_boosting)
precision_boosting = precision_score(y_test, prediction_boosting, average='macro')
recall_boosting = recall_score(y_test, prediction_boosting, average='macro')
f1_boosting = f1_score(y_test, prediction_boosting, average='macro')

print(matrix_boosting)

[[17754   191    21]
 [  689  4855     1]
 [   16     0  6473]]


In [256]:
scores = {
    'Models': ['Logistic Regression', 'KNeighbors Classifier', 'Decision Tree Classifier', 'Random Forest Classifier', 'Gradient Boosting Classifier'],
    'Accuracy': [accuracy_logistic, accuracy_neighbors, accuracy_tree, accuracy_random, accuracy_boosting],
    'Presicion': [precision_logistic, precision_neighbors, precision_tree, precision_random, precision_boosting],
    'Recall': [recall_logistic, recall_neighbors, recall_tree, recall_random, recall_boosting],
    'F1_score': [f1_logistic, f1_neighbors, f1_tree, f1_random, f1_boosting]
}

In [258]:
scores_df = pd.DataFrame(scores)
display(scores_df)

Unnamed: 0,Models,Accuracy,Presicion,Recall,F1_score
0,Logistic Regression,0.946367,0.939638,0.939307,0.938348
1,KNeighbors Classifier,0.963333,0.957721,0.95859,0.95788
2,Decision Tree Classifier,0.959433,0.951784,0.953068,0.952422
3,Random Forest Classifier,0.9749,0.974463,0.965966,0.970063
4,Gradient Boosting Classifier,0.9694,0.973523,0.953766,0.962905
