In [210]:
import os

import numpy as np
import pandas as pd

In [211]:
# Location of the star-classification CSV. The default is the original
# author's machine-specific path; set the STAR_CLASSIFICATION_CSV environment
# variable to run the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    'STAR_CLASSIFICATION_CSV',
    '/Users/leona/OneDrive/Documentos/GitHub/Stellar-Classification/data/star_classification.csv',
)

# Raw table; later cells drop columns from it and split it into features/target.
df = pd.read_csv(DATA_PATH)

In [212]:
# Columns removed before modelling. Judging by their names these are catalogue
# identifiers / observation bookkeeping rather than measurements --
# TODO confirm against the dataset description.
columns_to_drop = [
    'obj_ID', 'rerun_ID', 'run_ID', 'field_ID', 'spec_obj_ID',
    'fiber_ID', 'plate', 'cam_col', 'MJD',
]

# Reassign instead of using inplace=True, and tolerate columns that are already
# gone, so re-running this cell on a live kernel does not raise KeyError.
# `columns=` already implies the column axis, so no `axis` argument is needed.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [213]:
# Target: the `class` column. Features: every remaining column of `df`.
y = df['class']
x = df.drop(columns='class')

# Sanity check: both must have the same number of rows.
print(x.shape, y.shape)

(100000, 8) (100000,)


In [214]:
from sklearn.preprocessing import LabelEncoder

In [215]:
# Map the labels of the `class` column to integer codes. `le` is kept so the
# codes can be mapped back to the original labels via `le.classes_` /
# `le.inverse_transform`.
le = LabelEncoder()

# Always encode from the original column rather than from `y` itself:
# re-running `y = le.fit_transform(y)` would refit the encoder on the already
# encoded integers and replace the label names stored in `le.classes_`.
y = le.fit_transform(df['class'])

In [216]:
from sklearn.feature_selection import SelectKBest, f_classif

In [217]:
# Univariate feature selection: score every column of `x` against the encoded
# target with `f_classif` and keep the 3 highest-scoring columns.
# NOTE(review): the selector is fit on every row, i.e. before the train/test
# split performed further down, so held-out labels take part in the choice of
# features.
algorithm = SelectKBest(f_classif, k=3)
algorithm.fit(x, y)
x_bests = algorithm.transform(x)

# One score per input column, in the column order of `x`.
print(f'Score: {algorithm.scores_}')
print(f'Result of this transformation: {x_bests}')

Score: [2.19488222e+01 2.17588357e+02 3.04453390e+01 2.59625226e+01
 4.58453336e+03 8.28234355e+03 3.23283079e+01 8.34294190e+04]
Result of this transformation: [[20.39501   19.16573    0.6347936]
 [22.58444   21.16812    0.779136 ]
 [20.60976   19.34857    0.6441945]
 ...
 [18.20428   17.69034    0.1433656]
 [19.91386   19.07254    0.4550396]
 [20.60115   20.00959    0.5429442]]


In [218]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, recall_score, f1_score, precision_score
from sklearn.preprocessing import StandardScaler

In [219]:
# Split first, then fit the scaler on the training rows only. Fitting
# StandardScaler on the full matrix before splitting lets the mean/variance of
# the held-out rows leak into preprocessing and biases the test scores.
# The split arguments are unchanged, so the same rows land in train and test.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_bests, y, test_size=0.30, shuffle=True, random_state=15
)

norm = StandardScaler()
x_train = norm.fit_transform(x_train_raw)  # statistics learned from train only
x_test = norm.transform(x_test_raw)        # test rows reuse the train statistics

# Kept for backward compatibility with any code that still reads `x_norm`:
# the full selected-feature matrix on the same scale the models are trained on.
x_norm = norm.transform(x_bests)

In [221]:
# Logistic regression baseline; `fit` returns the estimator itself, so the
# construction and training can be chained.
logistic = LogisticRegression(tol=0.01, max_iter=2000).fit(x_train, y_train)

# Mean accuracy on the held-out split.
accuracy_logistic = logistic.score(x_test, y_test)
print(accuracy_logistic)

0.9463666666666667


In [222]:
# Predict once and derive every held-out metric from the same predictions.
prediction_logistic = logistic.predict(x_test)

# Macro averaging: each class contributes equally to the reported score.
precision_logistic, recall_logistic, f1_logistic = (
    metric(y_test, prediction_logistic, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Rows are true classes, columns are predicted classes.
matrix_logistic = confusion_matrix(y_test, prediction_logistic)
print(matrix_logistic)

[[17102   273   591]
 [  725  4814     6]
 [    2    12  6475]]


In [223]:
# k-nearest-neighbours classifier with the library's default settings,
# trained on the scaled training split.
neighbors = KNeighborsClassifier().fit(x_train, y_train)

# Mean accuracy on the held-out split.
accuracy_neighbors = neighbors.score(x_test, y_test)
print(accuracy_neighbors)

0.9633333333333334


In [224]:
# Held-out predictions of the k-NN model, reused by every metric below.
prediction_neighbors = neighbors.predict(x_test)

# Macro-averaged scores (unweighted mean over the classes).
macro = {'average': 'macro'}
f1_neighbors = f1_score(y_test, prediction_neighbors, **macro)
recall_neighbors = recall_score(y_test, prediction_neighbors, **macro)
precision_neighbors = precision_score(y_test, prediction_neighbors, **macro)

# Rows are true classes, columns are predicted classes.
matrix_neighbors = confusion_matrix(y_test, prediction_neighbors)
print(matrix_neighbors)

[[17377   274   315]
 [  479  5061     5]
 [   27     0  6462]]


In [225]:
# Decision tree with default hyper-parameters. The estimator is randomised
# internally, so an unseeded tree can give different scores on every run;
# seeding it (same seed as the train/test split) makes the cell reproducible.
tree = DecisionTreeClassifier(random_state=15)
tree.fit(x_train, y_train)

# Mean accuracy on the held-out split.
accuracy_tree = tree.score(x_test, y_test)
print(accuracy_tree)

0.9598


In [226]:
# Held-out predictions of the decision tree, shared by all metrics below.
prediction_tree = tree.predict(x_test)

# Macro-averaged precision / recall / F1, computed in one pass over the scorers.
precision_tree, recall_tree, f1_tree = [
    scorer(y_test, prediction_tree, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
]

# Rows are true classes, columns are predicted classes.
matrix_tree = confusion_matrix(y_test, prediction_tree)
print(matrix_tree)

[[17341   598    27]
 [  554  4990     1]
 [   26     0  6463]]


In [227]:
# Random forest with default hyper-parameters. Bootstrapping and feature
# sub-sampling are random, so the model is seeded (same seed as the train/test
# split) to make the reported scores reproducible across runs.
# NOTE: the variable name `random` would shadow the standard-library module of
# the same name if that were imported; it is kept because later cells
# reference it.
random = RandomForestClassifier(random_state=15)
random.fit(x_train, y_train)

# Mean accuracy on the held-out split.
accuracy_random = random.score(x_test, y_test)
print(accuracy_random)

0.9748333333333333


In [228]:
# Held-out predictions of the random forest, reused by every metric below.
prediction_random = random.predict(x_test)

# Rows are true classes, columns are predicted classes.
matrix_random = confusion_matrix(y_test, prediction_random)

# Macro-averaged scores: every class weighs the same regardless of its size.
f1_random = f1_score(y_test, prediction_random, average='macro')
precision_random = precision_score(y_test, prediction_random, average='macro')
recall_random = recall_score(y_test, prediction_random, average='macro')

print(matrix_random)

[[17696   247    23]
 [  478  5066     1]
 [    6     0  6483]]


In [229]:
# Side-by-side comparison of the four fitted models on the held-out split.
# Every list is ordered like 'Models'. Accuracy comes from each estimator's
# `.score`; precision, recall and F1 are the macro-averaged values computed in
# the metric cells above.
scores = {
    'Models': ['Logistic Regression', 'KNeighbors Classifier', 'Decision Tree Classifier', 'Random Forest Classifier'],
    'Accuracy': [accuracy_logistic, accuracy_neighbors, accuracy_tree, accuracy_random],
    # Column label corrected from the misspelled 'Presicion'.
    'Precision': [precision_logistic, precision_neighbors, precision_tree, precision_random],
    'Recall': [recall_logistic, recall_neighbors, recall_tree, recall_random],
    'F1_score': [f1_logistic, f1_neighbors, f1_tree, f1_random]
}

In [230]:
# One row per model, one column per entry of `scores`.
scores_df = pd.DataFrame(data=scores)
print(scores_df)

                     Models  Accuracy  Presicion    Recall  F1_score
0       Logistic Regression  0.946367   0.939638  0.939307  0.938348
1     KNeighbors Classifier  0.963333   0.957721  0.958590  0.957880
2  Decision Tree Classifier  0.959800   0.952102  0.953705  0.952899
3  Random Forest Classifier  0.974833   0.974400  0.965888  0.969990
