<a href="https://colab.research.google.com/github/JumanaWanass/Star-Classification/blob/main/Star_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [200]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, confusion_matrix, ConfusionMatrixDisplay)

Data preparing

In [201]:
colNames = ['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
            'Absolute magnitude(Mv)', 'Star color', 'Spectral Class', 'Star type']
df = pd.read_csv('StarType.csv', skiprows=1, names=colNames)


In [202]:
class_mapping = {
    0: 'Red Dwarf',
    1: 'Brown Dwarf',
    2: 'White Dwarf',
    3: 'Main Sequence',
    4: 'Supergiant',
    5: 'Hypergiant'
}

df['Star type'] = df['Star type'].map(class_mapping)


Removing outliers

In [203]:
# Calculate the IQR for all numerical columns
Q1 = df.select_dtypes(include=[np.number]).quantile(0.25)
Q3 = df.select_dtypes(include=[np.number]).quantile(0.75)
IQR = Q3 - Q1

iqr_multiplier = 1.5

# Create a mask to identify rows with outliers in any feature
outlier_mask = ((df.select_dtypes(include=[np.number]) < (Q1 - iqr_multiplier * IQR)) |
                (df.select_dtypes(include=[np.number]) >
                 (Q3 + iqr_multiplier * IQR))).any(axis=1)

# Remove rows with outliers
df_clean = df[~outlier_mask]
df.describe()


Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv)
count,240.0,240.0,240.0,240.0
mean,10497.4625,107188.361635,237.157781,4.382396
std,9552.425037,179432.24494,517.155763,10.532512
min,1939.0,8e-05,0.0084,-11.92
25%,3344.25,0.000865,0.10275,-6.2325
50%,5776.0,0.0705,0.7625,8.313
75%,15055.5,198050.0,42.75,13.6975
max,40000.0,849420.0,1948.5,20.06


Convert string values to int

In [204]:
encoder = OneHotEncoder(sparse=False, drop='first')
X_encoded = pd.DataFrame(encoder.fit_transform(df[['Star color', 'Spectral Class']]))

# Drop the original categorical columns and concatenate the encoded columns
df = df.drop(columns=['Star color', 'Spectral Class'])
X = pd.concat([df, X_encoded], axis=1)

X = X.drop(columns=['Star type'])  # Features (all columns except 'Star type')
y = df['Star type']  # Target variable ('Star type' column)

# Convert feature column names to strings (if not already)
X.columns = X.columns.astype(str)



In [205]:

# Split the data into a train set and a test set (e.g., 90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X.columns = X.columns.astype(str)



Using weighted KNN

In [212]:

# Define parameter grid for KNN
knn_param_grid = {'n_neighbors': range(3, 15, 2)}

weights_to_test = ['uniform', 'distance']

best_knn_weighted = None
best_knn_weighted_accuracy = 0

# Iterate over different weight options
for weight_option in weights_to_test:
    knn_grid = GridSearchCV(KNeighborsClassifier(weights=weight_option),
                            knn_param_grid, cv=3)
    knn_grid.fit(X_train, y_train)

    bestKnn = knn_grid.best_estimator_

    bestKnn.fit(X_train, y_train)

    y_pred = bestKnn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # Check if this model has better accuracy
    if accuracy > best_knn_weighted_accuracy:
        best_knn_weighted_accuracy = accuracy
        best_knn_weighted = bestKnn

# Print the best weighted KNN classifier
print("Best Weighted KNN Classifier:", best_knn_weighted)
print("Accuracy of Best Weighted KNN:", best_knn_weighted_accuracy)


Best Weighted KNN Classifier: KNeighborsClassifier(n_neighbors=3, weights='distance')
Accuracy of Best Weighted KNN: 0.75


In [213]:
# Train the best KNN model
bestKnn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bestKnn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.75


Using Adaboost

In [214]:
adaboost_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    random_state=42  # Set a random seed for reproducibility
)


In [216]:
# Define a grid of hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of weak learners (iterations)
    'learning_rate': [0.1, 0.5, 1.0]  # Learning rate (adjust as needed)
}

# Create a GridSearchCV object with cross-validation
grid_search = GridSearchCV(
    estimator=adaboost_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

best_adaboost_model = grid_search.best_estimator_

print("Best Hyperparameters:", grid_search.best_params_)

cross_val_scores = cross_val_score(best_adaboost_model, X_train, y_train, cv=5)
print("Cross-Validation Scores:", cross_val_scores)
print("Mean Cross-Validation Score:", cross_val_scores.mean())

best_adaboost_model.fit(X_train, y_train)

y_pred = best_adaboost_model.predict(X_test)

Best Hyperparameters: {'learning_rate': 0.5, 'n_estimators': 50}
Cross-Validation Scores: [1.         0.97674419 1.         1.         1.        ]
Mean Cross-Validation Score: 0.9953488372093023


In [217]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: " , accuracy)

Accuracy:  1.0
