<a href="https://colab.research.google.com/github/NadineNjora/-AI-Coursework-2025/blob/main/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:

import os
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

# For Colab upload
from google.colab import files

In [None]:

# Ask for the dataset, then open Colab's upload dialog.
# `uploaded` is keyed by the name of each uploaded file.
print(
    "Please upload '6 class csv.csv' or 'Stars.csv'"
)
uploaded = files.upload()

Please upload '6 class csv.csv' or 'Stars.csv'


In [None]:

# Fail early with a readable message if nothing was uploaded, instead of an
# IndexError from indexing an empty list of filenames.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run the upload cell and choose the CSV file.")

# Load the first uploaded file by name.
filename = next(iter(uploaded))
df = pd.read_csv(filename)

print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())
df.head()

In [None]:
#  DATA CLEANING
# Strip header whitespace, build the numeric 'Type' target, label-encode the
# categorical columns and remove duplicate rows.

df.columns = df.columns.str.strip()

type_mapping = {
    'Brown Dwarf': 0, 'Red Dwarf': 1, 'White Dwarf': 2,
    'Main Sequence': 3, 'Supergiant': 4, 'Hypergiant': 5
}

if 'Star category' in df.columns:
    # Map 'Star category' (string labels) to numerical 'Type'
    df['Type'] = df['Star category'].map(type_mapping)

    # Categories missing from type_mapping become NaN; report them rather
    # than letting them flow silently into the plots and the model.
    unmapped = df.loc[df['Type'].isna(), 'Star category'].unique().tolist()
    if unmapped:
        print("Warning: unmapped 'Star category' values (Type is NaN for these rows):", unmapped)
elif 'Star type' in df.columns and 'Type' not in df.columns:
    # No string category column: keep the existing 'Star type' column as the
    # target instead of dropping the only label column.
    # NOTE(review): assumes 'Star type' already holds the 0-5 class codes; confirm.
    df = df.rename(columns={'Star type': 'Type'})

# 'Type' now holds the classification, so drop the source label columns
# (whichever of them are still present).
df = df.drop(columns=['Star type', 'Star category'], errors='ignore')

# Encode other categorical columns. One encoder per column is kept in
# `label_encoders` so each column's codes can be mapped back to its labels.
label_encoders = {}
for column in ('Star color', 'Spectral Class'):
    if column in df.columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le

# Drop duplicates (reassign rather than mutate in place)
df = df.drop_duplicates()

df.head()

In [None]:
### Insight 1: Distribution of Star Types
# Bar chart of the row count for each value of 'Type'.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Type', palette='viridis', ax=ax)
ax.set_title('Distribution of Star Types')
ax.set_xlabel('Star Type (0=Brown Dwarf → 5=Hypergiant)')
plt.show()

print("Observation: Perfectly balanced dataset — exactly 40 stars per class. No bias issues.")

In [None]:
### Insight 2: Correlation Heatmap
# Pairwise correlations between the numeric columns, annotated to two decimals.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

print("Observation: Absolute magnitude and Luminosity have strong negative correlation (-0.70). This is expected from physics.")

In [None]:

# Standardise column names to the exact spellings the later plotting and
# modelling cells read: 'Temperature (K)', 'Radius (R/Ro)' and
# 'Absolute magnitude (Mv)' (note the space before the parenthesis).
column_aliases = {
    # Absolute magnitude variants
    'A_M': 'Absolute magnitude (Mv)',
    'AM': 'Absolute magnitude (Mv)',
    'Absolute magnitude': 'Absolute magnitude (Mv)',
    'absolute magnitude': 'Absolute magnitude (Mv)',
    'Absolute Magnitude (Mv)': 'Absolute magnitude (Mv)',
    'Absolute magnitude(Mv)': 'Absolute magnitude (Mv)',
    # Temperature / radius variants.
    # NOTE(review): presumably these short headers appear in the alternative
    # CSV files named in the upload prompt; verify against the actual data.
    'Temperature': 'Temperature (K)',
    'Temperature(K)': 'Temperature (K)',
    'R': 'Radius (R/Ro)',
    'Radius(R/Ro)': 'Radius (R/Ro)',
}
# rename() ignores aliases that are not present, so this is safe for any file.
df = df.rename(columns=column_aliases)

# Surface a missing column here rather than as a KeyError in a later cell.
required_columns = ['Temperature (K)', 'Radius (R/Ro)', 'Absolute magnitude (Mv)']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    print("Warning: columns needed by later cells are missing:", missing_columns)

print("Column fixed! Now your columns are:")
print(df.columns.tolist())

In [None]:
### Insight 3: Temperature vs Absolute Magnitude (The Magic Plot)
# Scatter of the two modelling features, coloured by star type.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=df,
    x='Temperature (K)',
    y='Absolute magnitude (Mv)',
    hue='Type',
    palette='deep',
    s=100,
    ax=ax,
)
ax.set_title('Temperature vs Absolute Magnitude by Star Type')
# Anchor the legend outside the axes so it does not cover any points.
ax.legend(title='Star Type', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.show()

print("Observation: All 6 star types form completely separate clusters — classification is very easy!")

In [None]:
### Insight 4: Radius Distribution by Star Type
# One box per star type; the y-axis is logarithmic because values are huge.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Type', y='Radius (R/Ro)', palette='Set2', ax=ax)
ax.set_title('Radius Distribution by Star Type')
ax.set_yscale('log')
ax.set_ylabel('Radius (log scale)')
plt.show()

print("Observation: Supergiants and Hypergiants have extremely large radii — some are thousands of times bigger than the Sun.")

In [None]:
# CELL: PREPARE DATA FOR MODELING
# The two columns used as model inputs; 'Type' is the target.
features = ['Temperature (K)', 'Absolute magnitude (Mv)']
X, y = df[features], df['Type']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training samples:", len(X_train))
print("Test samples:", len(X_test))

In [None]:
### Q3: k-Nearest Neighbours + Optimal k
# k is selected with 5-fold cross-validation on the training data only.
# Scoring candidate k values on the test set would leak it into model
# selection and make the final test accuracy optimistically biased.
# NOTE: the folds reuse features scaled on the full training split; wrapping
# the scaler and classifier in a Pipeline would remove that small leak too.
k_values = range(1, 21)
accuracies = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(knn, X_train_scaled, y_train, cv=5, scoring='accuracy')
    accuracies.append(fold_scores.mean())

plt.figure(figsize=(8,5))
plt.plot(k_values, accuracies, marker='o')
plt.title('KNN Cross-Validated Accuracy vs k')
plt.xlabel('k')
plt.ylabel('Mean CV Accuracy')
plt.grid(True)
plt.show()

# Look the winner up in k_values rather than assuming the range starts at 1.
best_k = int(k_values[int(np.argmax(accuracies))])
print(f"Optimal k = {best_k} → CV Accuracy = {max(accuracies):.4f}")

# Final KNN model: refit on the whole training split and evaluate once on
# the held-out test split.
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(X_train_scaled, y_train)
knn_pred = knn_model.predict(X_test_scaled)
knn_acc = accuracy_score(y_test, knn_pred)
print(f"\nFinal KNN Test Accuracy: {knn_acc:.4f}")

In [None]:
### Q4: Default SVM (Our Assigned Eager Learner)
# SVC with library-default hyperparameters (only the seed is set), trained on
# the scaled training split and scored on the scaled test split.
svm_default = SVC(random_state=42).fit(X_train_scaled, y_train)
svm_default_pred = svm_default.predict(X_test_scaled)
svm_default_acc = accuracy_score(y_test, svm_default_pred)

print(f"Default SVM Accuracy: {svm_default_acc:.4f}")

In [None]:
### Q5: Tuned SVM (Hyperparameter Tuning)
# Grid of C / kernel / gamma values searched with 5-fold cross-validation
# on the training split.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.01, 0.1]
}

grid = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train_scaled, y_train)
print("Best parameters found:", grid.best_params_)

# Score the best estimator found by the search on the test split.
svm_tuned = grid.best_estimator_
svm_tuned_pred = svm_tuned.predict(X_test_scaled)
svm_tuned_acc = accuracy_score(y_test, svm_tuned_pred)
print(f"Tuned SVM Accuracy: {svm_tuned_acc:.4f}")

In [None]:
### Q6: Model Comparison — Confusion Matrices
def plot_cm(y_true, y_pred, title, labels=None):
    """Draw a confusion-matrix heatmap for one model's predictions.

    Parameters
    ----------
    y_true : array-like
        Actual class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    title : str
        Title shown above the heatmap.
    labels : sequence, optional
        Class labels, in the order used for the matrix rows/columns and the
        axis ticks. Defaults to the six star types ``0..5``.
    """
    class_labels = list(range(6)) if labels is None else list(labels)

    # Passing the labels explicitly keeps the matrix the same size as the tick
    # labels even if some class is absent from both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title(title)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

# For each model: the pieces of its accuracy line, its test predictions and
# the title of its confusion-matrix plot.
model_reports = [
    (("KNN (k=", best_k, ") Accuracy:", f"{knn_acc:.4f}"), knn_pred, f'KNN (k={best_k})'),
    (("\nDefault SVM Accuracy:", f"{svm_default_acc:.4f}"), svm_default_pred, 'Default SVM'),
    (("\nTuned SVM Accuracy:", f"{svm_tuned_acc:.4f}"), svm_tuned_pred, 'Tuned SVM'),
]

for header_parts, predictions, plot_title in model_reports:
    print(*header_parts)
    plot_cm(y_test, predictions, plot_title)