In [1]:
import os
import numpy as np
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Function to extract image width and height features
def extract_image_features(image_path):
    img = Image.open(image_path)
    img = img.resize((150, 150))  # Resize to a fixed size
    width, height = img.size
    return width, height

# Directory paths and classes
train_dir = '/mnt/data/ML_DATASET_HENRY/Training'
test_dir = '/mnt/data/ML_DATASET_HENRY/Testing'
classes = ['glioma', 'meningioma', 'notumor', 'pituitary']

# Initialize lists to hold features and labels
X_train, X_test = [], []

# Process training and testing images
for class_name in classes:
    for image_name in os.listdir(os.path.join(train_dir, class_name)):
        image_path = os.path.join(train_dir, class_name, image_name)
        X_train.append(extract_image_features(image_path))

    for image_name in os.listdir(os.path.join(test_dir, class_name)):
        image_path = os.path.join(test_dir, class_name, image_name)
        X_test.append(extract_image_features(image_path))

# Convert to NumPy arrays and merge train/test sets
X_train = np.array(X_train)
X_test = np.array(X_test)
X_combined = np.vstack((X_train, X_test))

# Standardize features
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Apply k-means clustering with k=5
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_combined_scaled)

# Output cluster centers and labels
print(f"Cluster Centers: \n{kmeans.cluster_centers_}")
print(f"Cluster Labels: {kmeans.labels_}")


FileNotFoundError: [WinError 3] The system cannot find the path specified: '/mnt/data/ML_DATASET_HENRY/Training\\glioma'

In [2]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method to find the optimal k
inertia = []
k_range = range(1, 31)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_combined_scaled)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
plt.plot(k_range, inertia, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()


NameError: name 'X_combined_scaled' is not defined

In [3]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

# Perform agglomerative clustering
linked = linkage(X_combined_scaled, method='ward')

# Plot the dendrogram
plt.figure(figsize=(10, 7))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Dendrogram for Hierarchical Clustering')
plt.show()

# Applying Agglomerative Clustering
agg_clustering = AgglomerativeClustering(n_clusters=5)
agg_clustering.fit(X_combined_scaled)

# Output the cluster labels
print(f"Agglomerative Clustering Labels: {agg_clustering.labels_}")


NameError: name 'X_combined_scaled' is not defined

In [4]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# Logistic regression as the base model for feature selection
log_reg = LogisticRegression(max_iter=1000)

# Sequential Forward Selection (SFS)
sfs = SequentialFeatureSelector(log_reg, n_features_to_select=1, direction='forward')
sfs.fit(X_combined_scaled, kmeans.labels_)

# Display the selected features
selected_features = sfs.get_support()
print(f"Selected features: {selected_features}")


NameError: name 'X_combined_scaled' is not defined

In [5]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_combined_scaled)

# Plot the explained variance ratio
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA - Explained Variance vs. Number of Components')
plt.axhline(y=0.95, color='r', linestyle='--')
plt.show()

# Find the number of components that capture 95% variance
n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
print(f"Number of components to capture 95% variance: {n_components}")

# Transform the data with the top K principal components
pca = PCA(n_components=n_components)
X_pca_transformed = pca.fit_transform(X_combined_scaled)

# Split the data into training and test sets
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca_transformed, kmeans.labels_, test_size=0.2, random_state=42)

# Train logistic regression on PCA-transformed data
log_reg_pca = LogisticRegression(max_iter=1000)
log_reg_pca.fit(X_train_pca, y_train)

# Predict and calculate accuracy
y_pred_pca = log_reg_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"Accuracy with PCA-transformed data: {accuracy_pca:.4f}")


NameError: name 'X_combined_scaled' is not defined