Implement the K-Means Clustering and Principal Component Analysis algorithms from scratch in Python using Numpy and Pandas and Matplotlib for visualization.

The algorithm must be implemented as a function with arguments (dataset)

In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def k_means_clustering(dataset, k=3, n_init=10, max_iter=300):
    # Define a function to calculate the Euclidean distance between two points
    def euclidean_distance(x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    # Define a function to initialize the centroids
    def initialize_centroids(Iris Dataset, k):
        centroids = dataset.copy()
        np.random.shuffle(centroids)
        return centroids[:k]
    
    # Define a function to assign each data point to the closest centroid
    def assign_clusters(dataset, centroids):
        clusters = []
        for i in range(len(dataset)):
            distances = [euclidean_distance(dataset[i], centroid) for centroid in centroids]
            cluster = np.argmin(distances)
            clusters.append(cluster)
        return clusters
    
    # Define a function to update the centroids based on the mean of the assigned data points
    def update_centroids(dataset, clusters, k):
        centroids = np.zeros((k, dataset.shape[1]))
        for i in range(k):
            centroids[i] = np.mean(dataset[clusters == i], axis=0)
        return centroids
    
    # Initialize the best cost and best centroids
    best_cost = float('inf')
    best_centroids = None
    
    # Run the K-Means algorithm multiple times with different initializations
    for i in range(n_init):
        # Initialize the centroids
        centroids = initialize_centroids(dataset, k)
        
        # Run the K-Means algorithm
        for j in range(max_iter):
            # Assign each data point to the closest centroid
            clusters = assign_clusters(dataset, centroids)
            
            # Update the centroids based on the mean of the assigned data points
            new_centroids = update_centroids(dataset, clusters, k)
            
            # Check if the centroids have changed
            if np.all(centroids == new_centroids):
                break
            
            # Update the centroids
            centroids = new_centroids
        
        # Calculate the cost of the current solution
        cost = 0
        for i in range(len(dataset)):
            cost += euclidean_distance(dataset[i], centroids[clusters[i]]) ** 2
        
        # Check if the current solution is better than the best solution found so far
        if cost < best_cost:
            best_cost = cost
            best_centroids = centroids
    
    # Return the best solution found
    return assign_clusters(dataset, best_centroids), best_centroids

def principal_component_analysis(dataset):
    # Calculate the mean of each feature
    mean = np.mean(dataset, axis=0)
    
    # Center the data by subtracting the mean from each data point
    centered_data = dataset - mean
    
    # Calculate the covariance matrix of the centered data
    covariance_matrix = np.cov(centered_data.T)
    
    # Calculate the eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    
    # Sort the eigenvalues and eigenvectors in descending order of eigenvalue magnitude
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    
    # Project the data onto the first three principal components
    projected_data = centered_data.dot(eigenvectors[:, :3])
    
    return projected_data, eigenvalues[:3]

# Load and prepare the Iris dataset
iris = load_iris()
data = pd.DataFrame(data=np.c_[iris['data'], iris['target']], columns=iris['feature_names'] + ['target'])
species_column=data['target']
dataset=data.drop('target',axis=1)

# Perform K-Means Clustering on the Iris dataset
clusters, _ = k_means_clustering(dataset.values)

# Perform Principal Component Analysis on the Iris dataset
projected_data, eigenvalues = principal_component_analysis(dataset.values)

# Create a 3D scatter plot to visualize the results of Principal Component Analysis and K-Means Clustering

fig=plt.figure(figsize=(15,5))
ax1=fig.add_subplot(131,projection='3d')
ax1.scatter(projected_data[:, 0], projected_data[:, 1], projected_data[:, 2], c=clusters)
ax1.set_title('K-Means Clustering')
ax1.set_xlabel('PC1')
ax1.set_ylabel('PC2')
ax1.set_zlabel('PC3')

ax2=fig.add_subplot(132)
ax2.bar(['PC1', 'PC2', 'PC3'], eigenvalues)
ax2.set_title('Principal Component Analysis')
ax2.set_ylabel('Eigenvalue')

ax3=fig.add_subplot(133,projection='3d')
ax3.scatter(projected_data[:, 0], projected_data[:, 1], projected_data[:, 2], c=species_column)
ax3.set_title('Actual Species')
ax3.set_xlabel('PC1')
ax3.set_ylabel('PC2')
ax3.set_zlabel('PC3')

plt.show()


SyntaxError: invalid syntax (3549128650.py, line 13)