In [None]:
import numpy as np
import matplotlib.pyplot as plt
def gaussian_pdf(x, mean, cov):
    """
    Compute the probability density function of a multivariate Gaussian distribution.

    The density is assembled in log-space and exponentiated once at the end.
    This avoids overflowing the (2*pi)**n_features normalisation term in high
    dimensions, and it solves a linear system instead of forming the explicit
    inverse of the covariance matrix.

    Parameters:
    x: numpy array of shape (n_features,)
        Data point.
    mean: numpy array of shape (n_features,)
        Mean of the Gaussian distribution.
    cov: numpy array of shape (n_features, n_features)
        Covariance matrix of the Gaussian distribution.

    Returns:
    float
        Probability density function of the Gaussian distribution for the given data point.
        NaN is returned when the determinant of cov is negative (not a valid covariance).

    Raises:
    numpy.linalg.LinAlgError
        If cov is singular.
    """
    n_features = x.shape[0]
    diff = x - mean

    # diff^T cov^{-1} diff, computed by solving cov @ z = diff rather than inverting cov.
    mahalanobis_sq = np.dot(diff, np.linalg.solve(cov, diff))

    # slogdet returns log|det(cov)| without ever forming the (possibly huge or tiny) determinant.
    sign, log_det = np.linalg.slogdet(cov)
    if sign < 0:
        # The square root of a negative determinant is undefined; report it as NaN.
        return np.float64(np.nan)

    log_density = -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis_sq)
    return np.exp(log_density)


def _log_gaussian_density(X, mean, cov):
    """Return the log-density of every row of X under the Gaussian N(mean, cov)."""
    n_features = X.shape[1]
    diff = X - mean
    # One linear solve for all samples instead of inverting cov once per sample.
    mahalanobis_sq = np.sum(diff * np.linalg.solve(cov, diff.T).T, axis=1)
    _, log_det = np.linalg.slogdet(cov)
    return -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis_sq)


def em_algorithm(X, n_components, max_iter=100, tol=1e-4):
    """
    Implement the EM algorithm for Gaussian Mixture Models.

    The E-step is evaluated in log-space with the log-sum-exp trick, so a sample
    whose density underflows to zero under every component (for example when the
    data lie far from the random initial means) still gets well-defined
    responsibilities instead of 0/0 = NaN.

    The initial means are drawn with np.random.randn, so results depend on the
    global NumPy random state; seed it beforehand for reproducible runs.

    Parameters:
    X: numpy array of shape (n_samples, n_features)
        Data points.
    n_components: int
        Number of mixture components.
    max_iter: int, optional
        Maximum number of iterations. Default is 100.
    tol: float, optional
        Tolerance level for convergence. Iteration stops once the average change
        in the means and the average change in the covariances are both below
        this value. Default is 1e-4.

    Returns:
    numpy array of shape (n_components,)
        Estimated mixing coefficients.
    list of numpy arrays of shape (n_features,)
        Estimated means for each component.
    list of numpy arrays of shape (n_features, n_features)
        Estimated covariances for each component.

    Raises:
    numpy.linalg.LinAlgError
        If a component covariance becomes singular during the iterations.
    """
    n_samples, n_features = X.shape

    # Initialize the mixing coefficients, means, and covariances
    mixing_coeffs = np.ones(n_components) / n_components
    means = [np.random.randn(n_features) for _ in range(n_components)]
    covariances = [np.eye(n_features) for _ in range(n_components)]

    for _ in range(max_iter):
        # E-step: log(pi_k) + log N(x_i | mu_k, Sigma_k) for every sample/component pair.
        with np.errstate(divide="ignore"):
            # A component whose weight has reached exactly 0 gets log-weight -inf.
            log_mixing = np.log(mixing_coeffs)
        log_weighted = np.column_stack([
            log_mixing[k] + _log_gaussian_density(X, means[k], covariances[k])
            for k in range(n_components)
        ])
        # Log-sum-exp: subtract the row maximum before exponentiating so that at
        # least one term per row is exp(0) = 1 and the normaliser is never zero.
        row_max = np.max(log_weighted, axis=1, keepdims=True)
        log_norm = row_max + np.log(np.sum(np.exp(log_weighted - row_max), axis=1, keepdims=True))
        responsibilities = np.exp(log_weighted - log_norm)

        # M-step: Update the parameters
        prev_means = list(means)
        prev_covariances = list(covariances)
        component_mass = responsibilities.sum(axis=0)
        mixing_coeffs = component_mass / n_samples
        for k in range(n_components):
            if component_mass[k] <= 0:
                # No sample is assigned to this component; a weighted average is
                # undefined, so keep its previous mean and covariance.
                continue
            weights = responsibilities[:, k]
            means[k] = np.dot(weights, X) / component_mass[k]
            diff = X - means[k]
            covariances[k] = np.dot(weights * diff.T, diff) / component_mass[k]

        # Check for convergence
        mean_diff = np.mean([np.linalg.norm(means[k] - prev_means[k]) for k in range(n_components)])
        cov_diff = np.mean([np.linalg.norm(covariances[k] - prev_covariances[k]) for k in range(n_components)])
        if mean_diff < tol and cov_diff < tol:
            break

    return mixing_coeffs, means, covariances


In [None]:
def main():
    """
    Fit Gaussian mixtures with the EM algorithm on synthetic data and print the estimates.

    For every combination of fitted component count and sample size, synthetic
    2-D data are drawn from identity-covariance Gaussians and passed to
    em_algorithm; the estimated mixing coefficients, means, and covariances are
    printed.

    Only three generating mean vectors are defined, so at most three components
    generate the data even when more are fitted. The requested number of samples
    is split across the components that actually generate data, so the dataset
    always contains exactly the reported sample size.
    """
    # Parameters for generating synthetic data
    # Dimensionality
    n_features = 2
    # Vary the number of mixture components
    n_component_values = [2, 4, 5]
    # Vary the size of the sample set
    sample_sizes = [500, 1000, 2000]
    for n_components in n_component_values:
        for n_samples in sample_sizes:
            # Generate synthetic data (re-seeded so every configuration is reproducible)
            np.random.seed(123)
            # One mean vector per generating component, each of length n_features
            component_means = [[-2] * n_features, [2] * n_features, [0] * n_features]
            # No more generating components than there are mean vectors available
            n_true_components = min(n_components, len(component_means))
            # Identity covariance matrices of size n_features x n_features
            component_covariances = [np.eye(n_features)] * n_true_components
            # Split n_samples as evenly as possible over the generating components
            base_count, remainder = divmod(n_samples, n_true_components)
            counts = [base_count + (1 if i < remainder else 0) for i in range(n_true_components)]
            data = np.concatenate([
                np.random.multivariate_normal(mean, cov, count)
                for mean, cov, count in zip(component_means, component_covariances, counts)
            ])

            # Estimate mixture parameters using the EM algorithm
            mixing_coeffs, means, covariances = em_algorithm(data, n_components)

            # Print the estimated mixture parameters
            print(f"\nSample Size: {n_samples}")
            print(f"\nNumber of Mixture Components: {n_components}")
            print("Estimated Mixing Coefficients:")
            print(mixing_coeffs)
            print("\nEstimated Means:")
            for k in range(n_components):
                print(f"Component {k+1}: {means[k]}")
            print("\nEstimated Covariances:")
            for k in range(n_components):
                print(f"Component {k+1}:\n{covariances[k]}")





# Run the experiment sweep defined in main(); it prints the estimates for every configuration.
main()


Sample Size: 500

Number of Mixture Components: 2
Estimated Mixing Coefficients:
[0.55755654 0.44244346]

Estimated Means:
Component 1: [-0.8489363  -0.09008342]
Component 2: [0.97450406 0.02998205]

Estimated Covariances:
Component 1:
[[4.20444111 3.99970893]
 [3.99970893 5.11962885]]
Component 2:
[[3.97562297 3.90254778]
 [3.90254778 5.00474527]]

Sample Size: 1000

Number of Mixture Components: 2
Estimated Mixing Coefficients:
[0.50048057 0.49951943]

Estimated Means:
Component 1: [-2.04313822 -2.03353507]
Component 2: [1.9992317  2.02287567]

Estimated Covariances:
Component 1:
[[1.01933424 0.01206615]
 [0.01206615 0.99222951]]
Component 2:
[[ 9.19702689e-01 -6.74430524e-05]
 [-6.74430524e-05  8.95314294e-01]]

Sample Size: 2000

Number of Mixture Components: 2
Estimated Mixing Coefficients:
[0.49942945 0.50057055]

Estimated Means:
Component 1: [2.00786963 2.07304579]
Component 2: [-2.02106055 -2.00494723]

Estimated Covariances:
Component 1:
[[0.9932048  0.05433335]
 [0.05433335

b) Change the mean vectors and the covariance matrices for the mixture components. Does this affect the EM algorithm?
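* Yes. Changing the means and covariances changes how much the components overlap, which affects how the algorithm estimates the parameters and the resulting mixture distribution: well-separated components are easy to recover, while heavily overlapping components make the responsibilities ambiguous, so convergence is slower and the estimates are less accurate.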

c) Vary the size of the sample set and repeat the scenarios from 1 a) and 1 b). What do you observe?
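* As the sample size increases, the EM algorithm has more data to work with and generally yields more accurate estimates. In the two-component runs above, for example, the 500-sample run ends far from the true means of (-2, -2) and (2, 2), whereas the 1000- and 2000-sample runs recover them closely (a single run also depends on the random initialization).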

d) Vary the number of mixture components, using more or fewer than were used to create the sample set. What do you observe?
* If we use more components than were used to create the sample set, the algorithm may overfit the data, resulting in poor generalization to new data. Conversely, if we use fewer components, the algorithm may underfit the data, resulting in an oversimplified model that does not capture the true underlying structure of the data.

e) What happens to the EM estimation process if you significantly increase the dimensionality of the distribution?
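* Increasing the dimensionality significantly raises the computational complexity and enlarges the parameter space (each covariance matrix alone has d(d+1)/2 free parameters), while the data become sparse relative to the volume of the space (the curse of dimensionality). As a result, the covariance estimates become unreliable or singular and convergence issues become more likely.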
