# 1)

In [1]:
import numpy as np
from scipy.stats import multivariate_normal

# Given data points
data = np.array([
    [1, 0.6, 0.1],
    [0, -0.4, 0.8],
    [0, 0.2, 0.5],
    [1, 0.4, -0.1]
])

# Given parameters
pi = np.array([0.5, 0.5])
p = np.array([0.3, 0.7])
N1 = {
    'mean': np.array([1, 1]),
    'cov': np.array([[2, 0.5], [0.5, 2]])
}
N2 = {
    'mean': np.array([0, 0]),
    'cov': np.array([[1.5, 1], [1, 1.5]])
}

# Initialize arrays to store posterior probabilities
posterior_probs = np.zeros((len(data), 2))

# E-step: Compute posterior probabilities
for i in range(len(data)):
    likelihood1 = p[0]**data[i, 0] * (1-p[0])**(1-data[i, 0])
    likelihood2 = p[1]**data[i, 0] * (1-p[1])**(1-data[i, 0])
    
    likelihood_gaussian1 = multivariate_normal.pdf(data[i, 1:], mean=N1['mean'], cov=N1['cov'])
    likelihood_gaussian2 = multivariate_normal.pdf(data[i, 1:], mean=N2['mean'], cov=N2['cov'])

    denominator = pi[0] * likelihood1 * likelihood_gaussian1 + pi[1] * likelihood2 * likelihood_gaussian2
    
    posterior_probs[i, 0] = (pi[0] * likelihood1 * likelihood_gaussian1) / denominator
    posterior_probs[i, 1] = (pi[1] * likelihood2 * likelihood_gaussian2) / denominator

print("Posterior Probabilities:\n", posterior_probs)


ValueError: operands could not be broadcast together with shapes (4,3) (2,) 

# 2)

In [3]:
import numpy as np
from scipy.stats import multivariate_normal, bernoulli

# Given model parameters
π1 = 0.5
π2 = 0.5
p1 = 0.3
p2 = 0.7
μ1 = np.array([1, 1])
Σ1 = np.array([[2, 0.5], [0.5, 2]])
μ2 = np.array([0, 0])
Σ2 = np.array([[1.5, 1], [1, 1.5]])

# New observation
x_new = np.array([1, 0.3, 0.7])

# Calculate the likelihoods for the new observation
likelihood_cluster1 = (
    bernoulli.pmf(1, p1) *
    multivariate_normal.pdf(x_new[1:], mean=μ1, cov=Σ1)
)

likelihood_cluster2 = (
    bernoulli.pmf(1, p2) *
    multivariate_normal.pdf(x_new[1:], mean=μ2, cov=Σ2)
)

# Calculate unnormalized posteriors
unnormalized_posterior1 = π1 * likelihood_cluster1
unnormalized_posterior2 = π2 * likelihood_cluster2

# Normalize the posteriors
posterior1 = unnormalized_posterior1 / (unnormalized_posterior1 + unnormalized_posterior2)
posterior2 = unnormalized_posterior2 / (unnormalized_posterior1 + unnormalized_posterior2)

print("Posterior for Cluster 1:", posterior1)
print("Posterior for Cluster 2:", posterior2)

Posterior for Cluster 1: 0.20697271237150058
Posterior for Cluster 2: 0.7930272876284995


# 3)

In [8]:
import numpy as np
from scipy.stats import multivariate_normal

# Given observations
observations = np.array([
    [1, 0.6, 0.1],
    [0, -0.4, 0.8],
    [0, 0.2, 0.5],
    [1, 0.4, -0.1]
])

# Given parameters
pi = np.array([0.5, 0.5])
p = np.array([0.3, 0.7])
mean = np.array([[1, 1], [0, 0]])
cov = np.array([[[2, 0.5], [0.5, 2]], [[1.5, 1], [1, 1.5]]])

# Step 1: Calculate the likelihood of each observation belonging to each cluster
likelihoods = np.zeros((len(observations), len(pi)))
for i in range(len(observations)):
    for j in range(len(pi)):
        likelihoods[i, j] = pi[j] * p[j] ** observations[i, 0] * (1 - p[j]) ** (1 - observations[i, 0]) * \
                            multivariate_normal.pdf(observations[i, 1:], mean[j], cov[j])

# Step 2: Assign each observation to the cluster with the highest likelihood
assignments = np.argmax(likelihoods, axis=1)

# Now, we have assigned each observation to a cluster (0 or 1).

# Step 3: Calculate the Manhattan distance between each observation and all other observations in the same cluster
manhattan_distances = np.zeros((len(observations), len(observations)))
for i in range(len(observations)):
    for j in range(len(observations)):
        manhattan_distances[i, j] = np.abs(observations[i] - observations[j]).sum()

# Step 4: Calculate the average distance of each observation to all other observations in the same cluster
average_distances_same_cluster = np.zeros(len(observations))
for i in range(len(observations)):
    same_cluster_indices = np.where(assignments == assignments[i])[0]
    same_cluster_distances = manhattan_distances[i, same_cluster_indices]
    average_distances_same_cluster[i] = np.mean(same_cluster_distances)

# Step 5: Calculate the average distance of each observation to all observations in the other cluster
average_distances_other_cluster = np.zeros(len(observations))
for i in range(len(observations)):
    other_cluster_indices = np.where(assignments != assignments[i])[0]
    other_cluster_distances = manhattan_distances[i, other_cluster_indices]
    average_distances_other_cluster[i] = np.mean(other_cluster_distances)

# Step 6: Calculate the silhouette score for each observation
silhouette_scores = (average_distances_other_cluster - average_distances_same_cluster) / \
                    np.maximum(average_distances_other_cluster, average_distances_same_cluster)

# Step 7: Calculate the average silhouette score for the larger cluster
larger_cluster = np.argmax([np.sum(assignments == 0), np.sum(assignments == 1)])
average_silhouette_larger_cluster = np.mean(silhouette_scores[assignments == larger_cluster])

print("Silhouette score of the larger cluster under Manhattan distance:", average_silhouette_larger_cluster)



Silhouette score of the larger cluster under Manhattan distance: 0.7916666666666665
