In [None]:
import pandas as pd

# Read the McDonald's CSV into a DataFrame
mcdonalds_path = '../input/mcdonalds/mcdonalds.csv'
mcdonalds = pd.read_csv(filepath_or_buffer=mcdonalds_path)

# Quick overview, printed in order: column names, (rows, columns), first 3 rows
for overview in (
    mcdonalds.columns.tolist(),
    mcdonalds.shape,
    mcdonalds.head(3),
):
    print(overview)


In [None]:
import pandas as pd
import numpy as np

# Requires the `mcdonalds` DataFrame loaded from the CSV in an earlier cell.

# Take the first eleven columns (positions 0..10), flag every cell that is
# exactly "Yes", and store the flags as integers (1 = "Yes", 0 = anything else)
MD_x = mcdonalds.iloc[:, :11].eq("Yes").astype(int)

# Per-column mean of the 0/1 flags, rounded to two decimals
column_means = MD_x.mean().round(2)

print(column_means)


In [None]:
from sklearn.decomposition import PCA

# Requires MD_x (the 0/1 indicator DataFrame built in an earlier cell).

# Fit a PCA that keeps all components
MD_pca = PCA()
MD_pca.fit(MD_x)

# explained_variance_ holds the *variance* captured by each component, so the
# square root is needed to report a true standard deviation.
pca_std_dev = np.sqrt(MD_pca.explained_variance_)
pca_var_ratio = MD_pca.explained_variance_ratio_

# Summary table: one row per principal component
print("Importance of components:")
print(pd.DataFrame({
    "Standard deviation": np.round(pca_std_dev, 4),
    "Proportion of Variance": np.round(pca_var_ratio, 4),
    "Cumulative Proportion": np.round(np.cumsum(pca_var_ratio), 4)
}))


In [None]:
# Assuming you have already defined MD_pca as in the previous code snippet

# Function to print PCA object with specified number of digits
def print_pca(pca_obj, digits):
    """Print the standard deviations and rotation matrix of a fitted PCA.

    Parameters
    ----------
    pca_obj : object
        Fitted PCA-like object exposing ``n_components_``,
        ``explained_variance_`` and ``components_``.
    digits : int
        Number of decimals every printed value is rounded to.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # explained_variance_ contains variances; take the square root so the
    # numbers printed under "Standard deviations" really are standard deviations.
    std_devs = np.sqrt(pca_obj.explained_variance_)
    # scikit-learn stores components_ with one row per component; transpose it
    # so the printed matrix has the (features x components) layout that the
    # "Rotation (n x k)" header announces.
    rotation = np.transpose(pca_obj.components_)

    print("Standard deviations (1, .., p={}):".format(pca_obj.n_components_))
    print(np.round(std_devs, digits))
    print("Rotation (n x k) = ({} x {}):".format(rotation.shape[0], rotation.shape[1]))
    print(np.round(rotation, digits))

# Show the fitted PCA with every value rounded to one decimal place
print_pca(pca_obj=MD_pca, digits=1)


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Requires MD_x (the 0/1 indicator DataFrame built in an earlier cell).

# Fit a PCA on MD_x, then project the same rows onto the components
MD_pca = PCA()
MD_pca.fit(MD_x)
transformed_data = MD_pca.transform(MD_x)

# Scores on the first two principal components
pc1_scores = transformed_data[:, 0]
pc2_scores = transformed_data[:, 1]

# Grey scatter of PC1 against PC2
plt.scatter(pc1_scores, pc2_scores, color='grey')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA Plot")
plt.show()


In [None]:
from sklearn.cluster import KMeans
import numpy as np

from sklearn.metrics import silhouette_score

# Set random seed
np.random.seed(1234)

# Requires MD_x (the 0/1 indicator DataFrame built in an earlier cell).

# Fit K-means for k = 2..8 and keep the solution with the best silhouette.
# Inertia (within-cluster sum of squares) keeps shrinking as clusters are
# added, so choosing the minimum-inertia model would in practice always return
# the largest k. The mean silhouette coefficient does not have that bias, so it
# is used as the selection criterion (higher is better).
k_values = range(2, 9)
best_model = None
best_score = float('-inf')  # best mean silhouette coefficient seen so far

for k in k_values:
    model = KMeans(n_clusters=k, n_init=10, random_state=1234)
    model.fit(MD_x)
    score = silhouette_score(MD_x, model.labels_)
    if score > best_score:
        best_model = model
        best_score = score

# Cluster assignment of every row under the selected model
cluster_labels = best_model.labels_

# Print cluster labels
print(cluster_labels)


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Requires MD_x (the 0/1 indicator DataFrame built in an earlier cell).

# Candidate cluster counts (2 through 8) and the inertia of each fit
k_values = range(2, 9)
inertia_values = []

for k in k_values:
    # One K-means fit per k: 10 initialisations, fixed seed
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=1234).fit(MD_x)
    inertia_values += [kmeans.inertia_]

# Elbow curve: within-cluster sum of squares against the number of clusters
plt.plot(k_values, inertia_values, marker='o')
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Method to Choose Number of Clusters")
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import silhouette_score

# Requires MD_x (the 0/1 indicator DataFrame built in an earlier cell).

# MD_x already holds numeric 0/1 indicators, so it is used directly as a float
# array. One-hot encoding it again (dropping the first category) would only
# reproduce the same 0/1 columns.
MD_x_encoded = np.asarray(MD_x, dtype=float)

# Set random seed (controls the bootstrap resampling below)
np.random.seed(1234)

# Bootstrapping parameters
n_bootstraps = 100      # number of bootstrap resamples
k_values = range(2, 9)  # candidate cluster counts: 2..8
n_rep = 10              # K-means initialisations per fit

# One selected model per bootstrap resample
boot_results = []
n_samples = len(MD_x_encoded)

# Bootstrap loop
for _ in range(n_bootstraps):
    # Draw row indices with replacement to build a resample of the same size
    indices = np.random.choice(n_samples, size=n_samples, replace=True)
    bootstrap_sample = MD_x_encoded[indices]

    # Select k by mean silhouette coefficient (higher is better). Selecting by
    # minimum inertia would in practice always pick the largest k, because
    # inertia keeps shrinking as clusters are added.
    best_model = None
    best_score = float('-inf')
    for k in k_values:
        model = KMeans(n_clusters=k, n_init=n_rep, random_state=1234)
        model.fit(bootstrap_sample)
        score = silhouette_score(bootstrap_sample, model.labels_)
        if score > best_score:
            best_model = model
            best_score = score

    # Store the best model for this bootstrap iteration
    boot_results.append(best_model)

# Print the number of clusters selected in each bootstrap iteration
for i, model in enumerate(boot_results):
    print(f"Bootstrap {i+1} - Number of Clusters: {model.n_clusters}")

# Note: You can analyze the results further as needed.
