# Simple Supervised Learning Example

## Linear regression

In [None]:
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
# import numpy as np
# import pandas as pd

## Artificially generated random male and female height and weight data

In [None]:
# # Number of data points for each gender
# n_gender = 100

In [None]:
# # Generate data for males
# male_heights = np.random.normal(175, 7, n_gender)
# male_weights = np.random.normal(70, 10, n_gender)

In [None]:
# # Generate data for females
# female_heights = np.random.normal(162, 6, n_gender)
# female_weights = np.random.normal(58, 8, n_gender)

In [None]:
# # Combine the data
# heights = np.concatenate([male_heights, female_heights])
# weights = np.concatenate([male_weights, female_weights])
# genders = ['Male'] * n_gender + ['Female'] * n_gender

In [None]:
# # Create a dataframe to store the data
# df_hw = pd.DataFrame({'Gender': genders, 'Height (cm)': heights, 'Weight (kg)': weights})

In [None]:
# df_hw.head(200)

In [None]:
# # Plot the data
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(12,5))
# sns.scatterplot(data=df_hw, x="Height (cm)", y="Weight (kg)")

In [None]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# # Extracting height and weight data
# X = df_hw['Height (cm)'].values.reshape(-1, 1)
# y = df_hw['Weight (kg)'].values

In [None]:
# # Fit linear regression model
# lr = LinearRegression().fit(X, y)

In [None]:
# # Predicted weights based on the model
# y_pred = lr.predict(X)

In [None]:
# # Calculate the mean squared error
# mse = mean_squared_error(y, y_pred)

In [None]:
# # Calculate the R-squared value
# r2 = r2_score(y, y_pred)

In [None]:
# # Extracting the slope (coefficient) and intercept
# slope = lr.coef_[0]
# intercept = lr.intercept_

In [None]:
# Collect the regression summary as display-ready strings (4 decimal places each).
# Expects slope, intercept, mse and r2 to already exist in the kernel namespace.
output = {}
output["Regression Equation"] = f"Weight (kg) = {slope:.4f} * Height (cm) + {intercept:.4f}"
output["Slope (Coefficient)"] = f"{slope:.4f}"
output["Intercept"] = f"{intercept:.4f}"
output["Mean Squared Error"] = f"{mse:.4f}"
output["R-squared (Coefficient of Determination)"] = f"{r2:.4f}"

In [None]:
# output

In [None]:
# # plot the regression line
# plt.figure(figsize=(12,5))
# sns.regplot(data=df_hw, x="Height (cm)", y="Weight (kg)", line_kws={"color": "red"})

# Simple Unsupervised Learning Example

## KMeans clustering

In [None]:
# plt.figure(figsize=(12,5))
# sns.scatterplot(data=df_hw, x="Height (cm)", y="Weight (kg)", hue="Gender")

In [None]:
# from sklearn.cluster import KMeans

In [None]:
# # Extract height and weight as features for clustering
# X_clustering = df_hw[['Height (cm)', 'Weight (kg)']]

In [None]:
# # Apply KMeans clustering with k=2
# kmeans = KMeans(n_clusters=2, random_state=42)
# df_hw['Cluster'] = kmeans.fit_predict(X_clustering)

In [1]:
# from sklearn.metrics import davies_bouldin_score

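# for k in range(2, 10):
#     # Use a separate name so the fitted k=2 `kmeans` model stays available to later cells
#     kmeans_k = KMeans(n_clusters=k, random_state=42)
#     # Score on the same two features (height, weight) that the clustering is fitted on
#     cluster_labels = kmeans_k.fit_predict(X_clustering)
#     score = davies_bouldin_score(X_clustering, cluster_labels)
#     print(f"For n_clusters={k}, the Davies-Bouldin score is {score}")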

In [None]:
# Contour (KDE) view of the two genders with the KMeans cluster assignment overlaid.
# Relies on df_hw (with 'Gender' and 'Cluster' columns) and the fitted kmeans model
# being present in the kernel namespace.

fig, ax = plt.subplots(figsize=(12, 8))

# Filled density contours, one family per gender. When `hue` is given seaborn colours
# the levels from `palette`; a `cmap` argument is ignored (with a warning) in that case,
# so the colour scheme is passed as `palette` instead.
sns.kdeplot(data=df_hw, x="Height (cm)", y="Weight (kg)", hue="Gender", palette="coolwarm", fill=True, thresh=0.05, levels=100, alpha=0.5, ax=ax)

# Individual points: colour encodes the predicted cluster, marker shape encodes the true gender.
sns.scatterplot(data=df_hw, x="Height (cm)", y="Weight (kg)", hue="Cluster", style="Gender", markers=["s", "D"], edgecolor='w', s=80, ax=ax)

# Cluster centres. Column 0 / column 1 are read as height / weight, which assumes the
# model was fitted on features ordered [Height (cm), Weight (kg)].
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', edgecolor='black', linewidth=1.5, label='Centroids')

ax.set_title("KMeans Clustering Results with Contours", fontsize=16)
ax.legend()
plt.show()

In [None]:
import pandas as pd
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score, normalized_mutual_info_score

# Score the KMeans result in two ways:
#   * internal metrics (silhouette, inertia, Davies-Bouldin) use only the features and cluster labels;
#   * external metrics (ARI, NMI) compare the cluster labels against the known gender labels.
# Relies on df_hw (with a 'Cluster' column), X_clustering and the fitted kmeans model
# being present in the kernel namespace.
predicted_labels = df_hw['Cluster']
# Encode the known gender as integers to serve as ground-truth labels.
true_labels = df_hw['Gender'].map({'Male': 0, 'Female': 1})

silhouette = silhouette_score(X_clustering, predicted_labels)
inertia = kmeans.inertia_
davies_bouldin = davies_bouldin_score(X_clustering, predicted_labels)
ari = adjusted_rand_score(true_labels, predicted_labels)
nmi = normalized_mutual_info_score(true_labels, predicted_labels)

metric_results = {
    "Silhouette Score": silhouette,
    "Inertia": inertia,
    "Davies-Bouldin Index": davies_bouldin,
    "Adjusted Rand Index (ARI)": ari,
    "Normalized Mutual Information (NMI)": nmi
}

# Interpretations are keyed by metric name (not by list position) so that adding,
# removing or reordering a metric cannot silently attach the wrong description.
# ARI is bounded below by -0.5 (not -1) according to the scikit-learn documentation.
metric_interpretations = {
    "Silhouette Score": "Higher is better (range: -1 to 1)",
    "Inertia": "Lower is better",
    "Davies-Bouldin Index": "Lower is better",
    "Adjusted Rand Index (ARI)": "Higher is better (range: -0.5 to 1)",
    "Normalized Mutual Information (NMI)": "Higher is better (range: 0 to 1)"
}

# Displaying the metric results with their interpretations
metric_results_with_description = {
    "Metric": list(metric_results.keys()),
    "Value": list(metric_results.values()),
    "Interpretation": [metric_interpretations[name] for name in metric_results]
}

metric_df = pd.DataFrame(metric_results_with_description)
metric_df

1. **Silhouette Score**:
    - Value: $0.4432$
    - Interpretation: Closer to 1 is better. This score suggests moderate clustering quality.
    - Description: Measures how similar each point is to its own cluster compared with the other clusters. Range: $[-1, 1]$.
    
2. **Inertia**:
    - Value: $18899.00$
    - Interpretation: Lower values indicate better clustering.
    - Description: Represents the within-cluster sum of squares. Lower values indicate that data points are closer to the centroids of their respective clusters.
    
3. **Davies-Bouldin Index**:
    - Value: $0.8522$
    - Interpretation: Lower is better. This value suggests the clusters are relatively well-separated.
    - Description: Measures the average similarity ratio of each cluster with its most similar cluster. Range: $[0, \infty)$, where 0 is the ideal.
    
4. **Adjusted Rand Index (ARI)**:
    - Value: $0.5161$
    - Interpretation: Closer to 1 is better; values near 0 correspond to random labelling. This value indicates a moderate to good match between the true labels and the clustering.
    - Description: Measures the similarity of the true and predicted labels, adjusted for chance. Range: $[-0.5, 1]$.
    
5. **Normalized Mutual Information (NMI)**:
    - Value: $0.4702$
    - Interpretation: Closer to 1 is better. This suggests moderate mutual information between the true and predicted labels.
    - Description: Measures the mutual information between true and predicted labels, normalized by their entropies. Range: $[0, 1]$.

## Testing

In [None]:
# # Generating data for 10 males and 10 females using the provided code
# n_gender = 10

In [None]:
# # Generate data for males
# male_heights = np.random.normal(175, 7, n_gender)
# male_weights = np.random.normal(70, 10, n_gender)

In [None]:
# # Generate data for females
# female_heights = np.random.normal(162, 6, n_gender)
# female_weights = np.random.normal(58, 8, n_gender)

In [None]:
# # Combine the data
# heights = np.concatenate([male_heights, female_heights])
# weights = np.concatenate([male_weights, female_weights])
# genders = ['Male 0'] * n_gender + ['Female 1'] * n_gender

In [None]:
# # Convert to DataFrame
# new_samples_df = pd.DataFrame({'Height (cm)': heights, 'Weight (kg)': weights, 'True Gender': genders})

In [None]:
# # Predict the cluster labels for the new samples
# new_samples_predictions = kmeans.predict(new_samples_df[['Height (cm)', 'Weight (kg)']])

In [None]:
# # Adding predictions to the DataFrame
# new_samples_df['Predicted Cluster'] = new_samples_predictions
# new_samples_df.head(20)  # Displaying all 20 rows (10 males + 10 females)


In [None]:
# Overlay the new test samples on the clustered original data.
fig, ax = plt.subplots(figsize=(10, 6))

# Original points, coloured by their KMeans cluster id
ax.scatter(
    df_hw['Height (cm)'],
    df_hw['Weight (kg)'],
    c=df_hw['Cluster'],
    cmap='viridis',
    alpha=0.6,
    s=50,
    label='Original Data',
)

# Test samples: larger diamond markers with a black outline and a contrasting colormap
ax.scatter(
    new_samples_df['Height (cm)'],
    new_samples_df['Weight (kg)'],
    c=new_samples_df['Predicted Cluster'],
    cmap='coolwarm',
    marker='D',
    s=100,
    edgecolors='black',
    label='Test Samples',
)

ax.set_xlabel('Height (cm)')
ax.set_ylabel('Weight (kg)')
ax.set_title('Original Data Clusters vs Test Samples')
ax.legend()
ax.grid(True)
plt.show()