In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn_extra.cluster import KMedoids  

# Load the data from the CSV file
data = pd.read_csv('Womens.csv')

# Define the target variable
# target_column = "Age"

# Select the relevant features (X)
X = data[['Age', 'Rating', 'Positive Feedback Count']]

# Split the data into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Create a transformer for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['Age', 'Rating', 'Positive Feedback Count'])
    ])

# Apply the preprocessing to both training and testing data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Add a customer segmentation step (K-Medoids clustering)
n_clusters = 10
kmedoids = KMedoids(n_clusters=n_clusters, random_state=42)
cluster_labels_train = kmedoids.fit_predict(X_train)  # Get cluster labels for training data
cluster_labels_test = kmedoids.predict(X_test)  # Get cluster labels for testing data

# Add cluster labels to the training and testing data
X_train = pd.DataFrame(X_train, columns=['Age', 'Rating', 'Positive Feedback Count'])
X_train['Cluster'] = cluster_labels_train
X_test = pd.DataFrame(X_test, columns=['Age', 'Rating', 'Positive Feedback Count'])
X_test['Cluster'] = cluster_labels_test

# Print the first few rows of the DataFrames with cluster labels
print(X_train.head())
print(X_test.head())




        Age    Rating  Positive Feedback Count  Cluster
0 -0.991912  0.720737                 -0.44412        1
1 -0.828835  0.720737                 -0.44412        1
2 -0.991912  0.720737                 -0.44412        1
3  1.291168  0.720737                 -0.44412        0
4 -0.339604  0.720737                 -0.44412        2
        Age    Rating  Positive Feedback Count  Cluster
0 -0.747297 -0.178294                 2.852032        8
1  1.128091 -0.178294                -0.444120        5
2  1.046552 -0.178294                 0.076325        5
3 -0.584219 -1.077326                 0.076325        3
4 -1.073451  0.720737                -0.444120        1


In [4]:
from sklearn.metrics import silhouette_score

# Calculate silhouette score for the training data
silhouette_avg = silhouette_score(X_train, cluster_labels_train)
print(f"Silhouette Score for Training Data: {silhouette_avg:.2f}")

# Calculate silhouette score for the testing data
silhouette_avg_test = silhouette_score(X_test, cluster_labels_test)
print(f"Silhouette Score for Testing Data: {silhouette_avg_test:.2f}")



Silhouette Score for Training Data: 0.59
Silhouette Score for Testing Data: 0.60
