In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Load data
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a path relative to the notebook.
df = pd.read_csv(r'C:\Users\tyler\OneDrive\Desktop\GitHub Repositories\DataScience2\Exit_Tickets\March26th\marketingData.csv')

# Handle missing values: impute Income with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection; the in-place form operates on
# an intermediate object and is not guaranteed to update `df` itself.
df['Income'] = df['Income'].fillna(df['Income'].median())

# Convert categorical variables to numerical with Label Encoding.
# The fitted encoders are kept so the integer codes can be decoded later.
categorical_columns = ['Education', 'Marital_Status']
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le

# Normalize numerical variables.
# The label-encoded columns are excluded explicitly: depending on the integer
# dtype the encoder returns they could otherwise match the int64 filter, be
# standardized as if they were measurements, and end up listed twice in
# `features` below.
scaler = StandardScaler()
numerical_columns = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop(categorical_columns, errors='ignore')
)
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Feature Selection: scaled numerical features plus the (unscaled) label codes
features = list(numerical_columns) + categorical_columns
X = df[features]

# Determine the best number of components based on silhouette score.
# The search is capped below the number of rows: a mixture cannot have more
# components than samples, and silhouette_score needs fewer distinct labels
# than samples. For data with 100+ rows this is the same range as before.
n_components_range = range(2, min(100, len(X)))
best_n_components = 2
best_silhouette_score = -1

for n_components in n_components_range:
    # Fixed random_state keeps each fit (and therefore the selection) repeatable.
    gmm = GaussianMixture(n_components=n_components, random_state=0)
    labels = gmm.fit(X).predict(X)

    # A fit can assign every row to one component; silhouette_score is
    # undefined for fewer than two distinct labels, so skip such fits.
    if len(set(labels)) < 2:
        continue

    silhouette_avg = silhouette_score(X, labels)

    # Strict '>' keeps the smallest component count among tied scores.
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        best_n_components = n_components

# Print the best number of clusters and rationale
print(f"The best number of clusters (k) found is: {best_n_components}.")
print(f"This value is considered the best because it achieved the highest silhouette score of {best_silhouette_score:.2f}.")

# Refit a mixture using the selected component count, then tag every row of
# the frame with the component it is assigned to.
gmm = GaussianMixture(random_state=0, n_components=best_n_components)
labels = gmm.fit(X).predict(X)
df['Segment'] = labels

# Create a copy of the data for Segment 0 so the edits below leave `df` untouched
segment_0 = df[df['Segment'] == 0].copy()

# Revert scaling for more interpretable results (original units)
segment_0[numerical_columns] = scaler.inverse_transform(segment_0[numerical_columns])

# Translate the label encoded features back to original.
# Round before casting: if an encoded column went through the scaler round
# trip it holds floats such as 2.9999999, and a bare astype(int) would
# truncate that to 2 and decode the wrong category. For columns that are
# already integers the rounding is a no-op.
for column, le in label_encoders.items():
    segment_0[column] = le.inverse_transform(segment_0[column].round().astype(int))

# Description of Segment 0 (the 'mean' row is read for the numeric columns below)
description = segment_0.describe(include='all')

# Print statement for describing the characteristics of Segment 0
print(f"Characteristics of Customer Segment 0:\n"
      f"- Average Year of Birth: {description.at['mean', 'Year_Birth']:.2f}\n"
      f"- Average Income: {description.at['mean', 'Income']:.2f}\n"
      f"- Average Amount Spent on Wines: {description.at['mean', 'MntWines']:.2f}\n"
      f"- Average Number of Deals Purchases: {description.at['mean', 'NumDealsPurchases']:.2f}\n"
      f"- Average Number of Web Purchases: {description.at['mean', 'NumWebPurchases']:.2f}\n"
      f"- Education Level (most common): {segment_0['Education'].mode()[0]}\n"
      f"- Marital Status (most common): {segment_0['Marital_Status'].mode()[0]}\n"
      f"- Proportion accepting Campaign 1: {description.at['mean', 'AcceptedCmp1']:.2f}")


The best number of clusters (k) found is: 2.
This value is considered the best because it achieved the highest silhouette score of 0.23.
Characteristics of Customer Segment 0:
- Average Year of Birth: 1969.36
- Average Income: 42384.96
- Average Amount Spent on Wines: 155.84
- Average Number of Deals Purchases: 2.56
- Average Number of Web Purchases: 3.57
- Education Level (most common): Graduation
- Marital Status (most common): Married
- Proportion accepting Campaign 1: 0.00
