In [None]:
#Importing the Libraries

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file

In [None]:
# Read the tab-separated marketing campaign file from the mounted Drive folder
# and display the resulting frame as the cell output.
data = pd.read_csv(
    '/content/drive/MyDrive/cognorise/Data analytics/marketing_campaign.csv',
    sep="\t",
)
data

In [None]:
# Print the first few rows of the dataset, followed by its summary statistics.
for preview in (data.head(), data.describe()):
    print(preview)



In [None]:
# Report the dataset dimensions: shape[0] is the row count, shape[1] the column count.
print(f"The number of rows : {data.shape[0]} and the number of columns : {data.shape[1]}")

# Lift the column display limit so wide previews show every column.
# This is a global pandas option, so it also affects every later cell.
pd.set_option("display.max_columns",None)

# Preview the first rows as the cell output.
data.head()

In [None]:
# Summary of the features (printed by pandas' DataFrame.info).
data.info()


In [None]:
# Build a copy of the data without any row that contains a missing value.
# Only `data_cleaned` is assigned here; `data` itself is left unchanged.
data_cleaned = data.dropna()
print(
    "The total number of data-points after removing the rows with missing values are:",
    data_cleaned.shape[0],
)

In [None]:
# Count the missing values in each column of the (uncleaned) `data` frame.
missing_per_column = data.isna().sum()
print(missing_per_column)


In [None]:
# Show the dtype of every column as the cell output.
data.dtypes

In [None]:
# Frequency of each category in the two text features, taken from the
# frame with missing rows removed (`data_cleaned`).
marital_counts = data_cleaned["Marital_Status"].value_counts()
education_counts = data_cleaned["Education"].value_counts()

print("Total categories in the feature Marital_Status:\n", marital_counts, "\n")
print("Total categories in the feature Education:\n", education_counts)

In [None]:
# Summary statistics of the dataset, shown as the cell output.
data.describe()

In [None]:
# Upper bound on Income; rows at or above it are treated as outliers.
INCOME_OUTLIER_CUTOFF = 600000

# Keep rows with a known income below the cut-off. A missing Income never
# satisfied the `<` comparison anyway (NaN compares False), so spelling out
# `notna()` selects the same rows while making that implicit drop visible.
income_is_valid = data["Income"].notna() & (data["Income"] < INCOME_OUTLIER_CUTOFF)

# `.copy()` makes `data` an independent frame, so the later cells that add
# columns to it do not write into a slice of the raw frame.
# Resetting the index gives the surviving rows a clean 0..n-1 index, the same
# labels used by the positional frames built from them later (scaled / PCA).
data = data.loc[income_is_valid].copy().reset_index(drop=True)
print("The total number of data-points after removing the outliers are:", len(data))

**Feature Engineering**

Feature Engineering for Enhanced Data Insight

To gain deeper insights from our dataset, we will engineer new features based on existing ones. Below are the enhancements we plan to implement:

1:Age Calculation: Derive the "Age" of a customer from the "Year_Birth" to better understand the demographic distribution.

2:Total Spending: Introduce a new feature "Spent" that aggregates the total expenditures of a customer across all subcategories, providing a holistic view of customer spending behavior.

3:Living Situation: Generate a "Living_With" feature from "Marital_Status" to categorize customers as living with a "Partner" or living "Alone," which aids in understanding household dynamics.



In [None]:
# Reference year for the age calculation (a fixed year, not "today").
REFERENCE_YEAR = 2021
data['Age'] = REFERENCE_YEAR - data['Year_Birth']

# Total spending: sum of the six product-category amounts.
# skipna=False keeps the semantics of chained `+` (a missing amount yields NaN).
spend_columns = ["MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts", "MntSweetProducts", "MntGoldProds"]
data["Spent"] = data[spend_columns].sum(axis=1, skipna=False)

# Number of adults in the household, derived from marital status:
# 2 when living with a partner, 1 when living alone.
# One explicit mapping replaces the two chained `replace` calls, which relied on
# pandas silently downcasting an object column to integers (deprecated behaviour);
# the explicit cast below guarantees an integer column on every pandas version.
ADULTS_BY_MARITAL_STATUS = {
    "Married": 2,
    "Together": 2,
    "Absurd": 1,
    "Widow": 1,
    "YOLO": 1,
    "Divorced": 1,
    "Single": 1,
    "Alone": 1,
}
adults = data["Marital_Status"].map(ADULTS_BY_MARITAL_STATUS)

# Fail with a clear message instead of carrying an unmapped status forward
# (it would otherwise break the Family_Size arithmetic in a later cell).
unmapped_statuses = data.loc[adults.isna(), "Marital_Status"].unique()
if len(unmapped_statuses) > 0:
    raise ValueError(f"Unmapped Marital_Status values: {sorted(map(str, unmapped_statuses))}")
data["Living_With"] = adults.astype("int64")

# Feature indicating total children living in the household
data["Children"] = data["Kidhome"] + data["Teenhome"]



4:Household Children Count: Create a "Children" feature to count the total number of children in a household, encompassing both kids and teenagers.

5:Household Size: Establish a "Family_Size" feature that sums "Living_With" and "Children" to provide a clearer picture of the household's composition.

6:Simplify Education Levels: Simplify the "Education" field into three broad categories to streamline analysis and reduce complexity.

7:Feature Reduction: Drop redundant features that will not be utilized in the modeling process to streamline the dataset and focus on the most impactful variables.

These transformations are designed to enhance our analytical capabilities and improve the effectiveness of our subsequent modeling efforts.

In [None]:
# Household size: the Living_With value plus the number of children.
data["Family_Size"] = data["Living_With"].add(data["Children"])

# Collapse the five education labels into three groups.
education_groups = {
    "Basic": "Undergraduate",
    "2n Cycle": "Undergraduate",
    "Graduation": "Graduate",
    "Master": "Postgraduate",
    "PhD": "Postgraduate",
}
data["Education"] = data["Education"].replace(education_groups)

# Shorter product column names, for clarity.
product_names = {
    "MntWines": "Wines",
    "MntFruits": "Fruits",
    "MntMeatProducts": "Meat",
    "MntFishProducts": "Fish",
    "MntSweetProducts": "Sweets",
    "MntGoldProds": "Gold",
}

# Columns treated as redundant and removed from the frame.
to_drop = ["Marital_Status", "Dt_Customer", "Z_CostContact", "Z_Revenue", "ID", "Year_Birth"]

data = data.rename(columns=product_names).drop(columns=to_drop)

In [None]:
# Re-inspect the frame summary after the feature-engineering cells.
data.info()

In [None]:
# Summary statistics for the selected features, transposed so each feature is a row.
data[['Age','Living_With', 'Children', 'Family_Size', 'Income', 'Spent', 'Complain']].describe().T

**Exploratory Data Analysis**

**Univariate Analysis**

Univariate analysis involves examining the distribution and characteristics of a single variable in a dataset, often through statistical summaries and visualizations like histograms or bar plots. By plotting the frequency of each feature, we can gain insights into the central tendency (like mean or median), dispersion (such as range and variance), and the presence of outliers or skewness in the data. This analysis helps identify patterns, anomalies, or typical values within each feature, which are crucial for understanding the overall data structure and informing further analysis or preprocessing steps.

In [None]:
continuous_columns = ['Age', 'Income', 'Spent']
categorical_columns = ['Living_With', 'Children', 'Family_Size', 'Complain', 'Education']
all_columns = continuous_columns + categorical_columns

# Two plots per row; an odd number of plots still needs one extra row.
full_rows, leftover = divmod(len(all_columns), 2)
num_rows = full_rows + leftover

# Build the grid and flatten it so each plot can be paired with one axis.
fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(10, num_rows * 3))
axes = axes.flatten()

# Histogram (with density curve) for continuous features, count plot for the rest.
for ax, col in zip(axes, all_columns):
    if col in continuous_columns:
        sns.histplot(data=data, x=col, bins=20, ax=ax, kde=True, color='blue')
        ax.set_title(f'Histogram of {col}')
    elif col in categorical_columns:
        sns.countplot(data=data, x=col, ax=ax, color='blue')
        ax.set_title(f'Count Plot of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# With an odd number of plots the last grid cell is unused: hide it.
if leftover:
    axes[-1].set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()


**Bivariate Analysis (features correlation)**

Before proceeding with the Data Preprocessing and Machine Learning steps, let's visualize some correlations and trends in the data.

Income x Total Spent:

In [None]:
# Income against total spend: scatter points with a fitted regression line in red.
fig, ax = plt.subplots(figsize=(8, 4))
sns.regplot(
    x='Income',
    y='Spent',
    data=data,
    scatter_kws={'s': 20},
    line_kws={'color': 'red'},
    ax=ax,
)

ax.set_title('Scatter Plot of Income vs. Spent with Trend Line')
ax.set_xlabel('Income')
ax.set_ylabel('Spent')
plt.show()

Income and Spent are two numerical features strongly correlated. There are some outliers, but I believe it is not necessary to drop them or perform an imputation technique.

**Average Spent x Age groups:**

In [None]:
# Ten-year age bins starting at the youngest customer. The `+ 10` on the stop
# value guarantees the last edge lies at or beyond the oldest customer.
bins = np.arange(data['Age'].min(), data['Age'].max() + 10, 10)  # Adjust the bin width as necessary
bin_centers = 0.5 * (bins[:-1] + bins[1:])  # x-position of each bin on the numeric age axis

# `include_lowest=True` puts customers whose age equals the first edge into the first bin.
data['Age_bins'] = pd.cut(data['Age'], bins=bins, include_lowest=True)

# Average 'Spent' per bin, grouped on the SAME binning as the column above so
# the youngest customers are part of the first bin's mean (a second `pd.cut`
# without `include_lowest` would silently leave them out).
# observed=False keeps empty bins, so the result lines up one-to-one with bin_centers.
mean_spent_per_bin = data.groupby('Age_bins', observed=False)['Spent'].mean()

# Use the numeric bin centers as x values so the line is drawn in age units,
# on the same scale as the histogram below.
grouped = pd.DataFrame({'Age_bins': bin_centers, 'Spent': mean_spent_per_bin.to_numpy()})

# Left axis: age histogram. Explicit axes are used throughout because, after
# twinx(), pyplot-level calls (plt.ylabel, plt.title, ...) act on the twin axis
# and would overwrite its 'Average Spent' label.
fig, ax1 = plt.subplots(figsize=(10, 4))
sns.histplot(data=data, x='Age', bins=bins, color='lightblue', kde=False, stat='count',
             label='Age Frequency', ax=ax1)
ax1.set_title('Histogram of Age with Average Spent Overlay')
ax1.set_xlabel('Age')
ax1.set_ylabel('Frequency')

# Right axis: average spend per age bin.
ax2 = ax1.twinx()
sns.lineplot(data=grouped, x='Age_bins', y='Spent', ax=ax2, color='darkblue', marker='o',
             label='Average Spent')
ax2.set_ylabel('Average Spent')

# Value label at each point; bins without customers have no mean and get no label.
for x, y in zip(grouped['Age_bins'], grouped['Spent']):
    if pd.notna(y):
        ax2.text(x, y, f'{y:.2f}', color='darkblue', ha='right', size=10)

# One combined legend for both axes.
handles, labels = [], []
for ax in (ax1, ax2):
    axis_handles, axis_labels = ax.get_legend_handles_labels()
    handles.extend(axis_handles)
    labels.extend(axis_labels)
ax2.legend(handles, labels, loc='center right')

plt.show()

**Average Spent x Education and Family_Size**

In [None]:
# One figure, two side-by-side panels.
fig, (ax_education, ax_family) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: average "Spent" by "Education"
sns.barplot(x='Education', y='Spent', data=data, ax=ax_education)
ax_education.set_title('Average Spent by Education')
ax_education.set_xlabel('Education')
ax_education.set_ylabel('Average Spent')

# Right panel: average "Spent" by "Family_Size"
sns.barplot(x='Family_Size', y='Spent', data=data, ax=ax_family)
ax_family.set_title('Average Spent by Family Size')
ax_family.set_xlabel('Family Size')
ax_family.set_ylabel('Average Spent')

# Adjust layout
fig.tight_layout()

plt.show()


We can observe that Postgraduate customers have the highest average Spent. Single (living alone) customers have the highest average Spent.

**Places of Purchase:**

In [None]:
# Purchase-count columns, one per purchase type.
purchase_columns = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumDealsPurchases']

# Total number of purchases per type across all customers.
totals = {
    'Purchase Type': purchase_columns,
    'Total Purchases': [data[column].sum() for column in purchase_columns],
}
df_totals = pd.DataFrame(totals)

# Create the bar chart
fig, ax = plt.subplots(figsize=(8, 3))
sns.barplot(x='Purchase Type', y='Total Purchases', data=df_totals, ax=ax)
ax.set_title('Total Purchases by Type')
ax.set_ylabel('Total Number of Purchases')
plt.xticks(rotation=45)  # Rotates labels to avoid overlap
plt.show()

In-store purchasing is the most frequent type of purchase among all consumers

**Analyzing the Campaigns performance:**

In [None]:
# Sum each campaign's acceptance column to get its number of acceptances.
campaigns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
counts = (
    data[campaigns]
    .sum()
    .rename_axis('Campaign')
    .reset_index(name='Count of Acceptances')
)

# Create the bar chart
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x='Campaign', y='Count of Acceptances', data=counts, ax=ax)
ax.set_title('Count of Customers Accepting Each Campaign')
ax.set_xlabel('Campaign')
ax.set_ylabel('Count of Acceptances')
plt.xticks(rotation=45)
plt.show()

The acceptance and purchase rates for the campaigns are low, with less than 10% of total customers participating. However, campaigns 3, 4, and 5 saw higher acceptance rates compared to campaigns 1 and 2, with campaign 2 performing the worst.

**Data Preprocessing**

We have to perform feature engineering to prepare the data for machine learning. The step includes:


*   Label encoding the categorical features.

*   Scaling the features using the standard scaler.
*   Create a subset dataframe for dimensionality reduction using PCA.


In [None]:
# Boolean mask over the columns: True where the dtype is 'object'.
cat = data.dtypes == 'object'

# Names of the columns selected by that mask.
object_cols = data.columns[cat].tolist()

print("Categorical variables in the dataset:", object_cols)

In [None]:
# Manual label encoding: each education level is numbered by its position
# in this list (Undergraduate=0, Graduate=1, Postgraduate=2).
education_levels = ["Undergraduate", "Graduate", "Postgraduate"]
education_mapping = {level: rank for rank, level in enumerate(education_levels)}

# Replace each 'Education' label with its number.
data['Education'] = data['Education'].map(education_mapping)

In [None]:
# Label-encode every column listed in `object_cols`, refitting the same
# encoder on each column in turn.
LE = LabelEncoder()
for column in object_cols:
    data[column] = LE.fit_transform(data[column])

In [None]:
# Frequency of each encoded value in the two columns.
for column in ("Living_With", "Education"):
    print(data[column].value_counts())

Now, we have all the data types as numeric.

In [None]:
# Columns left out of the scaled subset: the deals-accepted / promotion flags
# named in the original note, plus 'Complain', 'Response' and the 'Age_bins' helper.
cols_del = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4','AcceptedCmp5', 'Complain', 'Response', 'Age_bins']

# `drop` returns a new frame, so `data` itself is left untouched.
ds = data.drop(columns=cols_del)

# Standardize every remaining column and keep the column names.
scaler = StandardScaler()
scaled_ds = pd.DataFrame(scaler.fit_transform(ds), columns=ds.columns)
print("All features are now scaled")

In [None]:
# Preview the first five rows of the scaled frame.
scaled_ds.head(5)

**Dimensionality Reduction**
Dimensionality reduction is a critical preprocessing step before employing KMeans clustering for several reasons:

* **Improving Clustering Performance:** KMeans clustering calculates distances between points to form clusters. In high-dimensional spaces, distance metrics can become less meaningful—a phenomenon known as the "curse of dimensionality." Reducing the number of dimensions helps mitigate this issue, making the distance metric more reliable and the clustering results more meaningful.

* **Enhanced Visualization:** Visualization is a powerful tool for interpreting clustering results. However, visualizing high-dimensional data directly is not feasible. Reducing the data to two or three dimensions allows for effective visualization, enabling easier analysis and communication of the clustering outcomes.

* **Feature Selection and Noise Reduction:** Dimensionality reduction techniques like Principal Component Analysis (PCA) not only reduce the data dimensions but also help in identifying the most significant variables. This can be viewed as a form of feature selection, where only the features that contribute most to the variance in the data are retained. By focusing on these key features, KMeans can produce more distinct and interpretable clusters.

**Dimensionality reduction with Principal Component Analysis (PCA)**

PCA is a statistical technique used in data analysis to emphasize variation and bring out strong patterns in a dataset. PCA helps in identifying correlations and patterns in data that are not easily identified in raw data. The technique transforms the original variables into a new set of variables, which are called principal components. These principal components are orthogonal (meaning they are uncorrelated), and they are ordered so the first few retain most of the variation present in all of the original variables. PCA is widely used for dimensionality reduction in machine learning and data visualization.

The following PCA steps efficiently reduce the dimensionality of the data while attempting to retain the most significant variance present in the original dataset, which is often crucial for visualization, noise reduction, and speeding up further analysis.

The PCA object is initialized with n_components=3. This specifies that the PCA transformation will reduce the dimensionality of the dataset to three principal components.

In [None]:
# Create a 3-component PCA and fit it on the scaled features.
pca = PCA(n_components=3).fit(scaled_ds)

# Project the scaled features onto the three components and name the columns.
component_names = ["col1", "col2", "col3"]
PCA_ds = pd.DataFrame(pca.transform(scaled_ds), columns=component_names)

# Descriptive statistics, one component per row.
PCA_ds.describe().T

In [None]:
# The three PCA components become the three plot coordinates.
x, y, z = (PCA_ds[name] for name in ("col1", "col2", "col3"))

# 3D scatter of the data in the reduced space.
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z, c="maroon", marker="o")
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")
plt.show()

**Clustering - Customer Segmentation**

The steps involved in the Clustering are:

* Use the Elbow Method to determine the optimum number of clusters

* Apply KMeans clustering

* Examine the clusters

In [None]:
# Quick examination of elbow method to find numbers of clusters to make.
print('Elbow Method to determine the number of clusters to be formed:')

# Fit on the three PCA component columns only. The clustering cell later adds a
# 'Clusters' column to PCA_ds; selecting the components explicitly keeps this
# cell's result the same if it is re-run after that cell.
pca_features = PCA_ds[["col1", "col2", "col3"]]

Elbow_M = KElbowVisualizer(KMeans(random_state=42), k=10)
Elbow_M.fit(pca_features)
Elbow_M.show()

The visual above indicates that four is the optimum number of clusters for the data.

In [None]:
# Initiating the KMeans Clustering model
kmeans = KMeans(n_clusters=4, random_state=42)

# Cluster on the three PCA component columns only. This cell writes a 'Clusters'
# column into PCA_ds below; fitting on the whole frame would feed the previous
# labels back in as a fourth feature whenever the cell is re-run.
pca_features = PCA_ds[["col1", "col2", "col3"]]
cluster_labels = kmeans.fit_predict(pca_features)

# The labels come back as a plain array and are assigned by position, so all
# three frames must hold the same rows in the same order.
assert len(cluster_labels) == len(data) == len(scaled_ds), "frames are out of sync"

PCA_ds["Clusters"] = cluster_labels
# Adding the Clusters feature to the original dataframe.
data["Clusters"] = cluster_labels
scaled_ds["Clusters"] = cluster_labels

To examine the clusters, we can plot the PCA_ds data points in a 3-D space.

In [None]:
# 3D axes for the clustered PCA points.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

# Cluster ids in order of first appearance, with one colour per id from the jet map.
clusters = PCA_ds['Clusters'].unique()
colors = plt.cm.jet(np.linspace(0, 1, len(clusters)))

# Draw the members of each cluster as their own labelled scatter series.
for cluster, color in zip(clusters, colors):
    members = PCA_ds.loc[PCA_ds['Clusters'] == cluster, ['col1', 'col2', 'col3']]
    ax.scatter(
        members['col1'],
        members['col2'],
        members['col3'],
        c=[color],
        label=f'Cluster {cluster}',
        s=50,
    )

ax.set_title('3D Scatter Plot of PCA Results')
ax.legend()
plt.show()

It seems that the clusters are well defined, segmenting the dataset into four clusters.

**Evaluating the Model**

As we are working with unsupervised clustering, we do not have a labeled target with which to directly evaluate or score our model. Instead, the focus is to examine the patterns within the formed clusters and understand their characteristics.

To achieve this, we can conduct exploratory data analysis to visualize the data across different clusters and derive insights, enabling us to draw meaningful conclusions about the underlying cluster patterns.

Note that we can evaluate the clusters associated with both the scaled and the original data, as they correspond to the same rows.



In [None]:
# Bar plot of how many customers fall into each cluster.
fig, ax = plt.subplots(figsize=(6, 3))
sns.countplot(x='Clusters', data=data, ax=ax)
ax.set_title('Number of Customers in Each Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Customers')
plt.show()

The clusters are well distributed among customers.

In [None]:
# Income against total spend, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 4))
sns.scatterplot(
    x='Income',
    y='Spent',
    hue='Clusters',
    data=data,
    palette='viridis',
    s=20,
    ax=ax,
)

ax.set_title('Scatter Plot of Income vs. Spent by Cluster')
ax.set_xlabel('Income')
ax.set_ylabel('Spent')
ax.legend(title='Cluster')
plt.show()

We can notice that:

  * Clusters 1 and 3 both have low spending and low income.

  * Cluster 0 has average spending and average income.

  * Cluster 2 has high spending and high income.

**Radar Chart**

Radar charts, also known as spider charts, effectively visualize multivariate data, facilitating the comparison of features across different groups. However, it's crucial to note that radar charts can be misleading if there is significant variation in the scales of the variables. Therefore, scaling the data appropriately is essential for accurate and meaningful visualizations.



In [None]:
# NOTE: the later radar-chart cells also use this `pi`, so the import stays here.
from math import pi

# Per-cluster mean of each profile attribute, taken from the scaled frame
# so all attributes share one scale.
attributes = ['Income', 'Spent', 'Family_Size', 'Education', 'Age']
cluster_means = scaled_ds.groupby('Clusters')[attributes].mean().reset_index()

# Number of variables we're plotting.
num_vars = len(attributes)

# One evenly spaced angle per attribute; repeating the first angle closes the polygon.
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles += angles[:1]  # Complete the loop

# Draw one outlined, lightly filled polygon per cluster on a polar axis.
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

for index, row in cluster_means.iterrows():
    values = row[attributes].values.flatten().tolist()
    values += values[:1]  # Complete the loop
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=f'Cluster {int(row["Clusters"])}')
    ax.fill(angles, values, alpha=0.1)

# Labels for each attribute (the duplicated closing angle gets no tick).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(attributes)

# Add legend and title. The title names every plotted attribute, including Age.
plt.title('Average Income, Spent, Family Size, Education, and Age by Cluster')
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.show()


We can make the following observations:

**Cluster 0:** This cluster represents a middle group with moderate income and spending, and average family size. It contains the oldest demographic among all clusters.

**Cluster 1:** This group exhibits low income and low spending levels, yet it has the largest family size. It predominantly consists of older customers.

**Cluster 2:** This group boasts the highest income and spending, typically lives alone, and comprises the middle-aged group (40-50 years old).

**Cluster 3:** Characterized by the lowest income and spending of all groups, this cluster has an average family size and is primarily made up of the youngest customers.

In [None]:
# Per-cluster mean of each purchase-channel count, taken from the scaled frame.
attributes = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumDealsPurchases']
cluster_means = scaled_ds.groupby('Clusters')[attributes].mean().reset_index()

# Number of variables we're plotting.
num_vars = len(attributes)

# One evenly spaced angle per attribute; repeating the first angle closes the polygon.
# np.pi is used so this cell does not depend on an import made in another cell.
angles = [n / float(num_vars) * 2 * np.pi for n in range(num_vars)]
angles += angles[:1]  # Complete the loop

# Draw one outlined, lightly filled polygon per cluster on a polar axis.
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

for index, row in cluster_means.iterrows():
    values = row[attributes].values.flatten().tolist()
    values += values[:1]  # Complete the loop
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=f'Cluster {int(row["Clusters"])}')
    ax.fill(angles, values, alpha=0.1)

# Labels for each attribute (the duplicated closing angle gets no tick).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(attributes)

# Add legend and title. The title describes the purchase channels plotted here.
plt.title('Radar Chart: Average Web, Catalog, Store, and Deals Purchases by Cluster')
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.show()

In [None]:
# Per-cluster mean of each product-category amount, taken from the scaled frame.
attributes = ['Wines', 'Fruits', 'Meat', 'Fish', 'Sweets', 'Gold']
cluster_means = scaled_ds.groupby('Clusters')[attributes].mean().reset_index()

# Number of variables we're plotting.
num_vars = len(attributes)

# One evenly spaced angle per attribute, then the first angle again to close the loop.
angles = []
for n in range(num_vars):
    angles.append(n / float(num_vars) * 2 * pi)
angles.append(angles[0])

# Single polar axis shared by all clusters.
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

# One outlined, lightly filled polygon per cluster.
for _, row in cluster_means.iterrows():
    cluster_values = row[attributes].tolist()
    closed_values = cluster_values + cluster_values[:1]
    ax.plot(angles, closed_values, linewidth=1, linestyle='solid', label=f'Cluster {int(row["Clusters"])}')
    ax.fill(angles, closed_values, alpha=0.1)

# Attribute names on the spokes (the duplicated closing angle gets no tick).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(attributes)

# Add legend and title
plt.title('Radar Chart: Average Spending with Wines, Fruits, Meat, Fish, Sweets, and Gold by Cluster')
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.show()


**Exploring the campaigns' performance**

It is worth recalling that the campaign attributes were not used to cluster the customers, but analyzing them can still give some insight into how the campaigns performed in each group (cluster).

In [None]:
# Per-cluster SUM (total acceptances, not a mean) of each campaign flag,
# taken from the unscaled `data` frame.
attributes = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
cluster_means = data.groupby('Clusters')[attributes].sum().reset_index()

# Number of variables we're plotting.
num_vars = len(attributes)

# One evenly spaced angle per attribute, then the first angle again to close the loop.
angles = []
for n in range(num_vars):
    angles.append(n / float(num_vars) * 2 * pi)
angles.append(angles[0])

# Single polar axis shared by all clusters.
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

# One outlined, lightly filled polygon per cluster.
for _, row in cluster_means.iterrows():
    cluster_values = row[attributes].tolist()
    closed_values = cluster_values + cluster_values[:1]
    ax.plot(angles, closed_values, linewidth=1, linestyle='solid', label=f'Cluster {int(row["Clusters"])}')
    ax.fill(angles, closed_values, alpha=0.1)

# Attribute names on the spokes (the duplicated closing angle gets no tick).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(attributes)

# Add legend and title
plt.title('Radar Chart: Total Accpeted Promotions by Campaing for each Cluster')
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.show()


**Conclusion**

In this project, we have meticulously analyzed a Customer Dataset through the creation of customer segmentation. Our workflow encompassed data cleaning, extensive feature engineering, outlier management, and exploratory data analysis enhanced by visualizations. We then proceeded with data preprocessing, which included label encoding for categorical features, scaling of all features, and dimensionality reduction via PCA.

During the Machine Learning phase, we utilized the Elbow method to determine the optimal number of clusters (K) for the KMeans algorithm. After fitting the model, we conducted a thorough analysis of the resulting customer segments to extract distinct profiles for each cluster, revealing insightful patterns in consumer behavior.




Insights on Cluster Profiles:

**Cluster 0:**

* Exhibits high to average income and spending.
* Typically consists of families with 3 to 4 members.
* Represents the oldest age demographic.
* Shows significant expenditure on wines and gold.
* Purchases are well-distributed across catalog, store, and web channels.
* Predominantly responds to campaign 4.

**Cluster 1:**

* Characterized by low income and spending.
* Contains the largest families, with 4 to 5 members.
* Generally older in age.
* Purchases primarily through deals.
* Shows a low rate of campaign acceptance, with campaign 3 being the most accepted.

**Cluster 2:**

* Features the highest income and spending.
* Primarily consists of singles.
* Age range predominantly between 40 and 50 years.
* Favors purchases from catalogs.
* Shows a high acceptance rate for campaigns 1 and 5.

**Cluster 3:**

* Noted for the lowest income and spending.
* Comprises families of 2 or 3 members.
* Includes the youngest demographic.
* Mostly undergraduate or graduate education levels.
* Frequently purchases through deals.
* Exhibits the lowest campaign acceptance, with campaign 3 being notably accepted.

**Future Recommendations**

To enhance the robustness and applicability of our findings, future work could explore the integration of additional predictive modeling techniques and the application of advanced algorithms for dynamic segmentation. Employing time series analysis to track changes in customer behavior over time could provide deeper insights into trends and lifecycle patterns. Additionally, conducting A/B testing on selected campaigns within identified segments may offer concrete data on the effectiveness of targeted marketing strategies, thereby maximizing customer engagement and optimizing marketing spend.

This thoughtful approach not only underscores the detailed analysis undertaken but also highlights potential avenues for enriching the insights gleaned from customer data.

In [None]:
# Save the final customer table to a CSV file and download it (Google Colab only).

# `data` is the frame built by this notebook; no variable named `df` is ever
# defined, so writing `df.to_csv(...)` fails with a NameError on a fresh run.
data.to_csv('marcketcampaign.csv', index=False)

# Download the CSV file through the Colab file helper.
from google.colab import files
files.download('marcketcampaign.csv')