## LOAD NECESSARY LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import kagglehub

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

#ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    r2_score, mean_squared_error, mean_absolute_error,
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# Notebook-wide plotting defaults: seaborn's white-grid theme and a
# 12x6 inch default figure size for every figure that does not set its own.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ All libraries imported successfully")

KeyboardInterrupt: 

## LOAD THE DATASET AND EYEBALL(DISPLAY) IT

In [None]:
# Download the Kaggle dataset identified by this handle and show the
# directory the download call returned.
DATASET_HANDLE = 'imakash3011/customer-personality-analysis'
path = kagglehub.dataset_download(DATASET_HANDLE)
path

Using Colab cache for faster access to the 'customer-personality-analysis' dataset.


'/kaggle/input/customer-personality-analysis'

In [None]:
import os

# Show what the download directory contains. Only the last expression of a
# cell is rendered, so the listing must be printed explicitly to be visible.
print(os.listdir(path))

# The file is tab-separated despite its .csv extension, hence sep='\t'.
file = os.path.join(path, 'marketing_campaign.csv')
df = pd.read_csv(file, sep='\t')
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,13-06-2013,46,709,...,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,10-06-2014,56,406,...,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,25-01-2014,91,908,...,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,24-01-2014,8,428,...,3,0,0,0,0,0,0,3,11,0


## Context
**Problem Statement**

Customer Personality Analysis is a detailed analysis of a company’s ideal customers. It helps a business to better understand its customers and makes it easier for them to modify products according to the specific needs, behaviors and concerns of different types of customers.

Customer personality analysis helps a business to modify its product based on its target customers from different types of customer segments. For example, instead of spending money to market a new product to every customer in the company’s database, a company can analyze which customer segment is most likely to buy the product and then market the product only on that particular segment.

Content
Attributes

**People**

ID: Customer's unique identifier

Year_Birth: Customer's birth year

Education: Customer's education level

Marital_Status: Customer's marital status

Income: Customer's yearly household income

Kidhome: Number of children in customer's household

Teenhome: Number of teenagers in customer's household

Dt_Customer: Date of customer's enrollment with the company

Recency: Number of days since customer's last purchase

Complain: 1 if the customer complained in the last 2 years, 0 otherwise



**Products**

MntWines: Amount spent on wine in last 2 years

MntFruits: Amount spent on fruits in last 2 years

MntMeatProducts: Amount spent on meat in last 2 years

MntFishProducts: Amount spent on fish in last 2 years

MntSweetProducts: Amount spent on sweets in last 2 years

MntGoldProds: Amount spent on gold in last 2 years



**Promotion**

NumDealsPurchases: Number of purchases made with a discount

AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise

AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise

AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise

AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise

AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise

Response: 1 if customer accepted the offer in the last campaign, 0 otherwise




**Place**

NumWebPurchases: Number of purchases made through the company’s website

NumCatalogPurchases: Number of purchases made using a catalogue

NumStorePurchases: Number of purchases made directly in stores

NumWebVisitsMonth: Number of visits to company’s website in the last month

**Target**
Need to perform clustering to summarize customer segments.

## DATA PREPROCESSING

In [None]:
# DATA PREPROCESSING
# Reports missing values, imputes Income with its median, and drops rows
# that are exact duplicates of an earlier row.

# Check for missing values (only columns that actually have some are printed)
print("\nMissing Values:")
missing_vals = df.isnull().sum()
print(missing_vals[missing_vals > 0])

# Handle missing values in Income
income_missing = df['Income'].isnull().sum()
if income_missing > 0:
    print(f"\nFilling {income_missing} missing Income values with median...")
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df['Income']: the in-place form operates on an intermediate object, which
    # triggers a chained-assignment warning on recent pandas and leaves df
    # unchanged once Copy-on-Write is enabled.
    df['Income'] = df['Income'].fillna(df['Income'].median())

# Check for duplicates (rows identical across every column)
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")
if duplicates > 0:
    df = df.drop_duplicates()
    print(f"✓ Removed {duplicates} duplicate rows")

This cell counts the missing values in each column and displays the columns that have any. Missing data can break models or give incorrect results.

If missing values are found in Income, they are filled with the median because it is resistant to outliers.

Duplicates are removed to avoid counting the same customer twice.

In [None]:
# Derived features: condense groups of raw columns into single per-customer values.
print("\nCreating derived features...")

# Column groups that are summed row-wise below.
spending_cols = ['MntWines', 'MntFruits', 'MntMeatProducts',
                 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
purchase_cols = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
# Includes 'Response' so the last campaign counts alongside campaigns 1-5.
campaign_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
                 'AcceptedCmp4', 'AcceptedCmp5', 'Response']

# Age as of a fixed snapshot year (current_year - Year_Birth).
current_year = 2014  # Based on the dataset's latest date
df['Age'] = df['Year_Birth'].rsub(current_year)

# Kids plus teenagers living in the household.
df['Total_Children'] = df['Kidhome'].add(df['Teenhome'])

# Row-wise totals over each column group.
df['Total_Spending'] = df[spending_cols].sum(axis='columns')
df['Total_Purchases'] = df[purchase_cols].sum(axis='columns')
df['Total_Campaigns_Accepted'] = df[campaign_cols].sum(axis='columns')

# Spending divided by (purchases + 1): the +1 keeps the denominator non-zero
# for customers with no recorded purchases, at the cost of a value slightly
# below the true per-purchase mean.
df['Avg_Spending_Per_Purchase'] = df['Total_Spending'].div(df['Total_Purchases'].add(1))

# Tenure in days, measured back from the most recent enrollment date in the data.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')
reference_date = df['Dt_Customer'].max()
tenure = reference_date - df['Dt_Customer']
df['Customer_Tenure_Days'] = tenure.dt.days

# 1 when the household has at least one kid or teenager, else 0.
df['Has_Children'] = df['Total_Children'].gt(0).astype(int)

print("✓ Created derived features:")
for summary_line in (
    "  - Age, Total_Children, Total_Spending, Total_Purchases",
    "  - Total_Campaigns_Accepted, Avg_Spending_Per_Purchase",
    "  - Customer_Tenure_Days, Has_Children",
):
    print(summary_line)

This cell creates new, more useful variables from the raw columns. `axis=1` means the sum is taken across columns for each row.

In [None]:
# Drop income outliers: keep only rows whose Income lies within
# 1.5 * IQR below the 25th percentile and 1.5 * IQR above the 75th percentile
# (both bounds inclusive).
Q1, Q3 = df['Income'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR

income_before = len(df)
df = df[df['Income'].between(lower_fence, upper_fence)]
print(f"Removed {income_before - len(df)} income outliers")

print(f"\n✓ Final dataset shape: {df.shape}")

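This cell uses the IQR (interquartile range) method to remove extreme income values: any row whose Income lies more than 1.5 × IQR below the 25th percentile or above the 75th percentile is dropped.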

## EXPLORATORY DATA ANALYSIS

Data is visualized to understand patterns before building models

In [None]:
# Spending and income overview: a 2x2 grid with two histograms, a box plot
# and a scatter plot, saved to spending_analysis.png.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
spend_hist_ax, spend_box_ax, income_hist_ax, scatter_ax = axes.flat

# Top-left: histogram of total spending
spend_hist_ax.hist(df['Total_Spending'], bins=40, edgecolor='black', color='skyblue')
spend_hist_ax.set(
    xlabel='Total Spending ($)',
    ylabel='Frequency',
    title='Distribution of Total Customer Spending',
)

# Top-right: box plot that summarizes the same distribution with quartiles
spend_box_ax.boxplot(df['Total_Spending'])
spend_box_ax.set(
    ylabel='Total Spending ($)',
    title='Total Spending - Box Plot',
)

# Bottom-left: histogram of income
income_hist_ax.hist(df['Income'], bins=40, edgecolor='black', color='lightcoral')
income_hist_ax.set(
    xlabel='Income ($)',
    ylabel='Frequency',
    title='Distribution of Customer Income',
)

# Bottom-right: relationship between income and spending
scatter_ax.scatter(df['Income'], df['Total_Spending'], alpha=0.5, color='green')
scatter_ax.set(
    xlabel='Income ($)',
    ylabel='Total Spending ($)',
    title='Income vs Total Spending',
)

fig.tight_layout()
fig.savefig('spending_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Spending by product category: one histogram per spending column on a 2x3
# grid, followed by a horizontal bar chart of the per-category means.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
colors = ['red', 'green', 'brown', 'blue', 'pink', 'gold']

# axes.flat walks the grid row by row, matching the order of spending_cols.
for ax, col, color in zip(axes.flat, spending_cols, colors):
    category = col.replace("Mnt", "")
    ax.hist(df[col], bins=30, edgecolor='black', color=color, alpha=0.7)
    ax.set(
        xlabel=f'{category} Spending ($)',
        ylabel='Frequency',
        title=f'Distribution of {category} Spending',
    )

fig.tight_layout()
fig.savefig('category_spending.png', dpi=300, bbox_inches='tight')
plt.show()

# Average spending by category, largest mean first
avg_spending = df[spending_cols].mean().sort_values(ascending=False)
avg_fig, avg_ax = plt.subplots(figsize=(10, 6))
avg_spending.plot(kind='barh', color='teal', ax=avg_ax)
avg_ax.set_xlabel('Average Spending ($)')
avg_ax.set_title('Average Spending by Product Category')
avg_fig.tight_layout()
avg_fig.savefig('avg_spending_by_category.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Demographic analysis: age histogram, education bar chart, marital-status
# pie chart and mean spending per number of children, on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
age_ax, education_ax, marital_ax, children_ax = axes.flat

# Age distribution
age_ax.hist(df['Age'], bins=30, edgecolor='black', color='lightgreen')
age_ax.set(
    xlabel='Age',
    ylabel='Frequency',
    title='Age Distribution of Customers',
)

# Education level (number of customers per level)
education_counts = df['Education'].value_counts()
education_ax.bar(education_counts.index, education_counts.values, color='orange')
education_ax.set(
    xlabel='Education Level',
    ylabel='Count',
    title='Distribution by Education Level',
)
education_ax.tick_params(axis='x', rotation=45)

# Marital status (share of customers per status)
marital_counts = df['Marital_Status'].value_counts()
marital_ax.pie(
    marital_counts.values,
    labels=marital_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
marital_ax.set_title('Distribution by Marital Status')

# Mean total spending for each number of children in the household
children_spending = df.groupby('Total_Children')['Total_Spending'].mean()
children_ax.bar(children_spending.index, children_spending.values, color='purple')
children_ax.set(
    xlabel='Number of Children',
    ylabel='Average Spending ($)',
    title='Average Spending by Number of Children',
)

fig.tight_layout()
fig.savefig('demographics_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Purchase Channel Analysis
# Left panel: mean number of purchases per channel.
# Right panel: total number of accepted offers per marketing campaign.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Short tick labels keyed by column name, so each bar's label always follows
# its column even if purchase_cols / campaign_cols are reordered or extended.
# A column without an entry falls back to its own name.
channel_labels = {
    'NumWebPurchases': 'Web',
    'NumCatalogPurchases': 'Catalog',
    'NumStorePurchases': 'Store',
}
campaign_labels = {
    'AcceptedCmp1': 'Camp1',
    'AcceptedCmp2': 'Camp2',
    'AcceptedCmp3': 'Camp3',
    'AcceptedCmp4': 'Camp4',
    'AcceptedCmp5': 'Camp5',
    'Response': 'Last',
}

# Purchase by channel
purchase_channels = df[purchase_cols].mean()
axes[0].bar([channel_labels.get(name, name) for name in purchase_channels.index],
            purchase_channels.values,
            color=['steelblue', 'coral', 'lightgreen'])
axes[0].set_ylabel('Average Number of Purchases')
axes[0].set_title('Average Purchases by Channel')

# Campaign acceptances (shows which marketing campaigns were the most successful).
# These are totals (column sums), not rates, so the title says so.
campaign_acceptance = df[campaign_cols].sum()
axes[1].bar([campaign_labels.get(name, name) for name in campaign_acceptance.index],
            campaign_acceptance.values, color='purple')
axes[1].set_ylabel('Total Acceptances')
axes[1].set_title('Total Acceptances by Marketing Campaign')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('purchase_campaign_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Correlation Analysis
# Pairwise correlations between the key numeric features, drawn as an
# annotated heatmap and saved to correlation_heatmap.png.
print("\nCreating correlation heatmap...")
numerical_cols = ['Age', 'Income', 'Total_Children', 'Recency',
                  'Total_Spending', 'Total_Purchases', 'NumWebVisitsMonth',
                  'Total_Campaigns_Accepted', 'Customer_Tenure_Days']
correlation_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,        # write the coefficient inside each cell
    fmt='.2f',
    cmap='coolwarm',
    center=0,          # centre the diverging colormap on zero correlation
    square=True,
    linewidths=1,
)
ax.set_title('Correlation Heatmap of Key Features', fontsize=14, pad=20)
fig.tight_layout()
fig.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## FEATURE ENGINEERING

This step creates a copy of the dataframe for modeling and converts the text (categorical) columns to numbers.

In [None]:
# Work on a copy so the encoded columns do not leak into the EDA frame.
df_model = df.copy()

# Integer-encode the text columns. Each fitted encoder is kept in
# label_encoders (keyed by column name) so the codes can be mapped back later.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes do
# not reflect any domain ordering — confirm that is acceptable for the models used.
categorical_cols = ['Education', 'Marital_Status']
label_encoders = {}

print(f"\nEncoding categorical features...")
for col in categorical_cols:
    le = label_encoders.setdefault(col, LabelEncoder())
    df_model[f'{col}_encoded'] = le.fit_transform(df_model[col])
    print(f"  ✓ {col}: {list(le.classes_)}")

# Predictor variables (X) for the target variable (Y); ID and date columns
# are deliberately left out.
feature_cols = (
    ['Age', 'Income', 'Total_Children', 'Recency', 'Total_Purchases',
     'NumWebVisitsMonth', 'NumDealsPurchases', 'Customer_Tenure_Days']
    + [f'{name}_encoded' for name in categorical_cols]
    + ['Has_Children']
    + ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
       'MntSweetProducts', 'MntGoldProds']
)

print(f"\n✓ Feature engineering completed")
print(f"Total features for modeling: {len(feature_cols)}")