Import libraries

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Load dataset

In [27]:
# Load the raw campaign file; fields are parsed as tab-separated rather than comma-separated.
df = pd.read_csv(
    filepath_or_buffer="marketing_campaign.csv",
    delimiter="\t",
)

Column Renaming

In [28]:
# Normalise the header names: trim surrounding whitespace, lower-case everything,
# and turn inner spaces into underscores (literal match, not a regex).
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)

Handle Missing Values

In [29]:
# Impute missing income with the median income of the customer's education group.
#
# Only the missing entries are filled: the group medians are computed separately and
# passed to fillna, so an income that is already present is never overwritten. This
# matters for rows whose 'education' is itself missing, because groupby excludes NaN
# keys by default and yields no usable value for them.
income_group_median = df.groupby('education')['income'].transform('median')

# Fallback for rows that still have no value after the group fill (education missing,
# or every income in the group missing): use the median over the whole column.
income_overall_median = df['income'].median()

df['income'] = (
    df['income']
    .fillna(income_group_median)
    .fillna(income_overall_median)
)

Remove Duplicates

In [30]:
# Keep the first occurrence of each fully identical row (all columns compared)
# and discard any later repeats.
df = df.drop_duplicates(subset=None, keep="first")

Standardize Categorical Columns

In [31]:
# Trim whitespace and title-case the education labels. Note that str.title() lower-cases
# everything after the first letter of each word (e.g. 'PhD' becomes 'Phd').
df['education'] = df['education'].str.strip().str.title()

# Collapse marital-status variants into a smaller set of labels. The keys are written in
# title case because the mapping is applied AFTER whitespace/case normalisation; this way
# variants such as ' alone' or 'yolo' are folded in too, instead of slipping past an
# exact-match lookup on the raw values.
marital_status_map = {
    'Alone': 'Single',
    'Absurd': 'Single',
    'Yolo': 'Single',
    'Widow': 'Widowed',
    'Together': 'Married',
}

df['marital_status'] = (
    df['marital_status']
    .str.strip()
    .str.title()
    .replace(marital_status_map)
)

Convert Date & Add Customer Tenure

In [32]:
# Parse the dt_customer strings (day-month-year) into datetime values.
customer_dates = pd.to_datetime(df['dt_customer'], format="%d-%m-%Y")
df['dt_customer'] = customer_dates

# Tenure is the count of whole days between dt_customer and the moment this cell runs,
# so the resulting numbers depend on the execution date.
time_since_signup = pd.to_datetime("today") - customer_dates
df['customer_tenure_days'] = time_since_signup.dt.days

In [33]:
# Total spend: row-wise sum over the six product-category amount columns.
spend_columns = [
    'mntwines',
    'mntfruits',
    'mntmeatproducts',
    'mntfishproducts',
    'mntsweetproducts',
    'mntgoldprods',
]
df['total_spent'] = df[spend_columns].sum(axis=1)

# The divisor is (kids + teens + 1); the +1 keeps it non-zero for households
# that report no children.
children_plus_one = df['kidhome'] + df['teenhome'] + 1
df['avg_spend_per_child'] = df['total_spent'] / children_plus_one

# Number of accepted offers across the five campaign flag columns, and the same
# figure expressed as a fraction of those campaigns.
campaign_columns = [
    'acceptedcmp1',
    'acceptedcmp2',
    'acceptedcmp3',
    'acceptedcmp4',
    'acceptedcmp5',
]
df['total_cmp_offers'] = df[campaign_columns].sum(axis=1)
df['campaign_success_rate'] = df['total_cmp_offers'] / len(campaign_columns)

In [34]:
# Tukey fences: anything further than 1.5 * IQR outside the quartiles is treated as an outlier.
Q1, Q3 = df['income'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap (winsorise) rather than drop: values beyond a fence are pulled back onto it,
# values inside the fences and missing values are left as they are.
df['income'] = df['income'].clip(lower=lower_bound, upper=upper_bound)

In [35]:
# Draw the capped income distribution on an explicit figure/axes pair, write it to disk,
# and close the figure so it is not rendered inline or kept in memory.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df['income'], ax=ax)
ax.set_title("Boxplot: Income after Outlier Treatment")
fig.savefig("income_boxplot.png")
plt.close(fig)

In [36]:
# Visualise which cells are still null at this point in the notebook (one row per record,
# one column per field), save the image, then release the figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values Heatmap")
fig.savefig("missing_values_heatmap.png")
plt.close(fig)

In [37]:
# Persist the cleaned frame as CSV; index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf="cleaned_customer_data.csv",
    index=False,
)