## Import packages and data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pointbiserialr
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC


# (stray bare name `f` removed: it raised NameError when this cell ran on a fresh kernel)


In [11]:
# Read the campaign dataset (path is relative to the notebook location)
# and preview its first ten rows.
dataset_path = '../Data/digital_marketing_campaign_dataset.csv'
df = pd.read_csv(dataset_path)
df.head(10)

Unnamed: 0,CustomerID,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,PagesPerVisit,TimeOnSite,SocialShares,EmailOpens,EmailClicks,PreviousPurchases,LoyaltyPoints,AdvertisingPlatform,AdvertisingTool,Conversion
0,8000,56,Female,136912,Social Media,Awareness,6497.870068,0.043919,0.088031,0,2.399017,7.396803,19,6,9,4,688,IsConfid,ToolConfid,1
1,8001,69,Male,41760,Email,Retention,3898.668606,0.155725,0.182725,42,2.917138,5.352549,5,2,7,2,3459,IsConfid,ToolConfid,1
2,8002,46,Female,88456,PPC,Awareness,1546.429596,0.27749,0.076423,2,8.223619,13.794901,0,11,2,8,2337,IsConfid,ToolConfid,1
3,8003,32,Female,44085,PPC,Conversion,539.525936,0.137611,0.088004,47,4.540939,14.688363,89,2,2,0,2463,IsConfid,ToolConfid,1
4,8004,60,Female,83964,PPC,Conversion,1678.043573,0.252851,0.10994,0,2.046847,13.99337,6,6,6,8,4345,IsConfid,ToolConfid,1
5,8005,25,Female,42925,Social Media,Awareness,9579.388247,0.153795,0.161316,6,2.12585,7.752831,95,5,8,0,3316,IsConfid,ToolConfid,1
6,8006,38,Female,25615,Referral,Awareness,7302.899852,0.040975,0.060977,42,1.753995,10.698672,54,14,3,6,930,IsConfid,ToolConfid,1
7,8007,56,Female,57083,Social Media,Conversion,5324.283667,0.052878,0.188946,48,2.626015,2.987817,96,9,3,0,2983,IsConfid,ToolConfid,1
8,8008,36,Female,140788,Email,Retention,9421.250951,0.023536,0.112585,13,5.472843,14.287421,73,4,8,5,460,IsConfid,ToolConfid,1
9,8009,40,Male,130764,Social Media,Awareness,6229.193333,0.066641,0.169786,22,1.135665,4.613312,14,8,4,8,3789,IsConfid,ToolConfid,1


## Metrics for engagement

In [None]:
# Names of the dataframe columns that are treated as engagement signals.
engagement_columns = [
    "WebsiteVisits", "PagesPerVisit", "TimeOnSite",  # website-related columns
    "SocialShares",                                   # social column
    "EmailOpens", "EmailClicks",                      # e-mail columns
]


Index(['CustomerID', 'Age', 'Gender', 'Income', 'CampaignChannel',
       'CampaignType', 'AdSpend', 'ClickThroughRate', 'ConversionRate',
       'WebsiteVisits', 'PagesPerVisit', 'TimeOnSite', 'SocialShares',
       'EmailOpens', 'EmailClicks', 'PreviousPurchases', 'LoyaltyPoints',
       'AdvertisingPlatform', 'AdvertisingTool', 'Conversion'],
      dtype='object')

In [None]:

# Define features and target variable.
# The engagement columns are the model inputs; Conversion is the label the
# classifier is trained to predict.
engagement_columns = [
    "WebsiteVisits", "PagesPerVisit", "TimeOnSite",
    "SocialShares", "EmailOpens", "EmailClicks"
]
X = df[engagement_columns]  # Engagement features
y = df["Conversion"]  # Target variable (0 or 1)

# Split BEFORE scaling: the scaler must only see the training rows, otherwise
# the mean/variance of the held-out rows leaks into the preprocessing step.
# The same test_size / random_state are used, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix on the training-derived scale; kept so the name
# X_scaled remains available to any other cell that uses it.
X_scaled = scaler.transform(X)

# Train a Random Forest model
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Get feature importances (one value per entry of engagement_columns, same order)
feature_importances = rf.feature_importances_

# Pair each column with its importance and keep the pairs ordered from the
# most to the least important feature.
importance_dict = dict(
    sorted(
        zip(engagement_columns, feature_importances),
        key=lambda item: item[1],
        reverse=True,
    )
)
print(importance_dict)


{'PagesPerVisit': 0.22234600270247495, 'TimeOnSite': 0.21863140928368058, 'SocialShares': 0.16185729680437677, 'WebsiteVisits': 0.1599201399408388, 'EmailOpens': 0.13142737417211625, 'EmailClicks': 0.1058177770965127}


In [16]:
# Normalize importance scores to sum to 1.
# The total is computed once up front rather than once per key inside the
# comprehension; the resulting weights are the same.
importance_total = sum(importance_dict.values())
weights = {feature: score / importance_total for feature, score in importance_dict.items()}

# Print normalized weights
print(weights)


{'EmailOpens': 0.22984455944937354, 'EmailClicks': 0.22452075525255535, 'TimeOnSite': 0.21564598740669375, 'PagesPerVisit': 0.16424419033229762, 'WebsiteVisits': 0.1391330288518438, 'SocialShares': 0.02661147870723588}


In [26]:
# Build one weighted engagement score per row of df.
from sklearn.preprocessing import StandardScaler

# Standardize the engagement columns; the standardized values overwrite the
# original columns in df.
scaler = StandardScaler()
standardized_engagement = scaler.fit_transform(df[engagement_columns])
df[engagement_columns] = standardized_engagement

# EngagementScore = sum over engagement columns of (column value * its weight).
weighted_terms = (df[col] * weights[col] for col in engagement_columns)
df["EngagementScore"] = sum(weighted_terms)

# Show the score next to the customer identifier for the first few rows.
score_preview = df[["CustomerID", "EngagementScore"]].head()
print(score_preview)



   CustomerID  EngagementScore
0        8000        -0.267986
1        8001        -0.262384
2        8002         0.078242
3        8003         0.049025
4        8004        -0.201495


## Main drivers of engagement

In [27]:
# Remove CustomerID, ConversionRate, AdvertisingTool, Conversion and every
# individual engagement column; all remaining columns are kept.
columns_to_drop = ["CustomerID", "ConversionRate", "AdvertisingTool", "Conversion"]
columns_to_drop = columns_to_drop + engagement_columns
df_cleaned = df.drop(columns=columns_to_drop)

# Preview the reduced frame.
cleaned_preview = df_cleaned.head()
print(cleaned_preview)


   Age  Gender  Income CampaignChannel CampaignType      AdSpend  \
0   56  Female  136912    Social Media    Awareness  6497.870068   
1   69    Male   41760           Email    Retention  3898.668606   
2   46  Female   88456             PPC    Awareness  1546.429596   
3   32  Female   44085             PPC   Conversion   539.525936   
4   60  Female   83964             PPC   Conversion  1678.043573   

   ClickThroughRate  PreviousPurchases  LoyaltyPoints AdvertisingPlatform  \
0          0.043919                  4            688            IsConfid   
1          0.155725                  2           3459            IsConfid   
2          0.277490                  8           2337            IsConfid   
3          0.137611                  0           2463            IsConfid   
4          0.252851                  8           4345            IsConfid   

   EngagementScore  
0        -0.267986  
1        -0.262384  
2         0.078242  
3         0.049025  
4        -0.201495  


In [28]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): this cell fits on X_train / y_train, which come from the
# earlier train/test split cell (engagement columns -> Conversion), not on
# df_cleaned / EngagementScore. Confirm that this is the intended input.

# Random forest regressor with a fixed seed so repeated runs give the same fit.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out partition with mean squared error.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error: {mse_rf}")

# Report one importance value per column of X, in column order.
feature_importance_rf = rf_model.feature_importances_
rf_importance_pairs = zip(X.columns, feature_importance_rf)
for feature, importance in rf_importance_pairs:
    print(f"{feature}: {importance}")


Random Forest Mean Squared Error: 0.10503593750000001
WebsiteVisits: 0.15627834747602676
PagesPerVisit: 0.23435613314220335
TimeOnSite: 0.2271565704996011
SocialShares: 0.15987116118871972
EmailOpens: 0.12352994972846348
EmailClicks: 0.0988078379649855


In [29]:
from sklearn.ensemble import GradientBoostingRegressor

# NOTE(review): this cell fits on X_train / y_train, which come from the
# earlier train/test split cell (engagement columns -> Conversion), not on
# df_cleaned / EngagementScore. Confirm that this is the intended input.

# Gradient boosting regressor with a fixed seed so repeated runs give the same fit.
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Score the held-out partition with mean squared error.
y_pred_gb = gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
print(f"Gradient Boosting Mean Squared Error: {mse_gb}")

# Report one importance value per column of X, in column order.
feature_importance_gb = gb_model.feature_importances_
gb_importance_pairs = zip(X.columns, feature_importance_gb)
for feature, importance in gb_importance_pairs:
    print(f"{feature}: {importance}")


Gradient Boosting Mean Squared Error: 0.0976093628601278
WebsiteVisits: 0.12459616458448862
PagesPerVisit: 0.2451339288569527
TimeOnSite: 0.23276532469616076
SocialShares: 0.03721637536346161
EmailOpens: 0.1753168348090558
EmailClicks: 0.1849713716898806
