In [None]:
!pip install imbalanced-learn xgboost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [None]:
# Load data: the email table plus the two event tables (opens, link clicks),
# read in that order from the working directory.
email_df, opened_df, clicked_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "email_table.csv",
        "email_opened_table.csv",
        "link_clicked_table.csv",
    )
)

In [None]:
# Label the clicked column: 1 when the email_id appears in the click table, else 0.
email_df['clicked'] = email_df['email_id'].isin(clicked_df['email_id']).astype(int)

# Columns that must never reach the model:
#   email_id - row identifier, not a predictor
#   opened   - outcome flag that the CTR-by-segment cell later adds to email_df
#              in place; if this cell is re-run after that one, it would leak
#              into the features. errors='ignore' keeps a fresh run (where the
#              column does not exist yet) working.
non_feature_cols = ['email_id', 'opened']

# One-hot encode categorical features (drop_first drops one dummy per category).
df_model = pd.get_dummies(
    email_df.drop(columns=non_feature_cols, errors='ignore'),
    drop_first=True,
)
X = df_model.drop(columns='clicked')
y = df_model['clicked']

In [None]:
# Apply SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Train model
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# option that recent releases no longer use and only warn about.
# eval_metric is set explicitly so the training objective's metric is unambiguous.
model = XGBClassifier(random_state=42, eval_metric='logloss')
model.fit(X_train, y_train)

In [None]:
# Predict and evaluate on the held-out split.
# predict_proba returns one column per class; column 1 is the score for label 1.
class_probs = model.predict_proba(X_test)
y_prob = class_probs[:, 1]
y_pred = model.predict(X_test)

report_text = classification_report(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_prob)

print("Classification Report:\n", report_text)
print("ROC-AUC Score:", auc_score)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Importance: horizontal bar chart of the ten highest-scoring model inputs.
features = X.columns
importances = model.feature_importances_
feat_imp = (
    pd.Series(importances, index=features)
    .sort_values(ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=ax)
ax.set_title("Top 10 Feature Importances")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()

In [None]:
# CTR by Segments
# Add 0/1 outcome flags to email_df: a row gets 1 when its email_id is present
# in the corresponding event table.
for flag_name, event_table in (('opened', opened_df), ('clicked', clicked_df)):
    email_df[flag_name] = email_df['email_id'].isin(event_table['email_id']).astype(int)

def ctr_by_column(column, frame=None):
    """Return the click-through rate per value of ``column``, highest first.

    Parameters
    ----------
    column : str or list of str
        Column name(s) to group by.
    frame : pandas.DataFrame, optional
        Table containing ``column`` and a numeric ``clicked`` column.
        Defaults to the notebook-level ``email_df`` when omitted.

    Returns
    -------
    pandas.Series
        Mean of ``clicked`` for each group, sorted in descending order and
        indexed by the group value.
    """
    # Look up the global lazily so an explicit frame works without email_df.
    source = email_df if frame is None else frame
    return source.groupby(column)['clicked'].mean().sort_values(ascending=False)

# Plot CTR by Email Version, Text, Country, Weekday
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (grouping column, panel title), listed in the row-major order of axes.flat.
panels = [
    ('email_version', 'CTR by Email Version'),
    ('email_text', 'CTR by Email Text'),
    ('user_country', 'CTR by Country'),
    ('weekday', 'CTR by Weekday'),
]

for ax, (column, title) in zip(axes.flat, panels):
    # Compute each CTR series once and reuse it for both the x and y values.
    ctr = ctr_by_column(column)
    sns.barplot(x=ctr.index, y=ctr.values, ax=ax)
    ax.set_title(title)
    ax.set_ylabel("Click Through Rate")

# Only the country panel gets rotated tick labels.
axes[1, 0].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Set visual theme
# NOTE: sns.set_theme changes plotting defaults globally, so it affects every
# figure drawn after this line runs, not just the ones in this cell.
sns.set_theme(style="whitegrid")

# Function to compute CTR
# NOTE: this re-defines the helper of the same name from the previous cell;
# the two definitions are kept identical so either cell can be run on its own.
def ctr_by_column(column, frame=None):
    """Return the click-through rate per value of ``column``, highest first.

    Parameters
    ----------
    column : str or list of str
        Column name(s) to group by.
    frame : pandas.DataFrame, optional
        Table containing ``column`` and a numeric ``clicked`` column.
        Defaults to the notebook-level ``email_df`` when omitted.

    Returns
    -------
    pandas.Series
        Mean of ``clicked`` for each group, sorted in descending order and
        indexed by the group value.
    """
    # Look up the global lazily so an explicit frame works without email_df.
    source = email_df if frame is None else frame
    return source.groupby(column)['clicked'].mean().sort_values(ascending=False)

# Compute CTRs (one Series per segment, sorted highest CTR first)
ctr_version = ctr_by_column('email_version')
ctr_text = ctr_by_column('email_text')
ctr_country = ctr_by_column('user_country')
ctr_weekday = ctr_by_column('weekday')

# Plot
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
bar_colors = sns.color_palette("colorblind")

# One entry per panel, in the row-major order of axes.flat:
# (CTR series, title, x-axis label, tick rotation in degrees or None).
panels = [
    (ctr_version, 'CTR by Email Version', 'Email Version', None),
    (ctr_text, 'CTR by Email Text', 'Email Text', None),
    (ctr_country, 'CTR by Country', 'User Country', 45),
    (ctr_weekday, 'CTR by Weekday', 'Day of Week', 30),
]

for ax, (ctr, title, xlabel, rotation) in zip(axes.flat, panels):
    # Hand seaborn at most one colour per bar; passing the whole palette list
    # when there are fewer bars makes newer seaborn warn about unused colours.
    # NOTE(review): seaborn >= 0.13 also deprecates `palette` without `hue`;
    # once the seaborn version is pinned, consider hue=<x values>, legend=False.
    sns.barplot(x=ctr.index, y=ctr.values, ax=ax, palette=bar_colors[:len(ctr)])
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Click Through Rate')
    ax.set_xlabel(xlabel)
    if rotation is not None:
        ax.tick_params(axis='x', rotation=rotation)

plt.suptitle('Click-Through Rate (CTR) Analysis by Segment', fontsize=16, fontweight='bold')
# rect leaves headroom at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
