In [1]:
import pandas as pd
email_df = pd.read_csv("email_table.csv")
opened_df = pd.read_csv("email_opened_table.csv")
clicked_df = pd.read_csv("link_clicked_table.csv")
#to see the first 5 rows
print(email_df.head())
print(opened_df.head())
print(clicked_df.head())

   email_id   email_text email_version  hour    weekday user_country  \
0     85120  short_email  personalized     2     Sunday           US   
1    966622   long_email  personalized    12     Sunday           UK   
2    777221   long_email  personalized    11  Wednesday           US   
3    493711  short_email       generic     6     Monday           UK   
4    106887   long_email       generic    14     Monday           US   

   user_past_purchases  
0                    5  
1                    2  
2                    2  
3                    1  
4                    6  
   email_id
0    284534
1    609056
2    220820
3    905936
4    164034
   email_id
0    609056
1    870980
2    935124
3    158501
4    177561


In [2]:
#count of rows in each table
print(f"total mails: {email_df.shape[0]}")
print(f"opened mails: {opened_df.shape[0]}")
print(f"clicked mails: {clicked_df.shape[0]}")

total mails: 100000
opened mails: 10345
clicked mails: 2119


## q1: ●	What percentage of users opened the email and what percentage clicked on the link within the email?

In [3]:
total_mails= email_df.shape[0]
opened_mails = opened_df.shape[0]
clicked_mails = clicked_df.shape[0]

open_rate = (opened_mails/total_mails)*100
clicked_rate = (clicked_mails/total_mails)*100

print(f"open_rate: {open_rate:.2f}%")
print(f"clicked_rate: {clicked_rate:.2f}%")
    

open_rate: 10.35%
clicked_rate: 2.12%


## q4: ● Did you find any interesting pattern on how the email campaign performed for different segments of users? 

In [4]:
#add 2 extra cols in email_table 
email_df["opened"] = email_df["email_id"].isin(opened_df["email_id"]).astype(int)
email_df["clicked"] = email_df["email_id"].isin(clicked_df["email_id"]).astype(int)
print(email_df.head())


   email_id   email_text email_version  hour    weekday user_country  \
0     85120  short_email  personalized     2     Sunday           US   
1    966622   long_email  personalized    12     Sunday           UK   
2    777221   long_email  personalized    11  Wednesday           US   
3    493711  short_email       generic     6     Monday           UK   
4    106887   long_email       generic    14     Monday           US   

   user_past_purchases  opened  clicked  
0                    5       0        0  
1                    2       1        1  
2                    2       0        0  
3                    1       0        0  
4                    6       0        0  


In [5]:
#click rates
print(email_df.groupby("email_text")["clicked"].mean()*100)
print(email_df.groupby("email_version")["clicked"].mean()*100)
print(email_df.groupby("hour")["clicked"].mean()*100)
print(email_df.groupby("weekday")["clicked"].mean()*100)
print(email_df.groupby("user_country")["clicked"].mean()*100)
print(email_df.groupby("user_past_purchases")["clicked"].mean()*100)

#open rates
print(email_df.groupby("email_text")["opened"].mean()*100)
print(email_df.groupby("email_version")["opened"].mean()*100)
print(email_df.groupby("hour")["opened"].mean()*100)
print(email_df.groupby("weekday")["opened"].mean()*100)
print(email_df.groupby("user_country")["opened"].mean()*100)
print(email_df.groupby("user_past_purchases")["opened"].mean()*100)



email_text
long_email     1.853767
short_email    2.387177
Name: clicked, dtype: float64
email_version
generic         1.513673
personalized    2.729409
Name: clicked, dtype: float64
hour
1     1.812801
2     1.632209
3     1.952278
4     1.618641
5     1.801252
6     1.714668
7     1.828376
8     1.893308
9     2.579435
10    2.823961
11    2.712816
12    2.566073
13    1.988891
14    2.074236
15    2.490696
16    2.319681
17    1.848917
18    1.618578
19    1.657459
20    1.219512
21    0.821918
22    1.960784
23    4.137931
24    2.898551
Name: clicked, dtype: float64
weekday
Friday       1.403682
Monday       2.290608
Saturday     1.784611
Sunday       1.675123
Thursday     2.444491
Tuesday      2.488864
Wednesday    2.761999
Name: clicked, dtype: float64
user_country
ES    0.832748
FR    0.800400
UK    2.467526
US    2.435981
Name: clicked, dtype: float64
user_past_purchases
0       0.050443
1       1.119919
2       1.534213
3       1.656040
4       2.140929
5       2.222960
6    

## Q2:can you build a model to optimize in future how to send emails to maximize the probability of users clicking on the link inside the email?

In [10]:
##Build a Click Prediction Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Encode categorical variables
le_text = LabelEncoder()
le_version = LabelEncoder()
le_country = LabelEncoder()
le_weekday = LabelEncoder()

email_df['email_text_encoded'] = le_text.fit_transform(email_df['email_text'])
email_df['email_version_encoded'] = le_version.fit_transform(email_df['email_version'])
email_df['user_country_encoded'] = le_country.fit_transform(email_df['user_country'])
email_df['weekday_encoded'] = le_weekday.fit_transform(email_df['weekday'])

# Feature & target selection
X = email_df[['email_text_encoded', 'email_version_encoded', 'hour', 'weekday_encoded',
              'user_country_encoded', 'user_past_purchases']]
y = email_df['clicked']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

# Evaluation
print(classification_report(y_test, y_pred))
print(f"AUC Score: {roc_auc_score(y_test, y_proba):.4f}")


              precision    recall  f1-score   support

           0       0.98      1.00      0.99     29326
           1       0.07      0.01      0.02       674

    accuracy                           0.97     30000
   macro avg       0.52      0.50      0.50     30000
weighted avg       0.96      0.97      0.97     30000

AUC Score: 0.5778


## Q3 ●	By how much do you think your model would improve click through rate (defined as # of users who click on the link/total users who receive the email). How would you test that?

In [12]:
# ✅ Add predicted probabilities to X_test
X_test = X_test.copy()  # Avoid SettingWithCopyWarning
X_test['predicted_proba'] = y_proba
X_test['actual'] = y_test.values  # Add actual outcomes for comparison

# ✅ Calculate baseline CTR (without model targeting — sending to everyone)
baseline_ctr = (X_test['actual'].sum() / X_test.shape[0]) * 100
print(f"📊 Baseline Click-Through Rate (CTR): {baseline_ctr:.2f}%")

# ✅ Select top 30% users based on predicted probability
top_30_pct = X_test[X_test['predicted_proba'] >= X_test['predicted_proba'].quantile(0.7)]

# ✅ Calculate actual CTR within the top 30% group
actual_ctr_in_top_30 = (top_30_pct['actual'].sum() / top_30_pct.shape[0]) * 100
print(f"🎯 Expected Click-Through Rate (CTR) by targeting top 30%: {actual_ctr_in_top_30:.2f}%")

# ✅ Calculate improvement factor
improvement_factor = actual_ctr_in_top_30 / baseline_ctr
print(f"📈 CTR Improvement Factor: {improvement_factor:.2f}x")

# ✅ How to test this improvement:
print("\n📝 To validate this improvement:")
print("👉 Perform an A/B test:")
print("   - Group A (control): Send emails randomly (as before)")
print("   - Group B (test): Send emails only to top 30% predicted clickers")
print("👉 Compare CTRs of both groups using a proportion z-test or chi-square test for significance.")


📊 Baseline Click-Through Rate (CTR): 2.25%
🎯 Expected Click-Through Rate (CTR) by targeting top 30%: 2.25%
📈 CTR Improvement Factor: 1.00x

📝 To validate this improvement:
👉 Perform an A/B test:
   - Group A (control): Send emails randomly (as before)
   - Group B (test): Send emails only to top 30% predicted clickers
👉 Compare CTRs of both groups using a proportion z-test or chi-square test for significance.
