<a href="https://colab.research.google.com/github/MohamadHusseinIsmail/Anghami-Deliverables/blob/main/Publicis_Groupe_ETL%2BModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the five source extracts, one DataFrame per CSV file, in a single
# unpacking assignment (order of names matches order of paths).
media, web, crm, campaigns, costs = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/media_campaigns_big.csv",
        "/content/website_analytics_big.csv",
        "/content/crm_sales_big.csv",
        "/content/campaigns_dim_big.csv",
        "/content/costs_fees_big.csv",
    )
)


In [None]:
#Conducting EDA

def eda(df, name):
    """Print a quick exploratory summary of one DataFrame.

    Shows, in order: a banner with ``name``, the first rows (rich display),
    the ``info()`` report, ``describe(include='all')`` statistics, and the
    per-column count of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to summarise.
    name : str
        Label printed in the banner above the summary.

    Returns
    -------
    None
    """
    print(f"\n===== {name} =====")
    display(df.head())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print(df.describe(include='all'))
    print("Missing values:\n", df.isna().sum())

# Run the same summary over each of the three fact tables.
for label, frame in (("Media", media), ("Web", web), ("CRM", crm)):
    eda(frame, label)


In [None]:
# Dates: parse the "date" column on every table that is cleaned here.
for table in (media, web, crm, costs):
    table["date"] = pd.to_datetime(table["date"])

# Numeric fixes: anything that cannot be parsed as a number becomes NaN.
media_numeric_cols = ["impressions", "clicks", "spend_usd"]
crm_numeric_cols = ["leads", "qualified_leads", "sales", "revenue_usd"]
media[media_numeric_cols] = media[media_numeric_cols].apply(pd.to_numeric, errors="coerce")
crm[crm_numeric_cols] = crm[crm_numeric_cols].apply(pd.to_numeric, errors="coerce")

# Removing duplicates: exact duplicate rows are dropped in place.
for table in (media, web, crm):
    table.drop_duplicates(inplace=True)


In [None]:
# Media efficiency ratios.
# Zero denominators are mapped to NaN before dividing so an undefined ratio is
# stored as missing instead of +/-inf. Infinite values would otherwise flow
# into the exploratory plots that run before the later inf -> NaN cleanup.
# The rows that survive that cleanup are unchanged, because it turns inf into
# NaN and then drops them anyway.
media["CTR"] = media["clicks"] / media["impressions"].replace(0, np.nan)
media["CPC"] = media["spend_usd"] / media["clicks"].replace(0, np.nan)

# CRM funnel conversion rates (same zero-denominator guard).
crm["lead_to_qual"] = crm["qualified_leads"] / crm["leads"].replace(0, np.nan)
crm["qual_to_sale"] = crm["sales"] / crm["qualified_leads"].replace(0, np.nan)

# Merging media + CRM to compute ROAS / CPA.
# Left join keeps every media row; CRM columns are NaN where nothing matches.
# The media "platform" column is matched against the CRM "lead_source" column.
# NOTE(review): assumes CRM has at most one row per (date, campaign_name,
# lead_source); otherwise media spend is repeated across matches - confirm.
merged = media.merge(
    crm,
    left_on=["date","campaign_name","platform"],
    right_on=["date","campaign_name","lead_source"],
    how="left"
)

# Cost per sale and return on ad spend, per merged row.
merged["CPA"] = merged["spend_usd"] / merged["sales"].replace(0, np.nan)
merged["ROAS"] = merged["revenue_usd"] / merged["spend_usd"].replace(0, np.nan)


In [None]:
# Plot from a copy with +/-inf mapped to NaN. The ratio columns are only
# cleaned of infinities in a later cell, and infinite values can stretch the
# axes and make correlation coefficients undefined. `merged` is not modified.
plot_df = merged.replace([np.inf, -np.inf], np.nan)

# Spread of row-level ROAS.
sns.boxplot(x=plot_df["ROAS"])
plt.title("ROAS Distribution")
plt.show()

# Relationship between click-through rate and ROAS.
sns.scatterplot(data=plot_df, x="CTR", y="ROAS")
plt.title("ROAS vs CTR")
plt.show()

# Pairwise correlations between all numeric columns.
sns.heatmap(plot_df.corr(numeric_only=True), cmap="coolwarm")
plt.title("Correlation Matrix (numeric columns)")
plt.show()


In [None]:
#Data Validation

# Treat +/-inf ratios as missing, then drop incomplete rows.
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing, including columns the models never use - confirm this is intended.
merged = merged.replace([np.inf, -np.inf], np.nan).dropna()

# Removing extreme ROAS outliers: keep rows strictly between the 1st and 99th
# percentiles of ROAS.
q1 = merged["ROAS"].quantile(0.01)
q99 = merged["ROAS"].quantile(0.99)
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so later column assignments on `merged` do not raise
# SettingWithCopyWarning.
merged = merged[(merged["ROAS"] > q1) & (merged["ROAS"] < q99)].copy()


In [None]:
# Binary target: 1 when a row's ROAS is strictly above the median ROAS, else 0.
roas_median = merged["ROAS"].median()
merged["high_roas"] = merged["ROAS"].gt(roas_median).astype(int)

# Model inputs.
# NOTE(review): the target is derived from revenue_usd / spend_usd, and the
# inputs include spend_usd and sales - possible target leakage, confirm.
features = [
    "impressions",
    "clicks",
    "spend_usd",
    "CTR",
    "CPC",
    "leads",
    "qualified_leads",
    "sales",
]

X = merged.loc[:, features]
y = merged.loc[:, "high_roas"]


In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The inputs mix raw counts and spend with ratio columns (CTR, CPC), so they
# sit on very different scales. Logistic regression is regularised by default
# and converges poorly on unscaled inputs, so standardise first. The scaler is
# fitted on the training split only, inside the pipeline.
lr = LogisticRegression(max_iter=1000)
lr_pipeline = make_pipeline(StandardScaler(), lr)
lr_pipeline.fit(X_train, y_train)

# `lr` is the fitted estimator inside the pipeline (e.g. for lr.coef_).
# Predict through `lr_pipeline` so inputs are scaled the same way.
print(classification_report(y_test, lr_pipeline.predict(X_test)))


In [None]:
#Random forest Algorithm

from sklearn.ensemble import RandomForestClassifier

# Fixed seed: bootstrap sampling and feature sub-sampling are random, so
# without it the report (and the feature importances read from `rf`) change
# on every run. 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

print(classification_report(y_test, rf.predict(X_test)))


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default hyperparameters,
# evaluated on the held-out split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

xgb_report = classification_report(y_test, xgb.predict(X_test))
print(xgb_report)


In [None]:
# Importances of the fitted random forest, labelled by feature name and
# sorted ascending so the most important feature is the top bar.
importances = pd.Series(rf.feature_importances_, index=features)
ranked_importances = importances.sort_values()
ranked_importances.plot.barh()
plt.title("Feature Importance")
plt.show()


In [None]:
#Support Vector Machine
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# An RBF kernel works on distances between rows, so unscaled large-magnitude
# columns (impressions, spend) would dominate the ratio columns. Standardise
# inside a pipeline; the scaler is fitted on the training split only.
# random_state makes the internal cross-validation used by probability=True
# repeatable.
svm = SVC(kernel="rbf", probability=True, random_state=42)
svm_pipeline = make_pipeline(StandardScaler(), svm)
svm_pipeline.fit(X_train, y_train)

# `svm` is the fitted estimator inside the pipeline.
# Predict through `svm_pipeline` so inputs are scaled the same way.
print(classification_report(y_test, svm_pipeline.predict(X_test)))


In [None]:
#K-Means Algorithm for Campaign Segmentation

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Rows used for segmentation: performance ratios plus lead and sale volumes,
# complete cases only.
seg_features = merged[
    ["CTR","CPC","ROAS","leads","sales"]
].dropna()

# K-means is distance-based, so put every column on a comparable scale first.
X_scaled = StandardScaler().fit_transform(seg_features)

# n_init is set explicitly: its default differs between scikit-learn releases
# (and some releases warn when it is omitted), so pinning it keeps the
# clustering identical across environments.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
seg_features["cluster"] = kmeans.fit_predict(X_scaled)

# Cluster profile: mean of each (unscaled) feature per cluster.
seg_features.groupby("cluster").mean()


In [None]:
# Regression algorithm: predict the continuous ROAS value from the same
# feature set used by the classifiers.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

X_reg = merged[features]
y_reg = merged["ROAS"]

# Same 80/20 split settings as the classification split.
Xtr, Xte, ytr, yte = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Fixed seed: the forest's bootstrap sampling is random, so without it the
# reported MAE changes on every run.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(Xtr, ytr)

preds = rf_reg.predict(Xte)
print("MAE:", mean_absolute_error(yte, preds))


In [None]:
#Gradient Boosting Algorithm
from sklearn.ensemble import GradientBoostingClassifier

# Fixed seed so the fitted trees, and therefore the report, are repeatable
# between runs. 42 matches the seed used for the train/test split.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

print(classification_report(y_test, gb.predict(X_test)))


In [None]:
#Decision Tree Algorithm
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree (at most 6 levels) with a fixed seed, evaluated on the
# held-out split.
dt = DecisionTreeClassifier(random_state=42, max_depth=6)
dt.fit(X_train, y_train)

dt_report = classification_report(y_test, dt.predict(X_test))
print(dt_report)


In [None]:
# Final visualisations: row-level ROAS against spend, coloured by platform.

fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=merged,
    x="spend_usd",
    y="ROAS",
    hue="platform",
    alpha=0.6,
    ax=ax,
)
ax.set_title("ROAS vs Spend by Platform")
ax.set_xlabel("Spend (USD)")
ax.set_ylabel("ROAS")
plt.show()


In [None]:
# Funnel totals per platform, then the overall lead -> sale conversion rate
# (total sales divided by total leads).
funnel_cols = ["leads", "qualified_leads", "sales"]
funnel_rates = merged.groupby("platform")[funnel_cols].sum()
funnel_rates["lead_to_sale_rate"] = funnel_rates["sales"].div(funnel_rates["leads"])

rate_ax = funnel_rates["lead_to_sale_rate"].plot.bar(figsize=(7, 4))
rate_ax.set_title("Lead to Sale Conversion Rate by Platform")
rate_ax.set_ylabel("Rate")
plt.show()


In [None]:
# Importances of the fitted random forest classifier, sorted ascending so the
# most important feature is the top bar.
importances = pd.Series(rf.feature_importances_, index=features)
ranked_importances = importances.sort_values()
ranked_importances.plot.barh(figsize=(7, 4))
plt.title("Feature Importance for High ROAS Prediction")
plt.show()



In [None]:
# Per-platform roll-up: total spend, revenue and sales, plus the mean of the
# row-level ROAS values, rounded to two decimals.
summary = (
    merged.groupby("platform")
    .agg(
        spend_usd=("spend_usd", "sum"),
        revenue_usd=("revenue_usd", "sum"),
        ROAS=("ROAS", "mean"),
        sales=("sales", "sum"),
    )
    .round(2)
)

summary

