### Making the API Call to Get IHC Data

In [None]:
from api_key import IHC_API_KEY

In [None]:
import sqlite3
import pandas as pd

# Location of the SQLite database file used by every query in this notebook.
db_path = "data/challenge.db"

# Open through a URI in read-write mode so that a wrong path fails right here
# with sqlite3.OperationalError. A plain connect(db_path) would silently create
# a new, empty database file and the later table queries would fail instead.
conn = sqlite3.connect(f"file:{db_path}?mode=rw", uri=True)

In [None]:
# List the names of all tables registered in the SQLite catalogue.
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(sql=query, con=conn)
# Bare expression: the notebook renders the frame as the cell output.
tables

In [None]:
# Fetch the column definitions of the attribution table and show them.
query = "PRAGMA table_info(attribution_customer_journey);"
schema = pd.read_sql_query(sql=query, con=conn)
print(schema)

In [None]:
import json

# Load the prepared request payload. The encoding is given explicitly because
# the platform default (e.g. cp1252 on Windows) can mis-decode a UTF-8 file.
with open('customer_journeys.json', 'r', encoding='utf-8') as f:
    api_data = json.load(f)

# Dump the journeys so the loaded structure can be inspected in the cell output.
print(api_data['customer_journeys'])

In [None]:
def chunk_data(data, chunk_size):
    """Split data into smaller chunks.

    Args:
        data: A mapping (anything exposing ``keys()``) or another iterable
            such as a list.
        chunk_size: Maximum number of entries per chunk; must be positive.

    Yields:
        For a mapping, dicts holding up to ``chunk_size`` key/value pairs in
        the mapping's key order. For any other iterable, lists of up to
        ``chunk_size`` items in iteration order.

    Raises:
        ValueError: If ``chunk_size`` is not positive. As this is a generator,
            the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently yield nothing.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    if hasattr(data, "keys"):
        keys = list(data.keys())
        for start in range(0, len(keys), chunk_size):
            yield {k: data[k] for k in keys[start:start + chunk_size]}
    else:
        items = list(data)
        for start in range(0, len(items), chunk_size):
            yield items[start:start + chunk_size]

# Materialise the generator so the chunks can be counted and reused later.
chunks = [*chunk_data(api_data['customer_journeys'], 100)]
print(f"Total Chunks: {len(chunks)}")

In [None]:
# Report how many customer journeys each chunk holds (numbered from 1).
for i, chunk in enumerate(chunks, start=1):
    journey_count = len(chunk)
    print(f"Chunk {i}: {journey_count} customer journeys")

In [None]:
import requests
import json

# Endpoint that the requests below POST to; the conversion type is passed as
# the conv_type_id query parameter.
api_url = "https://api.ihc-attribution.com/v1/compute_ihc?conv_type_id=data_challenge"
# Key imported from the local api_key module; sent in the x-api-key header.
api_key = IHC_API_KEY

In [None]:
# Single hand-written session used to check the endpoint and credentials
# before the full data set is sent.
test_data = [
    {
        "conversion_id": "00179676da577b9b60ddf4b5dc8af836d4314f74b83aa277a8549254d85aa82f",
        "session_id": "7da58cbf848ffde3f453d0abec375d9ceeb8b948ed9122d4d94bb63fcc9cec49",
        "timestamp": "2023-09-01 13:01:28",
        "channel_label": "FB & IG Ads",
        "holder_engagement": 1,
        "closer_engagement": 1,
        "conversion": 1,
        "impression_interaction": 0
    }
]

# Same body shape as the real requests: the journeys plus the redistribution
# parameter taken from the loaded payload.
test_request = {
    "customer_journeys": test_data,
    "redistribution_parameter": api_data["redistribution_parameter"]
}

response = requests.post(
    api_url,
    data=json.dumps(test_request),
    headers={"Content-Type": "application/json", "x-api-key": api_key},
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    timeout=30,
)

print(f"Status Code: {response.status_code}")
try:
    response_data = response.json()
    print(json.dumps(response_data, indent=4))
except ValueError:
    # ValueError is the common base of json.JSONDecodeError, the simplejson
    # variant and requests' own JSONDecodeError, whichever requests raises.
    print("Invalid JSON response")

In [None]:
# Send every chunk to the API and collect the returned rows.
processed_results = []
# 1-based numbers of chunks that could not be processed, so an incomplete run
# is visible before anything is written to the database.
failed_chunks = []

# The headers are identical for every request; build them once.
request_headers = {"Content-Type": "application/json", "x-api-key": api_key}

# Iterate through chunks
for idx, chunk in enumerate(chunks, start=1):
    print(f"Processing Chunk {idx}/{len(chunks)}...")

    body = {
        "customer_journeys": chunk,
        "redistribution_parameter": api_data["redistribution_parameter"]
    }

    try:
        response = requests.post(
            api_url,
            data=json.dumps(body),
            headers=request_headers,
            # Without a timeout a stalled connection would hang the loop.
            timeout=60,
        )
        response.raise_for_status()
        response_data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to process Chunk {idx}: {e}")
        failed_chunks.append(idx)
        continue
    except ValueError as e:
        # Body was not valid JSON (older requests versions raise plain
        # ValueError subclasses here rather than a RequestException).
        print(f"Failed to process Chunk {idx}: invalid JSON response ({e})")
        failed_chunks.append(idx)
        continue

    if not isinstance(response_data, dict):
        print(f"Failed to process Chunk {idx}: unexpected response payload")
        failed_chunks.append(idx)
        continue

    # .get() keeps a response without this key from aborting the whole run.
    print(f"Chunk {idx} Status Code: {response_data.get('statusCode')}")

    if "value" in response_data:
        processed_results.extend(response_data["value"])
        print(f"Sessions Processed in Chunk {idx}: {len(response_data['value'])}")

    if "partialFailureErrors" in response_data and response_data["partialFailureErrors"]:
        print(f"Errors in Chunk {idx}: {response_data['partialFailureErrors']}")

print(f"Total Sessions Processed: {len(processed_results)}")
if failed_chunks:
    print(f"Chunks that failed and are missing from the results: {failed_chunks}")

In [None]:
# Pretty-print the first three collected result rows as a sanity check.
print(json.dumps(processed_results[:3], indent=4))

In [None]:
import pandas as pd

# Turn the collected API rows into a DataFrame and preview the first rows.
results_df = pd.DataFrame(data=processed_results)
preview = results_df.head()
print(preview)

In [None]:
# Rename the API's "conversion_id" to "conv_id", the key used for the
# conversions merge and the column selection further down.
results_df = results_df.rename(columns={"conversion_id": "conv_id"})

In [None]:
# Confirm the rename by showing the first rows again.
print(results_df.head())

In [None]:
# Keep only the three columns that are written to the attribution table.
results_df = results_df[["conv_id", "session_id", "ihc"]]

In [None]:
# Write the attribution rows to SQLite. A plain append would store the same
# rows again each time this cell is re-run, which inflates every later IHC sum,
# so rows for the conversions being written are removed first.
table_exists = conn.execute(
    "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?",
    ("attribution_customer_journey",),
).fetchone() is not None

try:
    if table_exists:
        # .tolist() converts the values to plain Python objects for sqlite3.
        conn.executemany(
            "DELETE FROM attribution_customer_journey WHERE conv_id = ?",
            [(conv_id,) for conv_id in results_df["conv_id"].unique().tolist()],
        )
    results_df.to_sql("attribution_customer_journey", conn, if_exists="append", index=False)
    conn.commit()
except Exception:
    # Undo the deletes if the insert failed, so no attribution data is lost.
    conn.rollback()
    raise

# Read a few rows back to confirm the insert.
query = "SELECT * FROM attribution_customer_journey LIMIT 5;"
inserted_data = pd.read_sql_query(query, conn)
print(inserted_data)

In [None]:
# Pull the four tables needed for the report into DataFrames.
session_sources_df = pd.read_sql_query("SELECT * FROM session_sources", conn)
session_costs_df = pd.read_sql_query("SELECT * FROM session_costs", conn)
conversions_df = pd.read_sql_query("SELECT * FROM conversions", conn)
attribution_journey_df = pd.read_sql_query("SELECT * FROM attribution_customer_journey", conn)

# Join them in one chain:
#   1. sessions + costs (left join keeps sessions that have no cost row),
#   2. + attribution rows (inner join keeps only attributed sessions),
#   3. + conversions (inner join on the conversion id).
merged_df = (
    session_sources_df
    .merge(session_costs_df, on="session_id", how="left")
    .merge(attribution_journey_df, on="session_id", how="inner")
    .merge(conversions_df, on="conv_id", how="inner")
)

print(merged_df.head())

In [None]:
# Build the per-channel, per-day report: cost, attributed IHC, attributed
# revenue, plus the derived CPO and ROAS metrics.
group_keys = ["channel_name", "event_date"]

# Attributed revenue per row (IHC share times conversion revenue), computed up
# front so the aggregation does not have to index back into merged_df.
attributed = merged_df.assign(ihc_revenue=merged_df["ihc"] * merged_df["revenue"])
attribution_totals = (
    attributed
    .groupby(group_keys)
    .agg(
        total_ihc=("ihc", "sum"),
        total_ihc_revenue=("ihc_revenue", "sum"),
    )
    .reset_index()
)

# merged_df has one row per (session, conversion) pair, so a session that is
# part of several journeys would have its cost summed several times. Count each
# session's cost once.
# NOTE(review): sessions with no attribution row were dropped by the inner join
# that built merged_df, so their cost is not included here - confirm intended.
cost_totals = (
    merged_df
    .drop_duplicates(subset="session_id")
    .groupby(group_keys)
    .agg(total_cost=("cost", "sum"))
    .reset_index()
)

# Both frames are grouped from the same rows and keys; merging in this order
# yields the columns keys, total_cost, total_ihc, total_ihc_revenue.
channel_reporting = cost_totals.merge(attribution_totals, on=group_keys)

# Cost per order. A zero IHC total becomes NaN so the division cannot give inf.
nonzero_ihc = channel_reporting["total_ihc"].where(channel_reporting["total_ihc"] != 0)
channel_reporting["CPO"] = channel_reporting["total_cost"] / nonzero_ihc

# Skip 'Organic Traffic' and 'Direct Traffic' from ROAS calculations
unpaid_channels = {'Organic Traffic', 'Direct Traffic'}
channel_reporting["ROAS"] = channel_reporting.apply(
    lambda row: 'N/A' if row["channel_name"] in unpaid_channels or row["total_cost"] == 0
    else round(row["total_ihc_revenue"] / row["total_cost"], 2),
    axis=1,
)

channel_reporting["CPO"] = channel_reporting["CPO"].round(2)

# Persist the report, replacing the table written by any previous run.
channel_reporting.to_sql("channel_reporting", conn, if_exists="replace", index=False)

print(channel_reporting.head())

In [None]:
# Export the report to a CSV file in the working directory, without the index.
channel_reporting.to_csv("channel_reporting.csv", index=False)

In [None]:
import pandas as pd

# Read back the CSV written by the export cell. The relative path matches that
# to_csv() call, so the notebook also runs on machines other than the author's.
file_path = "channel_reporting.csv"
df = pd.read_csv(file_path)

print(df.head())

#### This scatter plot shows the relationship between total cost and total revenue for each marketing channel. Channels with high total costs but low revenues cluster towards the bottom; they represent inefficient spending, where more cost is incurred but revenue does not scale significantly.
#### Channels with high revenue and moderate costs show efficient marketing spend: they are high-performing channels that generate revenue with a reasonable investment.

In [None]:
import plotly.express as px
# Total IHC per channel, largest first.
ihc_by_channel = (
    df.groupby('channel_name')
    .agg(total_ihc=('total_ihc', 'sum'))
    .sort_values(by='total_ihc', ascending=False)
    .reset_index()
)
ihc_axis_labels = {"total_ihc": "Total IHC", "channel_name": "Marketing Channel"}

# One bar per channel, in the sorted order above.
fig = px.bar(
    ihc_by_channel,
    x='channel_name',
    y='total_ihc',
    title="IHC Distribution Across Channels",
    labels=ihc_axis_labels,
)

fig.show()

#### This bar chart shows how total IHC (the combined Initializer, Holder, and Closer attribution) is distributed across channels.
#### Performance Max dominates the chart with the highest total IHC value, indicating that this channel contributes significantly across the Initializer, Holder, and Closer stages of the customer journey.
#### Channels like TikTok Ads and Microsoft Ads contribute less to the IHC, suggesting they have a smaller impact on driving customers through these stages.

In [None]:
# Mean of the daily CPO values per channel, highest first.
cpo_by_channel = (
    df.groupby('channel_name')
    .agg(CPO=('CPO', 'mean'))
    .sort_values(by='CPO', ascending=False)
    .reset_index()
)
cpo_axis_labels = {"CPO": "Cost Per Order (€)", "channel_name": "Marketing Channel"}

# One bar per channel, in the sorted order above.
fig = px.bar(
    cpo_by_channel,
    x='channel_name',
    y='CPO',
    title="Cost Per Order Across Channels",
    labels=cpo_axis_labels,
)

fig.show()

#### This chart shows the cost per order (CPO) for each channel.
#### TikTok Ads and FB & IG Ads have the highest CPO, indicating that acquiring a customer through them is more expensive than through channels such as Referral or Social Organic, which have lower CPO values.
#### Channels with higher CPO may need optimization, either by reducing costs or by increasing conversion efficiency.

In [None]:
# Average the daily ROAS per channel for plotting. The numeric conversion and
# row filtering are done on a derived frame: the shared channel_reporting frame
# must not be overwritten and lose rows merely because a chart was drawn.
roas_numeric = pd.to_numeric(channel_reporting["ROAS"], errors='coerce')

channel_roas = (
    channel_reporting
    .assign(ROAS=roas_numeric)
    # 'N/A' entries were coerced to NaN above; leave them out of the mean.
    .dropna(subset=["ROAS"])
    .groupby('channel_name')
    .agg(ROAS=('ROAS', 'mean'))
    .reset_index()
    .sort_values(by='ROAS', ascending=False)
)

fig = px.bar(
    channel_roas,
    x='channel_name',
    y='ROAS',
    title="Return on Ad Spend (ROAS) by Channel",
    labels={"ROAS": "ROAS", "channel_name": "Marketing Channel"}
)

fig.show()

#### The chart shows ROAS (Return on Ad Spend) for each channel.
#### Microsoft Ads and paid search have higher ROAS, suggesting that these channels are highly efficient in generating revenue relative to the cost.
#### TikTok Ads and FB & IG Ads have lower ROAS, suggesting lower revenue generation per euro spent on these channels, which may require further analysis and adjustment.

In [None]:
# Parse the dates so the x-axis is a real time axis.
df['event_date'] = pd.to_datetime(df['event_date'])

# df holds one row per (channel, date). Plotting it directly joins every
# channel's rows into one trace that jumps back in time, so sum the attributed
# revenue per date first to get the total the title refers to.
daily_revenue = (
    df.groupby('event_date', as_index=False)['total_ihc_revenue'].sum()
)

# Line plot
fig = px.line(
    daily_revenue,
    x='event_date',
    y='total_ihc_revenue',
    title="Total Revenue Over Time",
    labels={"event_date": "Event Date", "total_ihc_revenue": "Total Revenue (€)"}
)
fig.update_traces(mode='lines+markers')
fig.show()

#### This line chart shows how total revenue fluctuated over time.
#### Revenue spikes on certain dates (for example, Sept 4, 2023) may reflect specific campaigns or events that led to higher conversions, such as targeted marketing pushes or seasonal trends.

In [None]:
# Bar chart of cost per channel, drawn from the per-day rows of df and coloured
# by each row's cost on the Viridis scale.
cost_axis_labels = {"channel_name": "Channel Name", "total_cost": "Total Cost (€)"}

fig = px.bar(
    data_frame=df,
    x='channel_name',
    y='total_cost',
    color='total_cost',
    color_continuous_scale='Viridis',
    title="Total Marketing Costs by Channel",
    labels=cost_axis_labels,
)
# Order the channels by their summed bar height, largest first.
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()

#### The graph shows that Performance Max is the highest spender in terms of total marketing costs, with the bar extending close to 500 EUR. This suggests that a significant portion of the marketing budget is being allocated here.

In [None]:
# Per-channel totals: summed attributed revenue and mean daily ROAS.
channel_totals = df.groupby('channel_name').agg(
    total_revenue=('total_ihc_revenue', 'sum'),
    ROAS=('ROAS', 'mean'),
)

# Keep the ten channels with the highest revenue, in descending order.
top_performing_channels = channel_totals.sort_values(
    by='total_revenue', ascending=False
).head(10)

revenue_axis_labels = {"total_revenue": "Total Revenue (€)", "channel_name": "Marketing Channel"}

# The channel names live in the index, which is passed as the x values.
fig = px.bar(
    top_performing_channels,
    x=top_performing_channels.index,
    y='total_revenue',
    title="Top 10 Channels by Total Revenue",
    labels=revenue_axis_labels,
)
fig.show()

#### Direct Traffic and Newsletter appear to be the most successful channels in terms of revenue generation. This might suggest that organic or direct interactions with the brand, such as returning visitors or email marketing campaigns, are key contributors.


In [None]:
# Print the row count of every table in the database as a final check.
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(query).fetchall()

for (table_name,) in tables:
    # Table names cannot be bound as SQL parameters. Quote the identifier
    # (doubling embedded quotes) so names with spaces or keywords still work.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    count_query = f"SELECT COUNT(*) FROM {quoted_name}"
    row_count = conn.execute(count_query).fetchone()[0]
    print(f"Table: {table_name}, Row count: {row_count}")
