In [3]:
import pandas as pd

# Task 1: Data cleaning

## 1. Identifies duplicates and remove
## 2. Validates foreign key relationships

In [47]:
campaigns = pd.read_csv('campaigns.csv', index_col=False)
conversions = pd.read_csv('conversions.csv', index_col=False)
impressions = pd.read_csv('impressions.csv', index_col=False)

In [48]:
# 1. Identify duplicate records
duplicate_campaigns = campaigns[campaigns.duplicated(keep=False)]
duplicate_conversions = conversions[conversions.duplicated(keep=False)]
duplicate_impressions = impressions[impressions.duplicated(keep=False)]

duplicate_records = pd.concat([duplicate_campaigns, duplicate_conversions, duplicate_impressions])
print(duplicate_records)

    campaign_id campaign_name channel start_date end_date  \
9       CAMP107           NaN     NaN        NaN      NaN   
10      CAMP102           NaN     NaN        NaN      NaN   
11      CAMP108           NaN     NaN        NaN      NaN   
12      CAMP102           NaN     NaN        NaN      NaN   
67      CAMP106           NaN     NaN        NaN      NaN   
..          ...           ...     ...        ...      ...   
513     CAMP108           NaN     NaN        NaN      NaN   
514     CAMP104           NaN     NaN        NaN      NaN   
515     CAMP104           NaN     NaN        NaN      NaN   
516     CAMP106           NaN     NaN        NaN      NaN   
517     CAMP106           NaN     NaN        NaN      NaN   

                            conversion_id  user_id  conversion_value  \
9    de7e697e-7429-417c-9c0e-6af270da67b9     38.0            290.25   
10   d946de16-ca45-4c5c-a8a7-cc2b4d0d76a1     15.0            101.57   
11   35a32b7b-a6c8-4dbc-81b7-68b7ea4df330     34.0 

In [49]:
# Remove duplicates from each DataFrame
campaigns = campaigns.drop_duplicates()
conversions = conversions.drop_duplicates()
impressions = impressions.drop_duplicates()

In [50]:
# 2. Validate foreign key relationships
valid_campaign_ids = campaigns['campaign_id'].unique()
invalid_conversions = conversions[~conversions['campaign_id'].isin(valid_campaign_ids)]
invalid_impressions = impressions[~impressions['campaign_id'].isin(valid_campaign_ids)]
invalid_impressions, invalid_conversions

(Empty DataFrame
 Columns: [impression_id, user_id, campaign_id, impression_ts]
 Index: [],
 Empty DataFrame
 Columns: [conversion_id, user_id, campaign_id, conversion_value, conversion_ts]
 Index: [])

# Task 2 
## 1. Keep only conversion within 30 mins & must be done in campagin duration.
## 2. Conversion must be done in campagin duration. 

In [51]:
merged_df = pd.merge(conversions, impressions, on=['campaign_id', 'user_id'])
merged_df["conversion_ts"] = pd.to_datetime(merged_df["conversion_ts"])
merged_df["impression_ts"] = pd.to_datetime(merged_df["impression_ts"])

merged_df["time_diff"] = ((merged_df["conversion_ts"] - merged_df["impression_ts"]).dt.total_seconds())/60.0
merged_30 = merged_df[(merged_df["time_diff"] >= 0) & (merged_df["time_diff"] <=30)]

merged_30 = merged_30.merge(campaigns, on="campaign_id")
merged_30 = merged_30[
    (merged_30["conversion_ts"] >= merged_30["start_date"]) &
    (merged_30["conversion_ts"] <= merged_30["end_date"])
]

# Final output
final_df = merged_30[[
    "conversion_id",
    "user_id",
    "campaign_name",
    "channel",
    "conversion_value",
    "conversion_ts",
    "time_diff"
]]

final_df

  merged_df["conversion_ts"] = pd.to_datetime(merged_df["conversion_ts"])
  merged_df["impression_ts"] = pd.to_datetime(merged_df["impression_ts"])


Unnamed: 0,conversion_id,user_id,campaign_name,channel,conversion_value,conversion_ts,time_diff
0,e3f3085a-9da8-41d3-bb1f-8dca4bc925b4,97,Campaign_7,Social,209.46,2025-03-22 12:26:00,17.0
1,569267d1-8892-4a5d-a488-44c5342b60d5,31,Campaign_4,Email,235.47,2025-03-24 20:54:00,15.0
2,16dbf663-5424-4b86-a8d8-2fd198209d52,48,Campaign_8,Social,371.93,2025-04-11 17:22:00,5.0
3,7d6d3660-d451-4526-a759-4a4582bcd5ea,18,Campaign_3,Social,494.39,2025-03-26 16:14:00,12.0
4,d946de16-ca45-4c5c-a8a7-cc2b4d0d76a1,15,Campaign_2,Display,101.57,2025-03-14 00:12:00,11.0
...,...,...,...,...,...,...,...
94,f256e5c5-4466-4912-8d4d-a0526ab3e2e1,39,Campaign_9,OOH,27.51,2025-04-01 08:02:00,15.0
95,a6881a5d-89ef-44f1-92ad-c847ca0f3e30,71,Campaign_5,OOH,184.24,2025-03-30 22:08:00,18.0
96,1aa72823-984c-474e-8d6a-0b68f33ce1b5,46,Campaign_4,Email,13.50,2025-03-13 13:03:00,25.0
97,c15ff915-c49d-47f0-a967-ce7f10a71104,45,Campaign_5,OOH,411.05,2025-03-20 01:41:00,24.0


# Task 3

## A conversion is only valid if:

### 1. The conversion_ts is within 45 minutes of the impression_ts.

### 2. Only the first valid conversion per user per campaign should be counted.

### 3. Average time of conversion per campagins should be in minutes (rounded to 2 decimal places).

In [52]:
merged_45 = merged_df[(merged_df["time_diff"] >= 0) & (merged_df["time_diff"] <=45)]
merged_45 = merged_45.sort_values(by="conversion_ts")
merged_45 = merged_45.drop_duplicates(subset=["user_id","campaign_id"], keep="first")
merged_45

Unnamed: 0,conversion_id,user_id,campaign_id,conversion_value,conversion_ts,impression_id,impression_ts,time_diff
161,045f4f73-5de9-46ab-a095-268837598e94,96,CAMP100,290.29,2025-03-01 05:34:00,8a5a473e-2532-422d-a3d6-a6dbc149f33e,2025-03-01 05:25:00,9.0
169,e290afb5-5fb5-4236-8959-4f9cb009249d,97,CAMP100,395.47,2025-03-01 08:44:00,80a8febf-9d5d-41e6-993d-80a27fef6fe8,2025-03-01 08:16:00,28.0
64,9e5c7a1d-0a83-4e67-a3ca-1eb5097be349,46,CAMP100,495.26,2025-03-01 13:40:00,45bd2f1b-8499-4c25-99cc-1d84185482a1,2025-03-01 13:08:00,32.0
3,e12fe5e1-6dc0-4214-8afc-9e51ef61bc22,40,CAMP100,124.86,2025-03-02 04:38:00,e860dc10-f741-4be4-9cb9-bc9d1c8e8f62,2025-03-02 03:54:00,44.0
180,7b92677d-0cb9-47ac-bb8f-4282492b8b5e,83,CAMP100,331.76,2025-03-04 00:23:00,71db062b-231c-4dce-8f81-6fc8811b356c,2025-03-04 00:15:00,8.0
...,...,...,...,...,...,...,...,...
218,c3dac205-7d78-43c0-af9d-e0f54ebf89f8,75,CAMP109,178.41,2025-04-14 04:38:00,9275f7ea-ca72-44fe-b6c9-86c92f785cc9,2025-04-14 03:54:00,44.0
47,6b8c2fe7-03df-4884-a306-903e8f11ee70,28,CAMP109,224.10,2025-04-14 05:00:00,553afcc4-d16b-4df3-96ab-4d7cf5e5607b,2025-04-14 04:33:00,27.0
20,2bf8cc19-60a5-458f-a20e-fbec1db56dc9,72,CAMP109,103.81,2025-04-14 18:37:00,d21c0825-7395-40a5-b2bc-03fc7d1b4701,2025-04-14 17:56:00,41.0
42,9b0aa8f7-df91-4cdd-9d35-bf3a20e0017b,69,CAMP109,44.67,2025-04-14 22:14:00,8cf7008c-0bb4-4e26-8b5f-2209273824a4,2025-04-14 22:01:00,13.0


In [46]:
avg_time_campaign = merged_45.groupby("campaign_id")["time_diff"].mean().reset_index()
avg_time_campaign["time_diff"] = avg_time_campaign["time_diff"].round(2)
avg_time_campaign

Unnamed: 0,campaign_id,time_diff
0,CAMP100,26.0
1,CAMP101,20.0
2,CAMP102,17.61
3,CAMP103,19.83
4,CAMP104,21.0
5,CAMP105,25.25
6,CAMP106,29.87
7,CAMP107,24.31
8,CAMP108,22.67
9,CAMP109,29.57
