In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from datetime import datetime, timedelta

In [None]:
# Load the four raw platform exports (paths are relative to the notebook's working directory).
meta_df = pd.read_parquet('Platform/meta.parquet')
snapchat_df = pd.read_parquet('Platform/snapchat.parquet')
tiktok_df = pd.read_parquet('Platform/tiktok.parquet')
youtube_df = pd.read_parquet('Platform/youtube.parquet')

# Basic information.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line after the report.
print("Meta Dataset Info:")
meta_df.info()
print("\nMeta Dataset Sample:")
print(meta_df.head())

print("\nSnapchat Dataset Info:")
snapchat_df.info()
print("\nSnapchat Dataset Sample:")
print(snapchat_df.head())

Meta Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2109884 entries, 0 to 2109883
Data columns (total 11 columns):
 #   Column             Dtype         
---  ------             -----         
 0   campaign_id        object        
 1   Startdatum         datetime64[ms]
 2   Endedatum          datetime64[ms]
 3   advertiser_name    object        
 4   impressions        int64         
 5   clicks             int64         
 6   trueview_views     int64         
 7   video_completions  int64         
 8   date               datetime64[ms]
 9   advertiser_id      object        
 10  device_type        object        
dtypes: datetime64[ms](3), int64(4), object(4)
memory usage: 177.1+ MB
None

Meta Dataset Sample:
  campaign_id Startdatum  Endedatum advertiser_name  impressions  clicks  \
0    52045241 2017-10-01 2017-10-04        HMFRA_HH        10289      24   
1    52045241 2017-10-01 2017-10-04        HMFRA_HH         1117       1   
2    52045241 2017-10-01 2017-10-04

In [64]:
# Distinct advertiser names per platform. Each heading carries its own leading
# newline so the printed layout is unchanged. Only the resulting arrays are kept,
# not references to the frames themselves.
advertisers_by_heading = {
    "Meta Unique Advertisers:": meta_df['advertiser_name'].unique(),
    "\nSnapchat Unique Advertisers:": snapchat_df['advertiser_name'].unique(),
    "\nTikTok Unique Advertisers:": tiktok_df['advertiser_name'].unique(),
    "\nYouTube Unique Advertisers:": youtube_df['advertiser_name'].unique(),
}
for heading, advertiser_names in advertisers_by_heading.items():
    print(heading)
    print(advertiser_names)

# Number of distinct advertisers per platform.
advertiser_counts = {
    "Meta": meta_df['advertiser_name'].nunique(),
    "Snapchat": snapchat_df['advertiser_name'].nunique(),
    "TikTok": tiktok_df['advertiser_name'].nunique(),
    "YouTube": youtube_df['advertiser_name'].nunique(),
}
print("\nNumber of unique advertisers per platform:")
for platform, advertiser_count in advertiser_counts.items():
    print(f"{platform}: {advertiser_count}")

Meta Unique Advertisers:
['HMFRA_HH' 'HMFRA_HM' 'HMFRA_JD' 'HMFRA_KI' 'HMFRA_Re' 'HMFRA_TE'
 'HMFRA_NO' 'HMFRA_[D']

Snapchat Unique Advertisers:
['HMFRA_KI' 'HMFRA_Va' 'HMFRA_Bu' 'HMFRA_As' 'HMFRA_Ha' 'HMFRA_My'
 'HMFRA_AM' 'HMFRA_TE' 'HMFRA_AO']

TikTok Unique Advertisers:
['HMFRA_De' 'HMFRA_Ra' 'HMFRA_At' 'HMFRA_Ni' 'HMFRA_My' 'HMFRA_Sc'
 'HMFRA_Ki' 'HMFRA_Te' 'HMFRA_EC' 'HMFRA_LO' 'HMFRA_Ba' 'HMFRA_Ao'
 'HMFRA_vi' 'HMFRA_Ad' 'HMFRA_Vi' 'HMFRA_Lo' 'HMFRA_Sw']

YouTube Unique Advertisers:
['HMFRA_KP' 'HMFRA_TE' 'HMFRA_NI' 'HMFRA_We' 'HMFRA_AO' 'HMFRA_DE'
 'HMFRA_SC' 'HMFRA_Lo' 'HMFRA_Mc' 'HMFRA_RO' 'HMFRA_Ra' 'HMFRA_As'
 'HMFRA_Ta' 'HMFRA_Ec' 'HMFRA_Me' 'HMFRA_ST' 'HMFRA_KI' 'HMFRA_Pe'
 'HMFRA_Va']

Number of unique advertisers per platform:
Meta: 8
Snapchat: 9
TikTok: 17
YouTube: 19


In [44]:
# Structure and a five-row sample of the TikTok and YouTube frames.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df.info()) would add a stray "None" line to the output.
print("\nTikTok Dataset Info:")
tiktok_df.info()
print("\nTikTok Dataset Sample:")
print(tiktok_df.head())

print("\nYouTube Dataset Info:")
youtube_df.info()
print("\nYouTube Dataset Sample:")
print(youtube_df.head())


TikTok Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95744 entries, 0 to 95743
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   campaign_id        95744 non-null  object
 1   advertiser_name    95744 non-null  object
 2   date               95744 non-null  object
 3   clicks             95744 non-null  int64 
 4   video_completions  95744 non-null  int64 
 5   impressions        95744 non-null  int64 
 6   device_type        95744 non-null  object
 7   advertiser_id      95744 non-null  object
 8   device_type (#1)   95744 non-null  object
dtypes: int64(3), object(6)
memory usage: 6.6+ MB
None

TikTok Dataset Sample:
  campaign_id advertiser_name        date  clicks  video_completions  \
0      261569        HMFRA_De  2017-01-03       0                  0   
1      250659        HMFRA_Ra  2017-01-03       0                  0   
2      224138        HMFRA_At  2017-01-07       0         

In [45]:
# Parse the date columns with pd.to_datetime. Snapchat's dates are read from
# 'date_start' and stored in a new 'date' column so every frame exposes 'date'.
# Meta is not converted here.
snapchat_df['date'] = pd.to_datetime(snapchat_df['date_start'])
tiktok_df['date'] = pd.to_datetime(tiktok_df['date'])
youtube_df['date'] = pd.to_datetime(youtube_df['date'])

# Earliest and latest date covered by each platform.
date_ranges = {
    "Meta date range:": (meta_df['date'].min(), meta_df['date'].max()),
    "Snapchat date range:": (snapchat_df['date'].min(), snapchat_df['date'].max()),
    "TikTok date range:": (tiktok_df['date'].min(), tiktok_df['date'].max()),
    "YouTube date range:": (youtube_df['date'].min(), youtube_df['date'].max()),
}
for label, (first_day, last_day) in date_ranges.items():
    print(label, first_day, "to", last_day)

Meta date range: 2017-01-08 00:00:00 to 2017-12-31 00:00:00
Snapchat date range: 2017-01-01 00:00:00 to 2017-12-31 00:00:00
TikTok date range: 2017-01-03 00:00:00 to 2017-12-31 00:00:00
YouTube date range: 2017-01-18 00:00:00 to 2017-12-31 00:00:00


In [46]:
# Align YouTube's column names with the names used by the other platform frames.
# NOTE(review): the key 'account_name=advertiser_name' looks like an annotation
# rather than a real column name; confirm it against the raw YouTube schema.
# rename() skips keys that are not existing columns (errors='ignore' is the
# default), so a wrong key leaves the frame unchanged without raising.
youtube_column_aliases = {
    'account_name=advertiser_name': 'advertiser_name',
    'line_item_id': 'campaign_id',
}
youtube_df = youtube_df.rename(columns=youtube_column_aliases)
print(youtube_df.columns.tolist())

['date', 'impressions', 'clicks', 'advertiser_name', 'campaign_id', 'video_completions', 'advertiser_id', 'device_type']


In [47]:
# Total number of null cells in each frame (sum over columns, then over the column totals).
missing_totals = {
    "Meta": meta_df.isnull().sum().sum(),
    "Snapchat": snapchat_df.isnull().sum().sum(),
    "TikTok": tiktok_df.isnull().sum().sum(),
    "YouTube": youtube_df.isnull().sum().sum(),
}
for platform, missing_total in missing_totals.items():
    print(f"Missing values in {platform}:", missing_total)

# Number of rows that repeat an earlier row in every column.
duplicate_totals = {
    "Meta": meta_df.duplicated().sum(),
    "Snapchat": snapchat_df.duplicated().sum(),
    "TikTok": tiktok_df.duplicated().sum(),
    "YouTube": youtube_df.duplicated().sum(),
}
for platform, duplicate_total in duplicate_totals.items():
    print(f"Duplicates in {platform}:", duplicate_total)

Missing values in Meta: 0
Missing values in Snapchat: 0
Missing values in TikTok: 0
Missing values in YouTube: 0
Duplicates in Meta: 900302
Duplicates in Snapchat: 6034
Duplicates in TikTok: 16176
Duplicates in YouTube: 991265


In [None]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence (the drop_duplicates defaults, spelled out explicitly).
# NOTE(review): none of the frames shows a row-level key, so fully identical rows
# may be genuine repeated records rather than export duplicates; confirm before
# relying on the totals computed from the de-duplicated frames.
meta_df = meta_df.drop_duplicates(subset=None, keep='first')
snapchat_df = snapchat_df.drop_duplicates(subset=None, keep='first')
tiktok_df = tiktok_df.drop_duplicates(subset=None, keep='first')
youtube_df = youtube_df.drop_duplicates(subset=None, keep='first')

# Verify: row counts that remain after de-duplication.
remaining_rows = {
    "Meta rows after:": len(meta_df),
    "Snapchat rows after:": len(snapchat_df),
    "TikTok rows after:": len(tiktok_df),
    "YouTube rows after:": len(youtube_df),
}
for label, row_count in remaining_rows.items():
    print(label, row_count)

Meta rows after: 1209582
Snapchat rows after: 32677
TikTok rows after: 66118
YouTube rows after: 925142


In [None]:
# Distinct values of TikTok's two device columns; unique() lists them in order
# of first appearance.
device_values_by_label = {
    "device_type values:": tiktok_df['device_type'].unique(),
    "device_type (#1) values:": tiktok_df['device_type (#1)'].unique(),
}
for label, device_values in device_values_by_label.items():
    print(label, device_values)

device_type values: ['Smartphone' 'Desktop' 'Tablet' 'SmartTV']
device_type (#1) values: ['Desktop' 'Tablet' 'SmartTV' 'Smartphone']


In [None]:
# TikTok ships two device columns. They list the same four categories, but
# unique() returns values in order of first appearance and that order differs
# between the two columns, so they do not match row for row. Report how many
# rows disagree before discarding the secondary column.
# NOTE(review): confirm that 'device_type' is the authoritative column.
# The membership guard keeps the cell re-runnable: once the column is gone, a
# second run skips this step instead of raising KeyError.
if 'device_type (#1)' in tiktok_df.columns:
    conflicting_rows = (tiktok_df['device_type'] != tiktok_df['device_type (#1)']).sum()
    print("Rows where TikTok device_type and device_type (#1) differ:", conflicting_rows)
    tiktok_df = tiktok_df.drop(columns=['device_type (#1)'])

# Rename Snapchat's video_views_p100 to video_completions, the column name the
# other platform frames use.
snapchat_df = snapchat_df.rename(columns={'video_views_p100': 'video_completions'})
print("TikTok columns:", tiktok_df.columns.tolist())

TikTok columns: ['campaign_id', 'advertiser_name', 'date', 'clicks', 'video_completions', 'impressions', 'device_type', 'advertiser_id']


In [51]:
# Column totals per platform; Meta additionally reports trueview_views.
video_totals = {
    "Meta video_completions total:": meta_df['video_completions'].sum(),
    "Meta trueview_views total:": meta_df['trueview_views'].sum(),
    "Snapchat video_completions total:": snapchat_df['video_completions'].sum(),
    "TikTok video_completions total:": tiktok_df['video_completions'].sum(),
    "YouTube video_completions total:": youtube_df['video_completions'].sum(),
}
for label, total in video_totals.items():
    print(label, total)

Meta video_completions total: 5645537682
Meta trueview_views total: 1479466715
Snapchat video_completions total: 66116156
TikTok video_completions total: 29882462
YouTube video_completions total: 194741321


In [53]:
# Preview the first five YouTube rows (head() defaults to n=5).
youtube_df.head()

Unnamed: 0,date,impressions,clicks,advertiser_name,campaign_id,video_completions,advertiser_id,device_type
0,2017-01-18,5,0,HMFRA_KP,20096377,0,7057172,Desktop
1,2017-01-29,1,0,HMFRA_KP,20096442,0,7057172,Tablet
2,2017-01-26,23,0,HMFRA_KP,20096464,0,7057172,Smartphone
3,2017-01-19,1,0,HMFRA_KP,20096464,0,7057172,SmartTV
4,2017-01-26,0,1,HMFRA_KP,20093556,0,7057172,Tablet


In [54]:
# Preview the first five TikTok rows (head() defaults to n=5).
tiktok_df.head()

Unnamed: 0,campaign_id,advertiser_name,date,clicks,video_completions,impressions,device_type,advertiser_id
0,261569,HMFRA_De,2017-01-03,0,0,1,Smartphone,15336
1,250659,HMFRA_Ra,2017-01-03,0,0,1,Desktop,8738821
2,224138,HMFRA_At,2017-01-07,0,0,2,Desktop,12747
3,224138,HMFRA_At,2017-01-07,0,0,1,Desktop,12747
4,267661,HMFRA_Ni,2017-01-12,0,0,162,Desktop,13346


In [55]:
# Preview the first five Snapchat rows (head() defaults to n=5).
snapchat_df.head()

Unnamed: 0,date_start,advertiser_name,video_completions,impressions,campaign_id,clicks,advertiser_id,device_type,date
0,2017-10-01,HMFRA_KI,0,4,437701,0,7676909,Smartphone,2017-10-01
1,2017-10-01,HMFRA_KI,0,4,437701,0,7676909,Tablet,2017-10-01
2,2017-10-01,HMFRA_KI,0,2,437701,0,7676909,Tablet,2017-10-01
3,2017-10-01,HMFRA_KI,0,2,437701,0,7676909,Desktop,2017-10-01
4,2017-10-01,HMFRA_KI,0,6,437701,1,7676909,Smartphone,2017-10-01


In [33]:
# Preview the first five Meta rows (head() defaults to n=5).
meta_df.head()

Unnamed: 0,campaign_id,Startdatum,Endedatum,advertiser_name,impressions,clicks,trueview_views,video_completions,date,advertiser_id,device_type
0,52045241,2017-10-01,2017-10-04,HMFRA_HH,10289,24,0,9524,2017-10-04,843621277,SmartTV
1,52045241,2017-10-01,2017-10-04,HMFRA_HH,1117,1,0,1032,2017-10-04,843621277,Tablet
2,52045241,2017-10-01,2017-10-04,HMFRA_HH,25286,32,0,23012,2017-10-04,843621277,Desktop
3,52045241,2017-10-01,2017-10-04,HMFRA_HH,40988,80,0,37803,2017-10-04,843621277,Tablet
4,52045241,2017-10-01,2017-10-04,HMFRA_HH,4447,16,0,4108,2017-10-04,843621277,SmartTV


In [None]:
# Structural summary of each frame; info() prints its own report.
tiktok_df.info()
youtube_df.info()
snapchat_df.info()
meta_df.info()

# Column dtypes per platform. Headings carry their own leading newline so the
# printed layout is unchanged.
dtypes_by_heading = {
    "Meta Data Types:": meta_df.dtypes,
    "\nSnapchat Data Types:": snapchat_df.dtypes,
    "\nTikTok Data Types:": tiktok_df.dtypes,
    "\nYouTube Data Types:": youtube_df.dtypes,
}
for heading, column_types in dtypes_by_heading.items():
    print(heading)
    print(column_types)

<class 'pandas.core.frame.DataFrame'>
Index: 66118 entries, 0 to 95743
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   campaign_id        66118 non-null  object        
 1   advertiser_name    66118 non-null  object        
 2   date               66118 non-null  datetime64[ns]
 3   clicks             66118 non-null  int64         
 4   video_completions  66118 non-null  int64         
 5   impressions        66118 non-null  int64         
 6   device_type        66118 non-null  object        
 7   advertiser_id      66118 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 4.5+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 925142 entries, 0 to 1916406
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   date               925142 non-null  datetime64[ns]
 1  

### Data Integration

In [None]:
# Columns every platform frame contributes to the combined table used for visualizations.
visualization_metrics = ['campaign_id', 'date', 'impressions', 'clicks', 'video_completions', 'device_type', 'advertiser_name']

# Source column: tag each frame with its platform so rows stay attributable
# once the frames are stacked.
meta_df['source'] = 'Meta'
snapchat_df['source'] = 'Snapchat'
tiktok_df['source'] = 'TikTok'
youtube_df['source'] = 'YouTube'

# Cast Meta's dates to datetime64[ns] before combining.
meta_df['date'] = meta_df['date'].astype('datetime64[ns]')

# Stack the shared columns of all four platforms into one table for platform
# comparisons; ignore_index gives the result a fresh 0..n-1 index.
combined_columns = visualization_metrics + ['source']
platform_data = pd.concat(
    [frame[combined_columns] for frame in (meta_df, snapchat_df, tiktok_df, youtube_df)],
    ignore_index=True,
)

# Verify
print("Total rows in platform_data:", platform_data.shape[0])
print("Columns in platform_data:", list(platform_data.columns))
print(platform_data.head())

Total rows in platform_data: 2233519
Columns in platform_data: ['campaign_id', 'date', 'impressions', 'clicks', 'video_completions', 'device_type', 'advertiser_name', 'source']
  campaign_id       date  impressions  clicks  video_completions device_type  \
0    52045241 2017-10-04        10289      24               9524     SmartTV   
1    52045241 2017-10-04         1117       1               1032      Tablet   
2    52045241 2017-10-04        25286      32              23012     Desktop   
3    52045241 2017-10-04        40988      80              37803      Tablet   
4    52045241 2017-10-04         4447      16               4108     SmartTV   

  advertiser_name source  
0        HMFRA_HH   Meta  
1        HMFRA_HH   Meta  
2        HMFRA_HH   Meta  
3        HMFRA_HH   Meta  
4        HMFRA_HH   Meta  


In [None]:
# Write the combined platform table to CSV for the Task 1 visualizations.
# index=False leaves the row index out of the file.
platform_data.to_csv(path_or_buf='platform_data.csv', index=False)

In [63]:
# Write the cleaned Meta frame to CSV without the row index.
# The file contains whatever columns meta_df holds when this cell runs; run
# top-to-bottom, that includes the 'source' column set in the integration cell.
meta_df.to_csv(path_or_buf='cleaned_meta.csv', index=False)