In [12]:
# importar la spark session
from pyspark.sql import SparkSession

# crar sessio.
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [13]:
campaign_df = spark.read.csv("./inputs/campaigns.csv", header=True, inferSchema=True)
campaign_df.show(truncate=False)

+-----------+----------------------------+
|Campaign_ID|Campaign_Name               |
+-----------+----------------------------+
|1001       |Campaign-1-Brand B-Awareness|
|1002       |Campaign-2-Brand A-Launch   |
|1003       |Campaign-3-Brand C-Awareness|
|1004       |Campaign-4-Brand B-Awareness|
|1005       |Campaign-5-Brand B-Awareness|
+-----------+----------------------------+



In [14]:
from pyspark.sql.functions import col, concat, lit, split

# SPLIT Nombre_Campana in 4 columns splited by - using split function and col function
campaign_df = campaign_df.withColumn("prefijo", split(campaign_df["Campaign_Name"], "-")[0])
campaign_df = campaign_df.withColumn("id", split(campaign_df["Campaign_Name"], "-")[1])
campaign_df = campaign_df.withColumn("marca", split(campaign_df["Campaign_Name"], "-")[2])
campaign_df = campaign_df.withColumn("tipo", split(campaign_df["Campaign_Name"], "-")[3])
# Create column campaign_name concatenating column prefijo + "_" + column id
campaign_df = campaign_df.withColumn("campaign_name", concat(col("prefijo"), lit("_") , col("id")))
campaign_df =  campaign_df.drop("Campaign_Name").drop("prefijo")
campaign_df = campaign_df.withColumnRenamed("Campaign_ID", "campaign_id")
# campaign_df withColumn "campaign_name", campaign_df["prefijo"] + "_" + campaign_df["id"]
campaign_df.show(truncate=False)

+-----------+---+-------+---------+
|campaign_id|id |marca  |tipo     |
+-----------+---+-------+---------+
|1001       |1  |Brand B|Awareness|
|1002       |2  |Brand A|Launch   |
|1003       |3  |Brand C|Awareness|
|1004       |4  |Brand B|Awareness|
|1005       |5  |Brand B|Awareness|
+-----------+---+-------+---------+



In [15]:
campaign_df.printSchema()

root
 |-- campaign_id: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- marca: string (nullable = true)
 |-- tipo: string (nullable = true)



In [16]:
group_df = spark.read.csv("./inputs/ad_groups.csv", header=True, inferSchema=True)
group_df.show(truncate=False)

+-----------+-----------+--------------+
|Ad_Group_ID|Campaign_ID|Ad_Group_Name |
+-----------+-----------+--------------+
|2001       |1002       |Group-1-Women |
|2002       |1003       |Group-2-Youth |
|2003       |1004       |Group-3-Women |
|2004       |1002       |Group-4-Women |
|2005       |1004       |Group-5-Youth |
|2006       |1003       |Group-6-Men   |
|2007       |1003       |Group-7-Women |
|2008       |1004       |Group-8-Women |
|2009       |1005       |Group-9-Youth |
|2010       |1004       |Group-10-Men  |
|2011       |1003       |Group-11-Youth|
|2012       |1005       |Group-12-Youth|
|2013       |1002       |Group-13-Youth|
|2014       |1002       |Group-14-Men  |
|2015       |1004       |Group-15-Youth|
+-----------+-----------+--------------+



In [17]:
group_df.printSchema()

root
 |-- Ad_Group_ID: integer (nullable = true)
 |-- Campaign_ID: integer (nullable = true)
 |-- Ad_Group_Name: string (nullable = true)



In [18]:
group_df = group_df.withColumnRenamed("Ad_Group_ID", "ad_group_id")
group_df = group_df.withColumnRenamed("Campaign_ID", "campaign_id")
group_df = group_df.withColumn("id", split(group_df["Ad_Group_Name"], "-")[1])
group_df = group_df.withColumn("audience", split(group_df["Ad_Group_Name"], "-")[2])
group_df = group_df.withColumn("prefijo", split(group_df["Ad_Group_Name"], "-")[0])
group_df = group_df.withColumn("ad_group_name", concat(col("prefijo"), lit("_"), col("id")))
group_df = group_df.drop("prefijo")
group_df.show(truncate=False)

+-----------+-----------+-------------+---+--------+
|ad_group_id|campaign_id|ad_group_name|id |audience|
+-----------+-----------+-------------+---+--------+
|2001       |1002       |Group_1      |1  |Women   |
|2002       |1003       |Group_2      |2  |Youth   |
|2003       |1004       |Group_3      |3  |Women   |
|2004       |1002       |Group_4      |4  |Women   |
|2005       |1004       |Group_5      |5  |Youth   |
|2006       |1003       |Group_6      |6  |Men     |
|2007       |1003       |Group_7      |7  |Women   |
|2008       |1004       |Group_8      |8  |Women   |
|2009       |1005       |Group_9      |9  |Youth   |
|2010       |1004       |Group_10     |10 |Men     |
|2011       |1003       |Group_11     |11 |Youth   |
|2012       |1005       |Group_12     |12 |Youth   |
|2013       |1002       |Group_13     |13 |Youth   |
|2014       |1002       |Group_14     |14 |Men     |
|2015       |1004       |Group_15     |15 |Youth   |
+-----------+-----------+-------------+---+---

In [19]:
ads_df = spark.read.csv("./inputs/ads.csv", header=True, inferSchema=True)
ads_df.show(truncate=False)

+-----+-----------+--------------+----------------+
|Ad_ID|Ad_Group_ID|Ad_Name       |Platform        |
+-----+-----------+--------------+----------------+
|3001 |2001       |Ad-1-Carousel |Facebook        |
|3002 |2010       |Ad-2-Image    |Instagram       |
|3003 |2004       |Ad-3-Carousel |Audience Network|
|3004 |2003       |Ad-4-Video    |Audience Network|
|3005 |2004       |Ad-5-Carousel |Audience Network|
|3006 |2012       |Ad-6-Image    |Facebook        |
|3007 |2008       |Ad-7-Carousel |Facebook        |
|3008 |2013       |Ad-8-Carousel |Facebook        |
|3009 |2011       |Ad-9-Video    |Facebook        |
|3010 |2013       |Ad-10-Image   |Audience Network|
|3011 |2002       |Ad-11-Carousel|Facebook        |
|3012 |2015       |Ad-12-Video   |Audience Network|
|3013 |2007       |Ad-13-Video   |Facebook        |
|3014 |2009       |Ad-14-Image   |Instagram       |
|3015 |2003       |Ad-15-Image   |Audience Network|
|3016 |2010       |Ad-16-Video   |Facebook        |
|3017 |2003 

In [20]:
ads_df.printSchema()

root
 |-- Ad_ID: integer (nullable = true)
 |-- Ad_Group_ID: integer (nullable = true)
 |-- Ad_Name: string (nullable = true)
 |-- Platform: string (nullable = true)



In [21]:
ads_df = ads_df.withColumnRenamed("Ad_ID", "ad_id")
ads_df = ads_df.withColumnRenamed("Ad_Group_ID", "ad_group_id")
ads_df = ads_df.withColumn("id", split(ads_df["Ad_Name"], "-")[1])
ads_df = ads_df.withColumn("prefijo", split(ads_df["Ad_Name"], "-")[0])
ads_df = ads_df.withColumn("ad_type", split(ads_df["Ad_Name"], "-")[2])
ads_df = ads_df.withColumn("ad_name", concat(col("prefijo"), lit("_"), col("id")))
ads_df = ads_df.drop("prefijo")
ads_df.show(truncate=False)

+-----+-----------+-------+----------------+---+--------+
|ad_id|ad_group_id|ad_name|Platform        |id |ad_type |
+-----+-----------+-------+----------------+---+--------+
|3001 |2001       |Ad_1   |Facebook        |1  |Carousel|
|3002 |2010       |Ad_2   |Instagram       |2  |Image   |
|3003 |2004       |Ad_3   |Audience Network|3  |Carousel|
|3004 |2003       |Ad_4   |Audience Network|4  |Video   |
|3005 |2004       |Ad_5   |Audience Network|5  |Carousel|
|3006 |2012       |Ad_6   |Facebook        |6  |Image   |
|3007 |2008       |Ad_7   |Facebook        |7  |Carousel|
|3008 |2013       |Ad_8   |Facebook        |8  |Carousel|
|3009 |2011       |Ad_9   |Facebook        |9  |Video   |
|3010 |2013       |Ad_10  |Audience Network|10 |Image   |
|3011 |2002       |Ad_11  |Facebook        |11 |Carousel|
|3012 |2015       |Ad_12  |Audience Network|12 |Video   |
|3013 |2007       |Ad_13  |Facebook        |13 |Video   |
|3014 |2009       |Ad_14  |Instagram       |14 |Image   |
|3015 |2003   

In [22]:
daily_df = spark.read.csv("./inputs/daily_data.csv", header=True, inferSchema=True)
daily_df.withColumnRenamed("Ad_ID", "ad_id")
daily_df.show(truncate=False)

+----------+-----+----------------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+
|Date      |Ad_ID|Platform        |Impressions|Clicks|Interactions|Conversions|Quartile_25|Quartile_50|Quartile_75|Completed|Spend |
+----------+-----+----------------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+
|2022-01-01|3001 |Facebook        |1419       |287   |65          |13         |486        |457        |422        |248      |99.61 |
|2022-01-01|3002 |Instagram       |4844       |1676  |3303        |1613       |1739       |1545       |1544       |1079     |437.8 |
|2022-01-01|3003 |Audience Network|2813       |125   |490         |71         |382        |381        |355        |312      |50.44 |
|2022-01-01|3004 |Audience Network|2658       |128   |283         |86         |767        |581        |308        |228      |32.68 |
|2022-01-01|3005 |Audience Network|4787       |283   |247         |75

In [23]:
# join daily_df with ads_df using ad_id and Platform
daily_df = daily_df.join(ads_df, ["ad_id", "Platform"], "left")
daily_df.show(truncate=False)

+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-----------+-------+---+--------+
|Ad_ID|Platform        |Date      |Impressions|Clicks|Interactions|Conversions|Quartile_25|Quartile_50|Quartile_75|Completed|Spend |ad_group_id|ad_name|id |ad_type |
+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-----------+-------+---+--------+
|3001 |Facebook        |2022-01-01|1419       |287   |65          |13         |486        |457        |422        |248      |99.61 |2001       |Ad_1   |1  |Carousel|
|3002 |Instagram       |2022-01-01|4844       |1676  |3303        |1613       |1739       |1545       |1544       |1079     |437.8 |2010       |Ad_2   |2  |Image   |
|3003 |Audience Network|2022-01-01|2813       |125   |490         |71         |382        |381        |355        |312      |50.44 |2004       |Ad_3   |3  |Carousel|
|300

In [24]:
daily_df = daily_df.join(group_df, ["ad_group_id"], "left")
daily_df.show(truncate=False)


+-----------+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-------+---+--------+-----------+-------------+---+--------+
|ad_group_id|Ad_ID|Platform        |Date      |Impressions|Clicks|Interactions|Conversions|Quartile_25|Quartile_50|Quartile_75|Completed|Spend |ad_name|id |ad_type |campaign_id|ad_group_name|id |audience|
+-----------+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-------+---+--------+-----------+-------------+---+--------+
|2001       |3001 |Facebook        |2022-01-01|1419       |287   |65          |13         |486        |457        |422        |248      |99.61 |Ad_1   |1  |Carousel|1002       |Group_1      |1  |Women   |
|2010       |3002 |Instagram       |2022-01-01|4844       |1676  |3303        |1613       |1739       |1545       |1544       |1079     |437.8 |Ad_2   |2  |Image   |1004       |Gro

In [25]:

campaign_df = campaign_df.withColumnRenamed("Campaign_ID", "campaign_id")
campaign_df.show(truncate=False)

+-----------+---+-------+---------+
|campaign_id|id |marca  |tipo     |
+-----------+---+-------+---------+
|1001       |1  |Brand B|Awareness|
|1002       |2  |Brand A|Launch   |
|1003       |3  |Brand C|Awareness|
|1004       |4  |Brand B|Awareness|
|1005       |5  |Brand B|Awareness|
+-----------+---+-------+---------+



In [26]:
daily_df = daily_df.join(campaign_df, ["campaign_id"], "left")
daily_df.show(truncate=False)

+-----------+-----------+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-------+---+--------+-------------+---+--------+---+-------+---------+
|campaign_id|ad_group_id|Ad_ID|Platform        |Date      |Impressions|Clicks|Interactions|Conversions|Quartile_25|Quartile_50|Quartile_75|Completed|Spend |ad_name|id |ad_type |ad_group_name|id |audience|id |marca  |tipo     |
+-----------+-----------+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-------+---+--------+-------------+---+--------+---+-------+---------+
|1002       |2001       |3001 |Facebook        |2022-01-01|1419       |287   |65          |13         |486        |457        |422        |248      |99.61 |Ad_1   |1  |Carousel|Group_1      |1  |Women   |2  |Brand A|Launch   |
|1004       |2010       |3002 |Instagram       |2022-01-01|4844       |1676  |3303        |1

In [27]:
daily_df.printSchema()

root
 |-- campaign_id: integer (nullable = true)
 |-- ad_group_id: integer (nullable = true)
 |-- Ad_ID: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Impressions: integer (nullable = true)
 |-- Clicks: integer (nullable = true)
 |-- Interactions: integer (nullable = true)
 |-- Conversions: integer (nullable = true)
 |-- Quartile_25: integer (nullable = true)
 |-- Quartile_50: integer (nullable = true)
 |-- Quartile_75: integer (nullable = true)
 |-- Completed: integer (nullable = true)
 |-- Spend: double (nullable = true)
 |-- ad_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- ad_type: string (nullable = true)
 |-- ad_group_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- audience: string (nullable = true)
 |-- id: string (nullable = true)
 |-- marca: string (nullable = true)
 |-- tipo: string (nullable = true)



In [29]:
# convert column names to snake_case obtaining the column names from the schema an converting to dictonary
columns = daily_df.schema.names
columns = {col: col.replace(" ", "_").lower() for col in columns}
# rename columns using the dictionary
for column in columns:
    daily_df = daily_df.withColumnRenamed(column, columns[column])
daily_df.show(truncate=False)

+-----------+-----------+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-------+---+--------+-------------+---+--------+---+-------+---------+
|campaign_id|ad_group_id|ad_id|platform        |date      |impressions|clicks|interactions|conversions|quartile_25|quartile_50|quartile_75|completed|spend |ad_name|id |ad_type |ad_group_name|id |audience|id |marca  |tipo     |
+-----------+-----------+-----+----------------+----------+-----------+------+------------+-----------+-----------+-----------+-----------+---------+------+-------+---+--------+-------------+---+--------+---+-------+---------+
|1002       |2001       |3001 |Facebook        |2022-01-01|1419       |287   |65          |13         |486        |457        |422        |248      |99.61 |Ad_1   |1  |Carousel|Group_1      |1  |Women   |2  |Brand A|Launch   |
|1004       |2010       |3002 |Instagram       |2022-01-01|4844       |1676  |3303        |1

In [30]:
# reorder columns in df using the list of columns
cols = [
    "campaign_id", "ad_group_id", "ad_group_name", "ad_id", "ad_name", "platform", "date",
    "marca", "tipo", "audience", "ad_type",
    "impressions", "clicks", "spend", "interactions", "conversions", "quartile_25", "quartile_50", "quartile_75", "completed"]
# Lista de columnas del DataFrame actual
actual_cols = set(daily_df.columns)

# Lista de columnas que quieres
expected_cols = set(cols)

# Comparar
missing = expected_cols - actual_cols
extra = actual_cols - expected_cols
print(f"Missing columns: {missing}")
print(f"Extra columns: {extra}")

Missing columns: set()
Extra columns: {'id'}


In [31]:

# reorder columns with list of columns names
daily_df = daily_df.select(cols)
daily_df.show(truncate=False)

+-----------+-----------+-------------+-----+-------+----------------+----------+-------+---------+--------+--------+-----------+------+------+------------+-----------+-----------+-----------+-----------+---------+
|campaign_id|ad_group_id|ad_group_name|ad_id|ad_name|platform        |date      |marca  |tipo     |audience|ad_type |impressions|clicks|spend |interactions|conversions|quartile_25|quartile_50|quartile_75|completed|
+-----------+-----------+-------------+-----+-------+----------------+----------+-------+---------+--------+--------+-----------+------+------+------------+-----------+-----------+-----------+-----------+---------+
|1002       |2001       |Group_1      |3001 |Ad_1   |Facebook        |2022-01-01|Brand A|Launch   |Women   |Carousel|1419       |287   |99.61 |65          |13         |486        |457        |422        |248      |
|1004       |2010       |Group_10     |3002 |Ad_2   |Instagram       |2022-01-01|Brand B|Awareness|Men     |Image   |4844       |1676  |437.

In [32]:

# write the dataframe to csv format in just one file
daily_df.coalesce(1).write.mode("overwrite").csv("./outputs/mfbl/daily_data.csv", header=True)