In [61]:
import pandas as pd
import networkx as nx
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np

In [62]:
# Read the raw marketing-campaign CSV from the Datasets folder
df = pd.read_csv("../Datasets/Marketing campaign dataset.csv")

# Columns carried forward into the rest of the notebook (order is preserved)
selected_columns = [
    'campaign_item_id',
    'advertiser_id',
    'advertiser_name',
    'ext_service_id',
    'ext_service_name',
    'creative_id',
    'creative_width',
    'creative_height',
    'search_tags',
    'template_id',
    'channel_id',
    'channel_name',
    'campaign_budget_usd',
    'media_cost_usd',
    'impressions',
    'clicks',
    'time',
    'weekday_cat',
    'landing_page',
    'network_id',
    'keywords',
]

# Restrict the frame to those columns and parse 'time' into datetimes for later use
df = df[selected_columns].assign(time=lambda frame: pd.to_datetime(frame['time']))

# Preview the first rows of the prepared frame
print(df.head())

   campaign_item_id  advertiser_id advertiser_name  ext_service_id  \
0              2733           4756             Web             128   
1              2733           4756             Web              16   
2              2733           4756             Web             128   
3              2733           4756             Web             128   
4              2733           4756             Web               4   

  ext_service_name  creative_id  creative_width  creative_height  \
0     Facebook Ads         1000           300.0            250.0   
1            DV360         1000           300.0            250.0   
2     Facebook Ads         1000           300.0            250.0   
3     Facebook Ads         1000           300.0            250.0   
4       Google Ads         1000           300.0            250.0   

                          search_tags  template_id  ...  channel_name  \
0                     #The Power of X         90.0  ...        Mobile   
1                      #

In [63]:
# Count the missing values in every column of df
null_values = df.isna().sum()

# Report the per-column counts (columns without gaps show 0)
print("Null values in each column:\n", null_values)

# Report how many rows contain at least one missing value
print("Total rows with any missing values:", df.isna().any(axis='columns').sum())

Null values in each column:
 campaign_item_id          0
advertiser_id             0
advertiser_name           0
ext_service_id            0
ext_service_name          0
creative_id               0
creative_width         3412
creative_height        3412
search_tags               0
template_id            3412
channel_id                0
channel_name              0
campaign_budget_usd       0
media_cost_usd            0
impressions               0
clicks                    0
time                      0
weekday_cat               0
landing_page              0
network_id                0
keywords                  0
dtype: int64
Total rows with any missing values: 3412


In [64]:
# Collapse the two creative size columns into a single feature: height * width
df['creative_dimension'] = df['creative_height'].mul(df['creative_width'])
# The product stands in for the separate size columns from here on
df = df.drop(['creative_height', 'creative_width'], axis='columns')

In [65]:
# Keep only the rows that actually have a 'creative_dimension' value (drops NaN rows)
has_dimension = df['creative_dimension'].notna()
df = df[has_dimension].copy()

# Preview the cleaned frame
print(df.head())

   campaign_item_id  advertiser_id advertiser_name  ext_service_id  \
0              2733           4756             Web             128   
1              2733           4756             Web              16   
2              2733           4756             Web             128   
3              2733           4756             Web             128   
4              2733           4756             Web               4   

  ext_service_name  creative_id                         search_tags  \
0     Facebook Ads         1000                     #The Power of X   
1            DV360         1000                      #Be Bold. Be X   
2     Facebook Ads         1000  #Embrace Your Individuality with X   
3     Facebook Ads         1000                      #Be Bold. Be X   
4       Google Ads         1000                      #Be Bold. Be X   

   template_id  channel_id channel_name  campaign_budget_usd  media_cost_usd  \
0         90.0          32       Mobile           652.173913       14.05

In [66]:
# Print df itself rather than only its head, to eyeball the frame after the
# null-dimension rows were removed (the printed output elides the middle rows).
print(df)

       campaign_item_id  advertiser_id advertiser_name  ext_service_id  \
0                  2733           4756             Web             128   
1                  2733           4756             Web              16   
2                  2733           4756             Web             128   
3                  2733           4756             Web             128   
4                  2733           4756             Web               4   
...                 ...            ...             ...             ...   
69195              3960           5857           Cairo             128   
69196              3960           5857           Cairo             128   
69197              3960           5857           Cairo             128   
69198              3960           5857           Cairo             128   
69199              3960           5857           Cairo              16   

      ext_service_name  creative_id                               search_tags  \
0         Facebook Ads        

In [67]:
# One-hot encode the channel and external-service names as 0/1 integer columns,
# prefixed 'channel_' and 'service_' respectively
encoded_channels = pd.get_dummies(df['channel_name'], prefix='channel', dtype=int)
encoded_services = pd.get_dummies(df['ext_service_name'], prefix='service', dtype=int)

# Swap the two text columns for their indicator columns (appended on the right)
df = pd.concat(
    [
        df.drop(columns=['channel_name', 'ext_service_name']),
        encoded_channels,
        encoded_services,
    ],
    axis=1,
)


In [68]:
from sklearn.preprocessing import LabelEncoder

# Replace each advertiser name with an integer code.
# The encoder is fitted first and then applied to the same column.
le = LabelEncoder()
le.fit(df['advertiser_name'])
df['advertiser_name'] = le.transform(df['advertiser_name'])

In [69]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

# Integer advertiser codes (the column was label-encoded in the previous cell)
advertiser_ids = df['advertiser_name'].values

# Embedding table shape: one row per distinct advertiser code, 10 values per row
num_advertisers = df['advertiser_name'].nunique()
embedding_size = 10

# Network: advertiser code -> embedding lookup -> flatten -> single linear unit
advertiser_input = Input(shape=(1,), dtype='int32')
embedding_layer = Embedding(
    input_dim=num_advertisers,
    output_dim=embedding_size,
)(advertiser_input)
flattened = Flatten()(embedding_layer)
output = Dense(1, activation='linear')(flattened)  # placeholder regression head

# Assemble and compile with Adam + MSE
model = Model(inputs=advertiser_input, outputs=output)
model.compile(optimizer=Adam(), loss='mean_squared_error')

# NOTE(review): the target below is uniform random noise, so the learned
# embeddings carry no signal until it is replaced with a real target
# (e.g., clicks or impressions).
target = np.random.rand(len(advertiser_ids))
model.fit(x=advertiser_ids, y=target, epochs=10, batch_size=32)

# layers[1] is the Embedding layer (layers[0] is the input layer);
# its first weight array is the embedding matrix
embeddings = model.layers[1].get_weights()[0]

# Persist the embedding matrix for later use
np.save('../Datasets/advertiser_embeddings.npy', embeddings)


Epoch 1/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.1037
Epoch 2/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0839
Epoch 3/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0836
Epoch 4/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0841
Epoch 5/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0842
Epoch 6/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0839
Epoch 7/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0839
Epoch 8/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0844
Epoch 9/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0835
Epoch 10/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [70]:
from sklearn.preprocessing import LabelEncoder

# Replace each keywords value with an integer code.
# Note that this rebinds `le` to a fresh encoder fitted on the keywords column.
le = LabelEncoder()
le.fit(df['keywords'])
df['keywords'] = le.transform(df['keywords'])

In [71]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

# Integer keyword codes (the 'keywords' column was label-encoded in the previous cell)
keyword_ids = df['keywords'].values

# Embedding parameters
num_keywords = df['keywords'].nunique()  # Number of unique keyword codes
embedding_size = 10  # Dimensionality of the embedding

# Define the model with an embedding layer.
# The table is sized with num_keywords and the model is fed keyword_ids.
# Using the advertiser variables here would train a second advertiser table
# and save it under the keywords file name.
keyword_input = Input(shape=(1,), dtype='int32')
embedding_layer = Embedding(input_dim=num_keywords, output_dim=embedding_size)(keyword_input)
flattened = Flatten()(embedding_layer)
output = Dense(1, activation='linear')(flattened)  # Example output layer (you can replace with your target)

# Compile the model
model = Model(inputs=keyword_input, outputs=output)
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model.
# NOTE(review): the target is still uniform random noise, so the embeddings
# carry no signal until it is replaced with a real target (e.g., clicks or impressions).
target = np.random.rand(len(keyword_ids))
model.fit(keyword_ids, target, epochs=10, batch_size=32)

# Extract learned embeddings: layers[1] is the Embedding layer, and its first
# weight array is the (num_keywords, embedding_size) matrix
embeddings_key = model.layers[1].get_weights()[0]

# Save embeddings (for future use)
np.save('../Datasets/keywords_embeddings.npy', embeddings_key)


Epoch 1/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.1019
Epoch 2/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0832
Epoch 3/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0832
Epoch 4/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0833
Epoch 5/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0831
Epoch 6/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0838
Epoch 7/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.0840
Epoch 8/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.0840
Epoch 9/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0836
Epoch 10/10
[1m2163/2163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [76]:
# Collect the columns whose dtype is still object or category
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
# Heading and column index are printed on separate lines
print("Categorical Columns:", categorical_columns, sep="\n")

Categorical Columns:
Index(['search_tags', 'landing_page'], dtype='object')


In [77]:
# List the distinct values present in the 'search_tags' column
print(pd.unique(df['search_tags']))

['#The Power of X' '#Be Bold. Be X' '#Embrace Your Individuality with X'
 '#The Ultimate Fashion Statement with X'
 '#The X Factor - Fashion for the Fearless' '#Timeless X Style']


In [None]:
# One-hot encode 'weekday_cat'. The empty prefix and separator make each new
# column carry the bare category value as its name.
encoded_week_cat = pd.get_dummies(df['weekday_cat'], prefix='', prefix_sep='', dtype=int)

# Append the indicator columns and drop the original 'weekday_cat' column
df = pd.concat([df, encoded_week_cat], axis=1).drop(columns='weekday_cat')

# Preview the updated DataFrame
print(df.head())


   campaign_item_id  advertiser_id  advertiser_name  ext_service_id  \
0              2733           4756               37             128   
1              2733           4756               37              16   
2              2733           4756               37             128   
3              2733           4756               37             128   
4              2733           4756               37               4   

   creative_id                         search_tags  template_id  channel_id  \
0         1000                     #The Power of X         90.0          32   
1         1000                      #Be Bold. Be X         90.0           8   
2         1000  #Embrace Your Individuality with X         90.0           8   
3         1000                      #Be Bold. Be X         90.0          64   
4         1000                      #Be Bold. Be X         90.0          32   

   campaign_budget_usd  media_cost_usd  ...  channel_Display  channel_Mobile  \
0           652.17