In [131]:
import pandas as pd
import numpy as np

In [132]:
# Read the raw campaign CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../Datasets/Marketing campaign dataset.csv')

In [133]:
# Remove the listed columns from the working frame; every other column is kept.
df = df.drop(
    labels=[
        'total_reach', 'unique_reach', 'position_in_content',
        'max_bid_cpm', 'network_margin', 'approved_budget',
        'exchange_rate', 'stats_currency', 'advertiser_currency',
        'cmi_currency_code', 'landing_page', 'time', 'creative_id',
        'ext_service_id', 'template_id', 'advertiser_id', 'network_id',
        'channel_id',
    ],
    axis='columns',
)

In [134]:
# Collapse the two creative size columns into a single product column.
df['creative_dimension'] = df['creative_height'].mul(df['creative_width'])
df = df.drop(columns=['creative_height', 'creative_width'])

# Discard rows whose product is missing (NaN).
df = df.dropna(subset=['creative_dimension'])

# Two-valued label: "yes" where the product is exactly 75000, "no" everywhere else.
df['has_image'] = df['creative_dimension'].eq(75000).map({True: "yes", False: "no"})

# Show how many rows received each label.
print(df['has_image'].value_counts())

has_image
yes    61775
no      7425
Name: count, dtype: int64


In [135]:
# Keep only fully populated rows: a row is discarded if any of its columns is missing.
df = df.dropna(axis='index', how='any')

In [136]:
import torch
import torch.nn as nn

# Give every distinct campaign_item_id a dense, zero-based index, numbered in the
# order the IDs first appear in the frame.
id_mapping = {id_: idx for idx, id_ in enumerate(df['campaign_item_id'].unique())}
df['campaign_index'] = df['campaign_item_id'].map(id_mapping)

# Embedding table size: one row per distinct campaign, embedding_dim values per row.
num_campaigns = len(id_mapping)  # Total unique campaign IDs
embedding_dim = 16  # Dimensionality of the embedding vectors

# nn.Embedding initialises its weights from torch's global random generator.
# Seed it first so a fresh "Restart & Run All" reproduces the same vectors
# instead of producing different ones on every execution.
EMBEDDING_SEED = 42
torch.manual_seed(EMBEDDING_SEED)

# Create the embedding layer.
campaign_embedding = nn.Embedding(num_embeddings=num_campaigns, embedding_dim=embedding_dim)

# Integer (long) tensor of per-row campaign indices, used to index the embedding table.
campaign_indices = torch.tensor(df['campaign_index'].values, dtype=torch.long)

# Look up one embedding vector per row of df.
embedded_campaigns = campaign_embedding(campaign_indices)
print("Embedding Shape:", embedded_campaigns.shape)  # (number of rows, embedding_dim)

# Print the complete ID -> index mapping so each campaign_index value can be
# traced back to the campaign ID it was derived from.
print("Campaign ID to Index Mapping:")
for campaign_id, campaign_index in id_mapping.items():
    print(f"Campaign ID: {campaign_id} -> Campaign Index: {campaign_index}")

Embedding Shape: torch.Size([69200, 16])
Campaign ID to Index Mapping:
Campaign ID: 2733 -> Campaign Index: 0
Campaign ID: 2766 -> Campaign Index: 1
Campaign ID: 2802 -> Campaign Index: 2
Campaign ID: 2850 -> Campaign Index: 3
Campaign ID: 2851 -> Campaign Index: 4
Campaign ID: 2879 -> Campaign Index: 5
Campaign ID: 3223 -> Campaign Index: 6
Campaign ID: 3054 -> Campaign Index: 7
Campaign ID: 3132 -> Campaign Index: 8
Campaign ID: 3150 -> Campaign Index: 9
Campaign ID: 3149 -> Campaign Index: 10
Campaign ID: 3157 -> Campaign Index: 11
Campaign ID: 3153 -> Campaign Index: 12
Campaign ID: 3154 -> Campaign Index: 13
Campaign ID: 3222 -> Campaign Index: 14
Campaign ID: 3220 -> Campaign Index: 15
Campaign ID: 3221 -> Campaign Index: 16
Campaign ID: 3219 -> Campaign Index: 17
Campaign ID: 3173 -> Campaign Index: 18
Campaign ID: 3166 -> Campaign Index: 19
Campaign ID: 3164 -> Campaign Index: 20
Campaign ID: 3165 -> Campaign Index: 21
Campaign ID: 2934 -> Campaign Index: 22
Campaign ID: 2949 -

In [137]:
# Expose the ext_service_name values under an additional column called "platform".
df = df.assign(platform=df["ext_service_name"])

In [138]:
# Pull the per-row click counts out of the frame as a NumPy array.
click_values = df['clicks'].to_numpy()

# Report the array's shape as a quick sanity check.
print(f"clicks shape: {click_values.shape}")

clicks shape: (69200,)


In [139]:
from sklearn.preprocessing import MinMaxScaler


# Percentile of the click distribution used as the outlier cut-off.
# The comment and printed label previously said "95th" while the code computed the
# 85th percentile. The computed value is kept (so the filtered data is unchanged)
# and the label is now built from this constant so the two cannot drift apart.
CLICK_PERCENTILE = 85

# click_values is the clicks array extracted in the previous cell.
threshold = np.percentile(click_values, CLICK_PERCENTILE)
print(f"{CLICK_PERCENTILE}th Percentile Threshold: {threshold}")

# Keep only the rows whose clicks are at or below the threshold (the comparison is
# inclusive). .copy() makes the filtered frame independent of the frame it was
# sliced from, so the column assignments below never write to a slice and cannot
# trigger pandas' SettingWithCopyWarning.
df = df[df['clicks'] <= threshold].copy()

# Click-through rate per row. The +1 in the denominator presumably guards against
# division by zero for rows with 0 impressions -- confirm this is the intent.
df['ctr'] = df['clicks'] / (df['impressions'] + 1)

# Rescale ctr with MinMaxScaler (default output range 0..1), fitted on the
# filtered rows only.
scaler = MinMaxScaler()
df['engagement_score'] = scaler.fit_transform(df[['ctr']])


95th Percentile Threshold: 19.0


In [140]:
# Summarise the filtered frame: row count, column names, non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 58930 entries, 0 to 69199
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   campaign_item_id     58930 non-null  int64  
 1   no_of_days           58930 non-null  int64  
 2   ext_service_name     58930 non-null  object 
 3   search_tags          58930 non-null  object 
 4   advertiser_name      58930 non-null  object 
 5   channel_name         58930 non-null  object 
 6   campaign_budget_usd  58930 non-null  float64
 7   impressions          58930 non-null  int64  
 8   clicks               58930 non-null  int64  
 9   currency_code        58930 non-null  object 
 10  media_cost_usd       58930 non-null  float64
 11  search_tag_cat       58930 non-null  object 
 12  timezone             58930 non-null  object 
 13  weekday_cat          58930 non-null  object 
 14  keywords             58930 non-null  object 
 15  creative_dimension   58930 non-null  floa

In [141]:
# Scale engagement_score by 100 and round the result to two decimal places.
df['engagement_percentage'] = df['engagement_score'].mul(100).round(2)

# Show the percentage alongside the campaign index for each row.
print(df.loc[:, ['campaign_index', 'engagement_percentage']])

       campaign_index  engagement_percentage
0                   0                  29.44
5                   1                  42.92
6                   1                  66.76
7                   1                  48.56
8                   1                  26.80
...               ...                    ...
69194             140                  47.51
69195             140                  28.98
69196             140                  16.63
69198             140                  29.30
69199             140                  14.24

[58930 rows x 2 columns]


In [142]:
# Remove the raw ID, the intermediate metrics and the source columns that were
# used to derive campaign_index, platform and engagement_percentage.
df = df.drop(
    labels=[
        'campaign_item_id', 'ctr', 'weekday_cat',
        'creative_dimension', 'media_cost_usd', 'clicks', 'impressions',
        'ext_service_name', 'engagement_score',
    ],
    axis='columns',
)

In [143]:
# Destination file for the exported table.
final_csv_path = '../Datasets/Database_data.csv'

# Write the frame as CSV, leaving the DataFrame index out of the file.
df.to_csv(path_or_buf=final_csv_path, index=False)

print(f"Final DataFrame saved to {final_csv_path}")

Final DataFrame saved to ../Datasets/Database_data.csv


In [144]:
from pymongo import MongoClient

# Convert the DataFrame to a list of dictionaries, one per row; each dictionary
# becomes one MongoDB document.
data = df.to_dict(orient="records")

# Connect to MongoDB. The client is left open on purpose so that `collection`
# stays usable after this cell.
client = MongoClient("mongodb://localhost:27017/")
db = client["Customer_Engagement"]  # Database name
collection = db["Prediction_details"]  # Collection name

# NOTE(review): insert_many only appends. Re-running this cell stores the same
# rows again, so the collection accumulates duplicates across executions.
# insert_many rejects an empty document list, so only call it when the frame
# actually produced rows; otherwise report that nothing was written.
if data:
    collection.insert_many(data)
    print("Data inserted into MongoDB successfully!")
else:
    print("No rows to insert; MongoDB collection left unchanged.")

Data inserted into MongoDB successfully!
