### Importing relevant libraries


In [100]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer




In [101]:
# Read the raw marketing-campaign CSV into a DataFrame.
dataset_path = "../Datasets/Marketing campaign dataset.csv"
data = pd.read_csv(dataset_path)

# Echo the frame as plain text for a first look at its contents.
print(data)


       campaign_item_id  no_of_days        time  ext_service_id  \
0                  2733           7  2022-05-01             128   
1                  2733           8  2022-05-02              16   
2                  2733           9  2022-05-03             128   
3                  2733          10  2022-05-04             128   
4                  2733          11  2022-05-05               4   
...                 ...         ...         ...             ...   
72607              3567          11  2022-11-28               4   
72608              3567          12  2022-11-29             128   
72609              3567          13  2022-11-30             128   
72610              3567          14  2022-12-01               4   
72611              3567          15  2022-12-02              16   

      ext_service_name  creative_id  creative_width  creative_height  \
0         Facebook Ads         1000           300.0            250.0   
1                DV360         1000           300.0

In [102]:
# List every column's dtype to see which columns are numeric and which are object-typed.
print(data.dtypes)

campaign_item_id         int64
no_of_days               int64
time                    object
ext_service_id           int64
ext_service_name        object
creative_id              int64
creative_width         float64
creative_height        float64
search_tags             object
template_id            float64
landing_page            object
advertiser_id            int64
advertiser_name         object
network_id               int64
approved_budget        float64
advertiser_currency     object
channel_id               int64
channel_name            object
max_bid_cpm            float64
network_margin         float64
campaign_budget_usd    float64
impressions              int64
clicks                   int64
stats_currency          object
currency_code           object
exchange_rate            int64
media_cost_usd         float64
position_in_content    float64
unique_reach           float64
total_reach            float64
search_tag_cat          object
cmi_currency_code       object
timezone

In [103]:
# Select relevant features
selected_columns = [
    'campaign_item_id', 'advertiser_id', 'advertiser_name', 'creative_id', 'creative_width', 
    'creative_height', 'search_tags', 'template_id', 'channel_id', 'channel_name', 
    'campaign_budget_usd', 'impressions', 'clicks', 'media_cost_usd', 'weekday_cat', 
    'keywords', 'time', 'ext_service_name'
]

# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so the column assignments below (and in later cells)
# write to it unambiguously instead of raising SettingWithCopyWarning.
data = data[selected_columns].copy()

# Convert 'time' to datetime format
data['time'] = pd.to_datetime(data['time'])

In [104]:
# List of categorical columns to encode
categorical_columns = ['ext_service_name', 'channel_name', 'weekday_cat', 'search_tags', 'advertiser_name', 'keywords']

# Fit a separate LabelEncoder per column and keep each one, keyed by column
# name. Refitting a single shared encoder would overwrite its classes_ on
# every iteration, leaving only the last column's mapping recoverable.
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# After the loop, `label_encoder` is the encoder fitted on the last column in
# the list; use label_encoders[col].inverse_transform(...) to decode any column.

# Display the DataFrame after encoding
print(data.head())


   campaign_item_id  advertiser_id  advertiser_name  creative_id  \
0              2733           4756               42         1000   
1              2733           4756               42         1000   
2              2733           4756               42         1000   
3              2733           4756               42         1000   
4              2733           4756               42         1000   

   creative_width  creative_height  search_tags  template_id  channel_id  \
0           300.0            250.0            2         90.0          32   
1           300.0            250.0            0         90.0           8   
2           300.0            250.0            1         90.0           8   
3           300.0            250.0            0         90.0          64   
4           300.0            250.0            0         90.0          32   

   channel_name  campaign_budget_usd  impressions  clicks  media_cost_usd  \
0             1           652.173913          837       8

In [105]:
# Fold the two creative size columns into one feature: their element-wise product.
size_columns = ['creative_height', 'creative_width']
data['creative_dimension'] = data[size_columns[0]] * data[size_columns[1]]

# The raw size columns are dropped once the combined feature exists.
data = data.drop(columns=size_columns)


In [106]:
# Boolean mask marking every missing cell in the frame
missing_mask = data.isnull()

# Per-column count of missing entries
null_values = missing_mask.sum()
print("Null values in each column:\n", null_values)

# Number of rows that contain at least one missing value
rows_with_missing = missing_mask.any(axis=1).sum()
print("Total rows with any missing values:", rows_with_missing)


Null values in each column:
 campaign_item_id          0
advertiser_id             0
advertiser_name           0
creative_id               0
search_tags               0
template_id            3412
channel_id                0
channel_name              0
campaign_budget_usd       0
impressions               0
clicks                    0
media_cost_usd            0
weekday_cat               0
keywords                  0
time                      0
ext_service_name          0
creative_dimension     3412
dtype: int64
Total rows with any missing values: 3412


In [107]:
# Columns whose missing entries are replaced with 0
columns_to_fill = ['creative_dimension']

# Build the filled sub-frame first, then write it back over the same columns.
filled_subset = data[columns_to_fill].fillna(0)
data[columns_to_fill] = filled_subset

# Confirm no missing values remain in the filled columns (expect 0 for each)
print(data[columns_to_fill].isna().sum())

# Preview the first rows of the updated dataset
print(data.head(5))

creative_dimension    0
dtype: int64
   campaign_item_id  advertiser_id  advertiser_name  creative_id  search_tags  \
0              2733           4756               42         1000            2   
1              2733           4756               42         1000            0   
2              2733           4756               42         1000            1   
3              2733           4756               42         1000            0   
4              2733           4756               42         1000            0   

   template_id  channel_id  channel_name  campaign_budget_usd  impressions  \
0         90.0          32             1           652.173913          837   
1         90.0           8             3           652.173913         2634   
2         90.0           8             3           652.173913         2135   
3         90.0          64             4           652.173913         2327   
4         90.0          32             1           652.173913         1538   

   clic

In [108]:
# Summary statistics (count, mean, min, quartiles, max, std) for every column of the prepared frame.
data.describe()

Unnamed: 0,campaign_item_id,advertiser_id,advertiser_name,creative_id,search_tags,template_id,channel_id,channel_name,campaign_budget_usd,impressions,clicks,media_cost_usd,weekday_cat,keywords,time,ext_service_name,creative_dimension
count,72612.0,72612.0,72612.0,72612.0,72612.0,69200.0,72612.0,72612.0,72612.0,72612.0,72612.0,72612.0,72612.0,72612.0,72612,72612.0,72612.0
mean,3130.143282,6195.862213,23.330882,7450.124842,2.48828,83.082659,21.84778,2.001474,617.630284,1370.161847,52.696386,11.819999,0.140307,58.433537,2022-09-26 18:40:05.949429760,0.999931,63806.602215
min,2733.0,4756.0,0.0,1000.0,0.0,23.0,1.0,0.0,2.452316,511.0,2.0,0.0,0.0,0.0,2022-05-01 00:00:00,0.0,0.0
25%,3148.0,6319.0,17.0,3725.0,1.0,90.0,4.0,1.0,205.99455,526.0,5.0,0.435791,0.0,29.0,2022-08-20 00:00:00,0.0,75000.0
50%,3173.0,6385.0,25.0,7855.0,2.0,90.0,8.0,2.0,377.656676,577.0,8.0,1.709001,0.0,58.0,2022-10-11 00:00:00,1.0,75000.0
75%,3202.0,6394.0,33.0,10995.0,4.0,90.0,32.0,3.0,755.313351,816.0,13.0,8.276369,0.0,88.0,2022-11-05 00:00:00,2.0,75000.0
max,3960.0,6490.0,43.0,15605.0,5.0,93.0,64.0,4.0,39559.896155,153959.0,31807.0,2295.028945,1.0,117.0,2022-12-10 00:00:00,2.0,75000.0
std,142.154918,387.864576,11.997667,4062.384982,1.70249,20.847618,23.759286,1.413565,1354.606619,3702.699962,377.308103,48.370659,0.347308,34.024005,,0.815903,26724.941835


In [109]:
# Inspect template_id: distribution, number of missing entries, and the set
# of distinct values. Each result is printed explicitly, because a notebook
# cell only auto-displays its final expression and silently drops the rest.
template_ids = data["template_id"]
print(template_ids.describe())
print("Missing template_id values:", template_ids.isnull().sum())  # Count of missing values
print(template_ids.unique())


[90. 23. 92. 89. 93. nan]


In [110]:
# template_id is an identifier, not a measurement: averaging it yields a value
# that matches no observed template. Fill gaps with the most frequent id so
# every row keeps a value drawn from the observed set.
# NOTE(review): if "no template" should stay distinguishable, use a sentinel id instead.
template_mode = data["template_id"].mode(dropna=True)
if not template_mode.empty:  # mode() is empty when the column is entirely NaN
    data["template_id"] = data["template_id"].fillna(template_mode.iloc[0])

In [111]:
# Write the prepared frame to CSV; index=False keeps the row index out of the file.
data.to_csv("../Datasets/Dataset.csv",  index=False)