In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the survey export, using the first CSV column as the row index.
advert = pd.read_csv(
    "data/advert_categories.csv",
    index_col=[0],
)
advert.head(2)

Unnamed: 0,type,social_media,categories,sales_performance,positive_impact,social_vs_sales
0,Electronics & Gadgets,"Facebook, Tik-Tok",Home Appliances,"Yes, a significant increase",Strongly agree,"Yes, high engagement leads to more sales"
1,Clothing & Apparel,"Facebook, Instagram, Twitter/X, Tik-Tok",Clothing & Apparel,"Yes, a significant increase",Strongly agree,"Yes, high engagement leads to more sales"


# Setting the social media types

In [3]:
# Normalise the platform spellings in the social_media column so they match the
# names used for the indicator columns: "Tik-Tok" -> "TikTok", "Twitter/X" -> "Twitter".
# regex=False makes both replacements literal text substitutions regardless of the
# installed pandas version (the default for `regex` changed in pandas 2.0).
advert["social_media"] = (
    advert["social_media"]
    .str.replace("-", "", regex=False)
    .str.replace("/X", "", regex=False)
)

In [4]:
# Platform names (after cleaning) that each get their own 0/1 indicator column.
social_media = [
    "Facebook",
    "Instagram",
    "Twitter",
    "LinkedIn",
    "WhatsApp",
    "TikTok",
    "Telegram",
    "Snapchat",
]

In [5]:
def get_ads(col, platforms=None):
    """Build 0/1 indicator lists showing which platforms each answer mentions.

    Parameters
    ----------
    col : iterable
        One entry per respondent; each entry is a comma-separated string of
        platform names (e.g. "Facebook, TikTok"). Entries that are not strings
        (NaN, None) are treated as mentioning no platform.
    platforms : iterable of str, optional
        Platform names to look for. Defaults to the notebook-level
        ``social_media`` list.

    Returns
    -------
    dict[str, list[int]]
        Maps each platform name to a list with one 0/1 flag per entry of
        ``col``, in the same order as ``col``.
    """
    if platforms is None:
        platforms = social_media
    ads_dict = {platform: [] for platform in platforms}
    for entry in col:
        if isinstance(entry, str):
            # Trim each token so "A,B" and "A, B" are parsed the same way.
            mentioned = {token.strip() for token in entry.split(",")}
        else:
            # Missing answers have no .split(); count them as "no platform".
            mentioned = set()
        for platform in ads_dict:
            ads_dict[platform].append(int(platform in mentioned))
    return ads_dict

In [6]:
# Build one 0/1 indicator column per platform and turn the result into a DataFrame.
# get_ads returns the flags in row order, so the frame reuses advert's index: the
# index-based merge that follows then pairs every indicator row with its own survey
# row even when the index read from the CSV is not a plain 0..n-1 range.
ads_dict = get_ads(advert["social_media"])
ads_df = pd.DataFrame(ads_dict, index=advert.index)

In [7]:
# Attach the platform indicator columns to the survey rows, matching on the row
# index. This is an inner join, so a row must exist in both frames to be kept.
new_advert = advert.merge(
    ads_df,
    how="inner",
    left_index=True,
    right_index=True,
)
new_advert.head(2)

Unnamed: 0,type,social_media,categories,sales_performance,positive_impact,social_vs_sales,Facebook,Instagram,Twitter,LinkedIn,WhatsApp,TikTok,Telegram,Snapchat
0,Electronics & Gadgets,"Facebook, TikTok",Home Appliances,"Yes, a significant increase",Strongly agree,"Yes, high engagement leads to more sales",1,0,0,0,0,1,0,0
1,Clothing & Apparel,"Facebook, Instagram, Twitter, TikTok",Clothing & Apparel,"Yes, a significant increase",Strongly agree,"Yes, high engagement leads to more sales",1,1,1,0,0,1,0,0


# Working on the dependent variable

In [8]:
# Ordered scale for sales_performance, lowest rank first.
# Answers that do not exactly match one of these labels become missing values.
sales_performance_rank = [
    "Sales decrease",
    "No, noticeable change",
    "Yes, a slight increase",
    "Yes, a significant increase",
]
new_advert["sales_performance"] = new_advert["sales_performance"].astype(
    pd.CategoricalDtype(categories=sales_performance_rank, ordered=True)
)

In [9]:
# Ordered agreement scale for positive_impact, lowest rank first.
# Answers that do not exactly match one of these labels become missing values.
positive_impact_rank = [
    "Strongly disagree",
    "Disagree",
    "Neutral",
    "Agree",
    "Strongly agree",
]
new_advert["positive_impact"] = new_advert["positive_impact"].astype(
    pd.CategoricalDtype(categories=positive_impact_rank, ordered=True)
)

In [10]:
# Ordered scale for social_vs_sales, lowest rank first.
# Answers that do not exactly match one of these labels become missing values.
social_vs_sales_rank = [
    "No noticeable impact",
    "Sometimes, but not always",
    "Yes, high engagement leads to more sales",
]
new_advert["social_vs_sales"] = new_advert["social_vs_sales"].astype(
    pd.CategoricalDtype(categories=social_vs_sales_rank, ordered=True)
)

In [11]:
# The dependent variable sales_score: the sum of the three ordinal rank codes
# (each code is the answer's position in its ranking list, starting at 0).
impact_code = new_advert["positive_impact"].cat.codes
engagement_code = new_advert["social_vs_sales"].cat.codes
performance_code = new_advert["sales_performance"].cat.codes

# .cat.codes is -1 for a missing or unlisted answer; adding that in would silently
# lower the score, so rows with any unranked answer get a missing score instead.
unranked = (impact_code < 0) | (engagement_code < 0) | (performance_code < 0)
new_advert["sales_score"] = (impact_code + engagement_code + performance_code).where(~unranked)

In [12]:
# Analysis frame: drop the raw survey columns, leaving categories, the platform
# indicator columns and sales_score.
# Reassigning the result (instead of inplace=True) keeps the step a plain
# expression on its input, in line with pandas guidance against in-place mutation.
new_advert = new_advert.drop(
    columns=["social_media", "type", "sales_performance", "positive_impact", "social_vs_sales"]
)

In [13]:
# Preview the first three rows of the analysis frame.
new_advert.head(3)

Unnamed: 0,categories,Facebook,Instagram,Twitter,LinkedIn,WhatsApp,TikTok,Telegram,Snapchat,sales_score
0,Home Appliances,1,0,0,0,0,1,0,0,9
1,Clothing & Apparel,1,1,1,0,0,1,0,0,9
2,Home Appliances,0,0,0,0,0,1,0,0,6


In [14]:
# Save the cleaned frame for later analysis. to_csv also writes the row index
# by default, as the first column of the file.
new_advert.to_csv("data/clean_advert.csv")