In [2]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import re
from tqdm import tqdm
from typing import Optional, Any, List, Tuple, Dict

In [3]:
# Keywords request response
requests_df = pd.read_csv("keywords_req&resp_163519(22-28July).csv")
requests_df.shape

(930730, 6)

In [4]:
requests_df.head()

Unnamed: 0,trimmed_keyword,keywords,word_cnt,request,response,RR
0,lipstick,lipstick,1,213661,212057,99.249278
1,alpsgoodnessrosemarywaterrefillpack,alps goodness rosemary water refill pack,6,98715,24434,24.752064
2,sunscreen,sunscreen,1,86398,86398,100.0
3,facewash,face wash,2,76316,76004,99.591174
4,foundation,foundation,1,61821,61821,100.0


In [5]:
# Keyword campaign level data
campaign_keywords = pd.read_csv("targetted_keywords_in_campaign_29July_163519.csv")
campaign_keywords.shape

(67335, 11)

In [6]:
# Cleaning & filtering the Data

# Step 1: Filter the data where is_negative is 'False'
filtered_df = campaign_keywords[campaign_keywords['is_negative'] == False]

# Step 2: Filter the data where status_type is 'ACTIVE'
filtered_df = filtered_df[filtered_df['status_type'] == 'ACTIVE']

# Step 3: Group by 'marketing_campaign_id' and 'text'
grouped_df = filtered_df.groupby(['keyword_id','marketing_campaign_id', 'text', 'match_type'], as_index=False).first()

# Step 4: Drop duplicate combinations (this is automatically handled by the groupby and first combination)
cleaned_camp_key_df = grouped_df.drop_duplicates(subset=['keyword_id','marketing_campaign_id', 'text', 'match_type'])

In [7]:
cleaned_camp_key_df.head()

Unnamed: 0,keyword_id,marketing_campaign_id,text,match_type,marketplace_client_id,agency_id,is_negative,status_type,bidding_value,bidding_currency,bidding_value_usd
0,135196,375534,sheet mask,PHRASE,163519,122,False,ACTIVE,30.0,INR,0.4059
1,135197,375534,face mask,PHRASE,163519,122,False,ACTIVE,25.0,INR,0.3383
2,135198,375534,korean mask,PHRASE,163519,122,False,ACTIVE,25.0,INR,0.3383
3,135199,375534,mask,PHRASE,163519,122,False,ACTIVE,25.0,INR,0.3383
4,389279,551819,Pilgrim Serum,BROAD,163519,122,False,ACTIVE,30.0,INR,0.4059


In [8]:
cleaned_camp_key_df.shape

(44974, 11)

In [9]:
campaign_keywords[campaign_keywords['status_type'] == "ACTIVE"]

Unnamed: 0,marketplace_client_id,agency_id,keyword_id,marketing_campaign_id,text,is_negative,match_type,status_type,bidding_value,bidding_currency,bidding_value_usd
0,163519,122,1268001,245954,face wash,True,EXACT,ACTIVE,0.00,,0.0000
1,163519,122,1268003,245954,vitamin c face wash,True,EXACT,ACTIVE,0.00,,0.0000
2,163519,122,1268004,245954,face scrub,True,EXACT,ACTIVE,0.00,,0.0000
3,163519,122,1268002,245954,face wash for oily skin,True,EXACT,ACTIVE,0.00,,0.0000
17,163519,122,1265600,290230,best body wash for glowing skin,True,EXACT,ACTIVE,0.00,,0.0000
...,...,...,...,...,...,...,...,...,...,...,...
67330,163519,122,769142,668804,best under eye patches,False,EXACT,ACTIVE,16.67,INR,0.2256
67331,163519,122,769145,668804,eye patches for dark circles,False,EXACT,ACTIVE,16.67,INR,0.2256
67332,163519,122,756625,668804,dark circle remover under eye patch,False,EXACT,ACTIVE,16.67,INR,0.2256
67333,163519,122,769158,668804,cooling eye patches,False,EXACT,ACTIVE,16.67,INR,0.2256


### Subset the data for keyword & match type 

In [10]:
keyword_match_df = cleaned_camp_key_df[['text', 'match_type']]
keyword_match_df = keyword_match_df.drop_duplicates(subset=['text', 'match_type'])
keyword_match_df.shape

(31508, 2)

In [11]:
keyword_match_df.head()

Unnamed: 0,text,match_type
0,sheet mask,PHRASE
1,face mask,PHRASE
2,korean mask,PHRASE
3,mask,PHRASE
4,Pilgrim Serum,BROAD


In [12]:
len(keyword_match_df['text'].unique())

23865

In [13]:
# Filter out EXACT & Phrase match type keywords
exact_keywords_df = keyword_match_df[keyword_match_df['match_type']=="EXACT"]
phrase_keywords_df = keyword_match_df[keyword_match_df['match_type']=="PHRASE"]
broad_keywords_df = keyword_match_df[keyword_match_df['match_type']=="BROAD"]

In [14]:
print("Exact: ", exact_keywords_df.shape)
print("Phrase: ", phrase_keywords_df.shape)
print("Broad: ", broad_keywords_df.shape)  # broad 584, phrase 7, exact 413

Exact:  (16084, 2)
Phrase:  (7571, 2)
Broad:  (7853, 2)


In [15]:
broad_keywords_df.head()

Unnamed: 0,text,match_type
4,Pilgrim Serum,BROAD
14,the Ordinary,BROAD
15,COSRX,BROAD
325,hair color,BROAD
326,garnier hair colour,BROAD


### Exact Matching Handling

In [16]:
# For Exact we have to direct map the request volume
exact_keywords_req_df = pd.merge(exact_keywords_df, requests_df[['keywords', 'request', 'response', 'RR']], left_on='text', right_on='keywords', how='left')
print(exact_keywords_req_df.isnull().sum())
# exact_keywords_req_df = exact_keywords_req_df.drop('keywords', axis=1)
exact_keywords_req_df = exact_keywords_req_df.fillna(0)

text             0
match_type       0
keywords      4816
request       4816
response      4816
RR            4816
dtype: int64


In [17]:
exact_keywords_req_df.head(20)

Unnamed: 0,text,match_type,keywords,request,response,RR
0,Minimalist,EXACT,0,0.0,0.0,0.0
1,Plum,EXACT,0,0.0,0.0,0.0
2,Serum,EXACT,0,0.0,0.0,0.0
3,Plum Serum,EXACT,0,0.0,0.0,0.0
4,henna powder,EXACT,henna powder,1499.0,1499.0,100.0
5,mascara,EXACT,mascara,19388.0,19334.0,99.721477
6,mascara waterproof,EXACT,mascara waterproof,17487.0,17485.0,99.988563
7,mascara for women,EXACT,mascara for women,12.0,12.0,100.0
8,maskara for eye waterproof,EXACT,maskara for eye waterproof,5.0,0.0,0.0
9,lipstick,EXACT,lipstick,213661.0,212057.0,99.249278


In [18]:
# Add one extra column as veriations, here we're renaming keyword to variations
exact_keywords_req_df.rename(columns={'keywords':'keyword_variation'}, inplace=True)

In [19]:
exact_keywords_req_df.tail()

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
16079,dermatouch kojic acid serum,EXACT,dermatouch kojic acid serum,133.0,126.0,94.736842
16080,simple hydrating light moisturizer,EXACT,simple hydrating light moisturizer,140.0,136.0,97.142857
16081,ponds hyaluronic acid serum,EXACT,ponds hyaluronic acid serum,94.0,92.0,97.87234
16082,Mist,EXACT,0,0.0,0.0,0.0
16083,Body Mist,EXACT,0,0.0,0.0,0.0


In [20]:
exact_keywords_req_df.shape

(16084, 6)

### Phrase Matching Handling

In [32]:
# Handling the phrase variations by Extracting the phrase variation of phrase matched targetted keywords

# Function for the manipulations on the targeted keywords
def targeted_keywords_manipulation(targeted_keywords) -> pd.DataFrame:
    targeted_keywords["lower_tar_keyword"] = (targeted_keywords["text"].str.lower().str.strip())
    targeted_keywords = targeted_keywords.loc[(~(targeted_keywords["lower_tar_keyword"].isna())) & (targeted_keywords["lower_tar_keyword"] != "")]    
    return targeted_keywords.lower_tar_keyword

# Function for the manipulations on the search queries
def search_query_manipulation(search_queries)->Tuple[pd.DataFrame, List[str]]:
    search_queries["lower_search_keyword"] = search_queries["keywords"].str.lower().str.strip()
    search_query_df_v2 = (
        search_queries.groupby(["trimmed_keyword", "lower_search_keyword", "word_cnt"])
                                .agg({"request": "sum", "response": "sum"})
                                .reset_index()
                                .sort_values(by="request", ascending=False)
                                )

    phrase_df = search_query_df_v2.copy()
    phrase_df = phrase_df.rename(
                                columns={
                                    "lower_search_keyword": "phrase_keyword",
                                    "request": "phrase_request",
                                    "response": "phrase_response_cnt",
                                }
                                ).drop(columns=["trimmed_keyword", "word_cnt"])
    
    # Create a list with phrase candidates
    phrase_set_lst = phrase_df["phrase_keyword"].tolist()
    return phrase_df, phrase_set_lst

def find_phrase_variations(targetted_keywords: List[str], search_query: List[str], score_cutoff=100):
    targeted_query_len = len(targetted_keywords)
    
    # Filter phrases that contain the targeted keywords and are of equal or greater length
    filtered_phrase_set = [query for query in search_query if len(query) >= targeted_query_len and targetted_keywords in query]
    
    matches = process.extract(targetted_keywords, filtered_phrase_set, scorer=fuzz.token_set_ratio, limit=None)
    score_filtered_lst = [key for key, score, _ in matches if score == score_cutoff]
    
    return score_filtered_lst

In [33]:
cleaned_phrase_keywords = targeted_keywords_manipulation(phrase_keywords_df)
cleaned_phrase_keywords = pd.DataFrame(phrase_keywords_df) # dataset should be in dataframe
cleaned_phrase_keywords.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targeted_keywords["lower_tar_keyword"] = (targeted_keywords["text"].str.lower().str.strip())


Unnamed: 0,text,match_type,lower_tar_keyword
0,sheet mask,PHRASE,sheet mask
1,face mask,PHRASE,face mask
2,korean mask,PHRASE,korean mask
3,mask,PHRASE,mask
5,Face Serum,PHRASE,face serum


In [34]:
cleaned_phrase_keywords.shape

(7571, 3)

In [35]:
phrase_df, phrase_set_lst = search_query_manipulation(requests_df)
phrase_set_lst   # This has to be in series

['lipstick',
 'alps goodness rosemary water refill pack',
 'sunscreen',
 'face wash',
 'foundation',
 'dermdoc face serum',
 'serum',
 'lip balm',
 'dermdoc underarm darkness',
 'alps goodness rose water',
 'shampoo',
 'toner',
 'rosemary',
 'hair serum',
 'rosemary water',
 'concealer',
 'face pack',
 'eyeliner',
 'body lotion',
 'body wash',
 'good vibes face wash',
 'eyeshadow',
 'maybelline lipstick',
 'primer',
 'ny bae pro strobe cream 3 in 1 foundation',
 'faces canada foundation',
 'cetaphil cleanser',
 'lip gloss',
 'mars lipstick',
 'nail polish',
 'alps goodness rosemary water',
 'hair mask',
 'moisturizer for face',
 'mama earth face wash',
 'kajal',
 'eye liner waterproof',
 'night cream',
 'vitamin c face serum',
 'compact',
 'skincare',
 'rose water',
 'dot and key sunscreen',
 'simple face wash',
 'loreal shampoo',
 'face moisturizer',
 'face serum',
 'dermdoc',
 'niacinamide serum',
 'conditioner',
 'mascara',
 'highlighter',
 'swiss beauty lipstick',
 'kajal waterproo

In [36]:
# Finding Sub-queries
tqdm.pandas(desc="Finding the subqueries")
cleaned_phrase_keywords.loc[:, "matching_targets"] = cleaned_phrase_keywords["lower_tar_keyword"].progress_apply(lambda x: find_phrase_variations(x, phrase_set_lst))
cleaned_phrase_keywords

Finding the subqueries: 100%|██████████████████████████████████████████████████████| 7571/7571 [37:30<00:00,  3.36it/s]


Unnamed: 0,text,match_type,lower_tar_keyword,matching_targets
0,sheet mask,PHRASE,sheet mask,"[sheet mask, garnier sheet mask, good vibes sh..."
1,face mask,PHRASE,face mask,"[face mask, good vibes face mask, korean face ..."
2,korean mask,PHRASE,korean mask,"[korean mask, tomato korean mask, korean mask ..."
3,mask,PHRASE,mask,"[hair mask, sheet mask, face mask, golden peel..."
5,Face Serum,PHRASE,face serum,"[dermdoc face serum, vitamin c face serum, fac..."
...,...,...,...,...
44851,Insight Foundation,PHRASE,insight foundation,"[insight foundation, insight foundation hd, mn..."
44852,Insight Cosmetics foundation,PHRASE,insight cosmetics foundation,"[insight cosmetics foundation, insight cosmeti..."
44853,Foundation,PHRASE,foundation,"[foundation, ny bae pro strobe cream 3 in 1 fo..."
44971,Mist,PHRASE,mist,"[plum body mist, body mist, face mist, aqualog..."


In [45]:
# This line creates a new DataFrame filtered_df by filtering rows from cleaned_phrase_keywords where the matching_targets column is empty.
filtered_df = cleaned_phrase_keywords[cleaned_phrase_keywords["matching_targets"].apply(len) == 0]

phrase_matched_keywords = cleaned_phrase_keywords.explode("matching_targets")
print("Pre Shape: ", phrase_matched_keywords.shape)
phrase_matched_keywords = phrase_matched_keywords.drop_duplicates(subset=['lower_tar_keyword', 'matching_targets'])
print("Post Shape: ", phrase_matched_keywords.shape)

Pre Shape:  (1619590, 4)
Post Shape:  (1241946, 4)


In [46]:
filtered_df.shape

(1560, 4)

In [47]:
phrase_matched_keywords.head()

Unnamed: 0,text,match_type,lower_tar_keyword,matching_targets
0,sheet mask,PHRASE,sheet mask,sheet mask
0,sheet mask,PHRASE,sheet mask,garnier sheet mask
0,sheet mask,PHRASE,sheet mask,good vibes sheet mask
0,sheet mask,PHRASE,sheet mask,sheet mask combo
0,sheet mask,PHRASE,sheet mask,lakme face sheet mask


In [48]:
# Adding request volume to phrase_matched_keywords
phrase_matched_keywords = pd.merge(phrase_matched_keywords, phrase_df[["phrase_keyword", "phrase_request", "phrase_response_cnt"]], how="inner", left_on="matching_targets", right_on="phrase_keyword")
phrase_matched_keywords = phrase_matched_keywords.drop('phrase_keyword', axis=1)
phrase_matched_keywords['phrase_response_rate'] = phrase_matched_keywords['phrase_response_cnt']*100/ phrase_matched_keywords['phrase_request']
phrase_matched_keywords.head()

Unnamed: 0,text,match_type,lower_tar_keyword,matching_targets,phrase_request,phrase_response_cnt,phrase_response_rate
0,sheet mask,PHRASE,sheet mask,sheet mask,14900,14854,99.691275
1,mask,PHRASE,mask,sheet mask,14900,14854,99.691275
2,sheet mask,PHRASE,sheet mask,garnier sheet mask,1069,406,37.97942
3,mask,PHRASE,mask,garnier sheet mask,1069,406,37.97942
4,garnier,PHRASE,garnier,garnier sheet mask,1069,406,37.97942


In [49]:
phrase_matched_keywords[phrase_matched_keywords['text']=='bathing powder']

Unnamed: 0,text,match_type,lower_tar_keyword,matching_targets,phrase_request,phrase_response_cnt,phrase_response_rate
810011,bathing powder,PHRASE,bathing powder,mamaearth bathing powder,4,0,0.0
842312,bathing powder,PHRASE,bathing powder,good vibes bathing powder,38,34,89.473684
938124,bathing powder,PHRASE,bathing powder,alps goodness bathing powder,2,2,100.0
953633,bathing powder,PHRASE,bathing powder,body bathing powder,2,0,0.0
956318,bathing powder,PHRASE,bathing powder,bathing powder body whitening,1,0,0.0
1225079,bathing powder,PHRASE,bathing powder,bathing powder,21,0,0.0
1225226,bathing powder,PHRASE,bathing powder,herbal bathing powder,15,2,13.333333
1227040,bathing powder,PHRASE,bathing powder,baby bathing powder,4,0,0.0
1227182,bathing powder,PHRASE,bathing powder,bathing powder for baby girl,4,0,0.0
1227948,bathing powder,PHRASE,bathing powder,magnum wood girl bathing powder,2,0,0.0


In [50]:
phrase_matched_keywords = phrase_matched_keywords.drop('lower_tar_keyword', axis=1)
phrase_matched_keywords.rename(columns={'matching_targets':'keyword_variation',
                                        'phrase_request':'request',
                                        'phrase_response_cnt':'response',
                                         'phrase_response_rate':'RR'}, inplace=True)

In [51]:
phrase_matched_keywords[phrase_matched_keywords['text']=='bathing powder']

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
810011,bathing powder,PHRASE,mamaearth bathing powder,4,0,0.0
842312,bathing powder,PHRASE,good vibes bathing powder,38,34,89.473684
938124,bathing powder,PHRASE,alps goodness bathing powder,2,2,100.0
953633,bathing powder,PHRASE,body bathing powder,2,0,0.0
956318,bathing powder,PHRASE,bathing powder body whitening,1,0,0.0
1225079,bathing powder,PHRASE,bathing powder,21,0,0.0
1225226,bathing powder,PHRASE,herbal bathing powder,15,2,13.333333
1227040,bathing powder,PHRASE,baby bathing powder,4,0,0.0
1227182,bathing powder,PHRASE,bathing powder for baby girl,4,0,0.0
1227948,bathing powder,PHRASE,magnum wood girl bathing powder,2,0,0.0


In [52]:
phrase_matched_keywords[phrase_matched_keywords['text']=='bathing powder']['request'].sum()

99

In [92]:
phrase_matched_keywords.to_csv("phrase_testing_163519.csv", index=False)

In [54]:
# phrase_keywords_req_df = (phrase_matched_keywords.groupby(['text', 'match_type'])
#                                 .agg({"phrase_request": "sum", "phrase_response_cnt": "sum"})
#                                 .reset_index()
#                                 .sort_values(by="phrase_request", ascending=False))

In [55]:
# phrase_keywords_req_df.head()

In [57]:
# phrase_keywords_req_df.shape

In [58]:
# phrase_keywords_req_df[phrase_keywords_req_df['text']=='heineken']

In [59]:
# # Renaming the phrase keyword request and response columns
# phrase_keywords_req_df = phrase_keywords_req_df.rename(columns={"phrase_request": "request", "phrase_response_cnt": "response"})
# phrase_keywords_req_df['RR'] = phrase_keywords_req_df['response']*100/phrase_keywords_req_df['request']

### Handling Broad match type

In [61]:
broad_keywords_df.head()

Unnamed: 0,text,match_type
4,Pilgrim Serum,BROAD
14,the Ordinary,BROAD
15,COSRX,BROAD
325,hair color,BROAD
326,garnier hair colour,BROAD


In [66]:
broad_keywords_df.shape

(7853, 3)

In [62]:
# Function for the manipulations on the targeted keywords
def targeted_keywords_manipulation(targeted_keywords) -> pd.DataFrame:
    targeted_keywords["lower_tar_keyword"] = (targeted_keywords["text"].str.lower().str.strip())
    targeted_keywords = targeted_keywords.loc[(~(targeted_keywords["lower_tar_keyword"].isna())) & (targeted_keywords["lower_tar_keyword"] != "")]    
    return targeted_keywords.lower_tar_keyword

def find_complete_broad_variations(targeted_query: str, broad_set_lst: List[str], score_cutoff=100):
    # Split the targeted query into individual words
    targeted_query_split = targeted_query.lower().split()
    targeted_query_len = len(targeted_query)

    # Filter phrases that are equal or longer than the targeted query and contain all words from the targeted query
    filtered_broad_set = [
        broad for broad in broad_set_lst
        if len(broad) >= targeted_query_len and all(word in broad.lower() for word in targeted_query_split)
    ]

    # Perform fuzzy matching and filter based on the score cutoff
    matches = process.extract(targeted_query, filtered_broad_set, scorer=fuzz.token_set_ratio, limit=None)
    score_filtered_lst = [key for key, score, _ in matches if score == score_cutoff]
    
    return score_filtered_lst

In [63]:
# #apply filter of word lenth greateer than clilent length 
# def find_complete_broad_variations(targeted_query: List[str], broad_set_lst: List[str], score_cutoff=100):
#     # Split the targeted query into individual words
#     #print("targeted_query:", targeted_query)
#     targeted_query_split = targeted_query.lower().split()
#     #print("targeted_query_split: ", targeted_query_split)
    
#     # Filter phrases that contain all words from the targeted query and perform fuzzy matching
#     filtered_broad_set = [
#         broad for broad in broad_set_lst
#         if all(word in broad.lower() for word in targeted_query_split)
#     ]
#     #print("filtered_broad_set: ", filtered_broad_set)
    
#     # Perform fuzzy matching and filter based on the score cutoff
#     matches = process.extract(targeted_query, filtered_broad_set, scorer=fuzz.token_set_ratio, limit=None)
#     score_filtered_lst = [key for key, score, _ in matches if score == score_cutoff]
    
#     return score_filtered_lst

In [67]:
cleaned_broad_keywords = targeted_keywords_manipulation(broad_keywords_df)
cleaned_broad_keywords = pd.DataFrame(broad_keywords_df) # dataset should be in dataframe
cleaned_broad_keywords.head()

Unnamed: 0,text,match_type,lower_tar_keyword
4,Pilgrim Serum,BROAD,pilgrim serum
14,the Ordinary,BROAD,the ordinary
15,COSRX,BROAD,cosrx
325,hair color,BROAD,hair color
326,garnier hair colour,BROAD,garnier hair colour


In [68]:
# Finding Sub-queries
#broad_set_2 = broad_set.head(5)
tqdm.pandas(desc="Finding the subqueries")
cleaned_broad_keywords.loc[:, "matching_targets"] = cleaned_broad_keywords["lower_tar_keyword"].progress_apply(lambda x: find_complete_broad_variations(x, phrase_set_lst))
cleaned_broad_keywords.head()

Finding the subqueries: 100%|████████████████████████████████████████████████████| 7853/7853 [2:16:21<00:00,  1.04s/it]


Unnamed: 0,text,match_type,lower_tar_keyword,matching_targets
4,Pilgrim Serum,BROAD,pilgrim serum,"[pilgrim hair growth serum, pilgrim serum, pil..."
14,the Ordinary,BROAD,the ordinary,"[the ordinary niacinamide serum, the ordinary ..."
15,COSRX,BROAD,cosrx,"[cosrx snail mucin, cosrx, cosrx advanced snai..."
325,hair color,BROAD,hair color,"[hair color, b blunt hair color, iba halal hai..."
326,garnier hair colour,BROAD,garnier hair colour,"[garnier hair colour, garnier hair colour natu..."


In [80]:
filtered_broad_df = cleaned_broad_keywords[cleaned_broad_keywords["matching_targets"].apply(len) == 0]

broad_matched_keywords = cleaned_broad_keywords.explode("matching_targets")
print("Pre Shape: ", broad_matched_keywords.shape)
broad_matched_keywords = broad_matched_keywords.drop_duplicates(subset=['lower_tar_keyword', 'matching_targets'])
print("Post Shape: ", broad_matched_keywords.shape)

Pre Shape:  (1740890, 4)
Post Shape:  (1649492, 4)


In [81]:
broad_matched_keywords.head()

Unnamed: 0,text,match_type,lower_tar_keyword,matching_targets
4,Pilgrim Serum,BROAD,pilgrim serum,pilgrim hair growth serum
4,Pilgrim Serum,BROAD,pilgrim serum,pilgrim serum
4,Pilgrim Serum,BROAD,pilgrim serum,pilgrim face serum
4,Pilgrim Serum,BROAD,pilgrim serum,pilgrim hair serum
4,Pilgrim Serum,BROAD,pilgrim serum,pilgrim 24k gold serum


In [82]:
phrase_df.head()

Unnamed: 0,phrase_keyword,phrase_request,phrase_response_cnt
496808,lipstick,213661,212057
42435,alps goodness rosemary water refill pack,98715,24434
844631,sunscreen,86398,86398
293207,face wash,76316,76004
316130,foundation,61821,61821


In [83]:
# Adding request volume to phrase_matched_keywords
broad_matched_keywords = pd.merge(broad_matched_keywords, phrase_df[["phrase_keyword", "phrase_request", "phrase_response_cnt"]], how="inner", left_on="matching_targets", right_on="phrase_keyword")
broad_matched_keywords = broad_matched_keywords.drop('phrase_keyword', axis=1)
broad_matched_keywords['phrase_response_rate'] = broad_matched_keywords['phrase_response_cnt']*100/ broad_matched_keywords['phrase_request']
broad_matched_keywords.head()

Unnamed: 0,text,match_type,lower_tar_keyword,matching_targets,phrase_request,phrase_response_cnt,phrase_response_rate
0,Pilgrim Serum,BROAD,pilgrim serum,pilgrim hair growth serum,6228,6022,96.692357
1,hair serum,BROAD,hair serum,pilgrim hair growth serum,6228,6022,96.692357
2,pilgrim hair growth serum,BROAD,pilgrim hair growth serum,pilgrim hair growth serum,6228,6022,96.692357
3,pilgrim hair serum,BROAD,pilgrim hair serum,pilgrim hair growth serum,6228,6022,96.692357
4,hair growth serum,BROAD,hair growth serum,pilgrim hair growth serum,6228,6022,96.692357


In [90]:
broad_matched_keywords[broad_matched_keywords['text']=='hair serum']

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
1,hair serum,BROAD,pilgrim hair growth serum,6228,6022,96.692357
22,hair serum,BROAD,pilgrim hair serum,460,451,98.043478
36,hair serum,BROAD,pilgrim advanced hair growth serum,216,209,96.759259
100,hair serum,BROAD,pilgrim anti grey hair serum,89,89,100.000000
235,hair serum,BROAD,pilgrim serum hair,21,21,100.000000
...,...,...,...,...,...,...
478548,hair serum,BROAD,elive hair serum,1,0,0.000000
478552,hair serum,BROAD,elosi serum hair,1,0,0.000000
478556,hair serum,BROAD,else goodness hair serum,1,1,100.000000
478561,hair serum,BROAD,nirwasha hair fall control serum,1,0,0.000000


In [85]:
broad_matched_keywords = broad_matched_keywords.drop('lower_tar_keyword', axis=1)
broad_matched_keywords.rename(columns={'matching_targets':'keyword_variation',
                                        'phrase_request':'request',
                                        'phrase_response_cnt':'response',
                                         'phrase_response_rate':'RR'}, inplace=True)

In [88]:
broad_matched_keywords[broad_matched_keywords['text']=='hair serum']['request'].sum()

140182

In [91]:
broad_matched_keywords.to_csv("broad_testing_163519.csv", index=False)

In [21]:
exact_keywords_req_df.head()

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
0,Minimalist,EXACT,0,0.0,0.0,0.0
1,Plum,EXACT,0,0.0,0.0,0.0
2,Serum,EXACT,0,0.0,0.0,0.0
3,Plum Serum,EXACT,0,0.0,0.0,0.0
4,henna powder,EXACT,henna powder,1499.0,1499.0,100.0


In [22]:
phrase_matched_keywords = pd.read_csv("phrase_testing_163519.csv")
broad_matched_keywords = pd.read_csv("broad_testing_163519.csv")

In [23]:
phrase_matched_keywords.head()

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
0,sheet mask,PHRASE,sheet mask,14900,14854,99.691275
1,mask,PHRASE,sheet mask,14900,14854,99.691275
2,sheet mask,PHRASE,garnier sheet mask,1069,406,37.97942
3,mask,PHRASE,garnier sheet mask,1069,406,37.97942
4,garnier,PHRASE,garnier sheet mask,1069,406,37.97942


### Combining all match type requests and responses in single dataframe

In [24]:
# combining all match types keywords
combined_keyword_match_df = pd.concat([exact_keywords_req_df, phrase_matched_keywords, broad_matched_keywords], axis=0, ignore_index=True)
combined_keyword_match_df.shape

(2905472, 6)

In [25]:
combined_keyword_match_df.head()

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
0,Minimalist,EXACT,0,0.0,0.0,0.0
1,Plum,EXACT,0,0.0,0.0,0.0
2,Serum,EXACT,0,0.0,0.0,0.0
3,Plum Serum,EXACT,0,0.0,0.0,0.0
4,henna powder,EXACT,henna powder,1499.0,1499.0,100.0


In [26]:
combined_keyword_match_df.isnull().sum()

text                 0
match_type           0
keyword_variation    0
request              0
response             0
RR                   0
dtype: int64

In [27]:
combined_keyword_match_df.dtypes

text                  object
match_type            object
keyword_variation     object
request              float64
response             float64
RR                   float64
dtype: object

In [29]:
combined_keyword_match_df[combined_keyword_match_df['text']=='bathing powder']

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
826095,bathing powder,PHRASE,mamaearth bathing powder,4.0,0.0,0.0
858396,bathing powder,PHRASE,good vibes bathing powder,38.0,34.0,89.473684
954208,bathing powder,PHRASE,alps goodness bathing powder,2.0,2.0,100.0
969717,bathing powder,PHRASE,body bathing powder,2.0,0.0,0.0
972402,bathing powder,PHRASE,bathing powder body whitening,1.0,0.0,0.0
1241163,bathing powder,PHRASE,bathing powder,21.0,0.0,0.0
1241310,bathing powder,PHRASE,herbal bathing powder,15.0,2.0,13.333333
1243124,bathing powder,PHRASE,baby bathing powder,4.0,0.0,0.0
1243266,bathing powder,PHRASE,bathing powder for baby girl,4.0,0.0,0.0
1244032,bathing powder,PHRASE,magnum wood girl bathing powder,2.0,0.0,0.0


In [31]:
combined_keyword_match_df.head()

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
0,Minimalist,EXACT,0,0.0,0.0,0.0
1,Plum,EXACT,0,0.0,0.0,0.0
2,Serum,EXACT,0,0.0,0.0,0.0
3,Plum Serum,EXACT,0,0.0,0.0,0.0
4,henna powder,EXACT,henna powder,1499.0,1499.0,100.0


In [32]:
cleaned_camp_key_df.head()

Unnamed: 0,keyword_id,marketing_campaign_id,text,match_type,marketplace_client_id,agency_id,is_negative,status_type,bidding_value,bidding_currency,bidding_value_usd
0,135196,375534,sheet mask,PHRASE,163519,122,False,ACTIVE,30.0,INR,0.4059
1,135197,375534,face mask,PHRASE,163519,122,False,ACTIVE,25.0,INR,0.3383
2,135198,375534,korean mask,PHRASE,163519,122,False,ACTIVE,25.0,INR,0.3383
3,135199,375534,mask,PHRASE,163519,122,False,ACTIVE,25.0,INR,0.3383
4,389279,551819,Pilgrim Serum,BROAD,163519,122,False,ACTIVE,30.0,INR,0.4059


In [34]:
combined_keyword_match_df.head()

Unnamed: 0,text,match_type,keyword_variation,request,response,RR
0,Minimalist,EXACT,0,0.0,0.0,0.0
1,Plum,EXACT,0,0.0,0.0,0.0
2,Serum,EXACT,0,0.0,0.0,0.0
3,Plum Serum,EXACT,0,0.0,0.0,0.0
4,henna powder,EXACT,henna powder,1499.0,1499.0,100.0


### Merging campaigns and its targetted keywords with request tags based on Match types

In [35]:
combined_camp_keyword_match_df = pd.merge(cleaned_camp_key_df, combined_keyword_match_df, on=['text', 'match_type'], how='left')
combined_camp_keyword_match_df.shape

(9979768, 15)

In [36]:
combined_camp_keyword_match_df.isnull().sum()

keyword_id                  0
marketing_campaign_id       0
text                        0
match_type                  0
marketplace_client_id       0
agency_id                   0
is_negative                 0
status_type                 0
bidding_value               0
bidding_currency            0
bidding_value_usd           0
keyword_variation        2337
request                  2337
response                 2337
RR                       2337
dtype: int64

In [37]:
# Need to focus on this as why this values are null one guess is these keywords has no request in last 7 days.
combined_camp_keyword_match_df[combined_camp_keyword_match_df.isnull().any(axis=1)]

Unnamed: 0,keyword_id,marketing_campaign_id,text,match_type,marketplace_client_id,agency_id,is_negative,status_type,bidding_value,bidding_currency,bidding_value_usd,keyword_variation,request,response,RR
83773,392656,554912,sonic facial cleanser,PHRASE,163519,122,False,ACTIVE,25.00,INR,0.3383,,,,
103689,392688,554919,face serum,PHRASE,163519,122,False,ACTIVE,30.00,INR,0.4059,,,,
103690,392689,554919,serums,PHRASE,163519,122,False,ACTIVE,30.00,INR,0.4059,,,,
146967,392693,554919,niacinamide serum,PHRASE,163519,122,False,ACTIVE,30.00,INR,0.4059,,,,
147170,392697,554919,minimalist serum,PHRASE,163519,122,False,ACTIVE,30.00,INR,0.4059,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9919906,1611618,792271,pigmentation lipbalm spf,PHRASE,163519,122,False,ACTIVE,7.00,INR,0.0947,,,,
9950472,1620995,771740,Insight,PHRASE,163519,122,False,ACTIVE,15.00,INR,0.2030,,,,
9954398,1621001,771740,Concealer,BROAD,163519,122,False,ACTIVE,23.00,INR,0.3112,,,,
9954437,1621020,866346,Foundation,PHRASE,163519,122,False,ACTIVE,20.00,INR,0.2706,,,,


In [38]:
combined_camp_keyword_match_df = combined_camp_keyword_match_df.fillna('0')

In [39]:
combined_camp_keyword_match_df.dtypes

keyword_id                 int64
marketing_campaign_id      int64
text                      object
match_type                object
marketplace_client_id      int64
agency_id                  int64
is_negative                 bool
status_type               object
bidding_value            float64
bidding_currency          object
bidding_value_usd        float64
keyword_variation         object
request                   object
response                  object
RR                        object
dtype: object

In [40]:
# # Function to determine the tag based on the number of requests
# def get_reach_tag(requests):
#     if requests < 10:
#         return 'Very low Reach'
#     elif 10 <= requests < 100:
#         return 'Low Reach'
#     elif 100 <= requests < 1000:
#         return 'Moderate Reach'
#     else:
#         return 'High Reach'

# tagged_camp_key_df[['request', 'response', 'RR']] = tagged_camp_key_df[['request', 'response', 'RR']].astype(int)
# tagged_camp_key_df['reach_tag'] = tagged_camp_key_df['request'].apply(get_reach_tag)

In [41]:
combined_camp_keyword_match_df.head()

Unnamed: 0,keyword_id,marketing_campaign_id,text,match_type,marketplace_client_id,agency_id,is_negative,status_type,bidding_value,bidding_currency,bidding_value_usd,keyword_variation,request,response,RR
0,135196,375534,sheet mask,PHRASE,163519,122,False,ACTIVE,30.0,INR,0.4059,sheet mask,14900.0,14854.0,99.691275
1,135196,375534,sheet mask,PHRASE,163519,122,False,ACTIVE,30.0,INR,0.4059,garnier sheet mask,1069.0,406.0,37.97942
2,135196,375534,sheet mask,PHRASE,163519,122,False,ACTIVE,30.0,INR,0.4059,good vibes sheet mask,657.0,655.0,99.695586
3,135196,375534,sheet mask,PHRASE,163519,122,False,ACTIVE,30.0,INR,0.4059,sheet mask combo,624.0,623.0,99.839744
4,135196,375534,sheet mask,PHRASE,163519,122,False,ACTIVE,30.0,INR,0.4059,lakme face sheet mask,571.0,214.0,37.478109


In [43]:
combined_camp_keyword_match_df[combined_camp_keyword_match_df['text']=='bathing powder']

Unnamed: 0,keyword_id,marketing_campaign_id,text,match_type,marketplace_client_id,agency_id,is_negative,status_type,bidding_value,bidding_currency,bidding_value_usd,keyword_variation,request,response,RR
8020584,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,mamaearth bathing powder,4.0,0.0,0.0
8020585,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,good vibes bathing powder,38.0,34.0,89.473684
8020586,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,alps goodness bathing powder,2.0,2.0,100.0
8020587,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,body bathing powder,2.0,0.0,0.0
8020588,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,bathing powder body whitening,1.0,0.0,0.0
8020589,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,bathing powder,21.0,0.0,0.0
8020590,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,herbal bathing powder,15.0,2.0,13.333333
8020591,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,baby bathing powder,4.0,0.0,0.0
8020592,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,bathing powder for baby girl,4.0,0.0,0.0
8020593,1298356,790131,bathing powder,PHRASE,163519,122,False,ACTIVE,18.48,INR,0.2501,magnum wood girl bathing powder,2.0,0.0,0.0


In [50]:
# shared_camp_key_df = tagged_camp_key_df[['agency_id', 'keyword_id', 'marketing_campaign_id', 'text', 'match_type', 'request', 'response', 'RR', 'reach_tag']]
# shared_camp_key_df.shape

(44974, 9)

In [51]:
# shared_camp_key_df[shared_camp_key_df['text']=='sunlight']

Unnamed: 0,agency_id,keyword_id,marketing_campaign_id,text,match_type,request,response,RR,reach_tag


In [None]:
combined_camp_keyword_match_df = combined_camp_keyword_match_df.drop(['marketplace_client_id', 'agency_id', 'is_negative', 'status_type', 'bidding_value', 'bidding_currency'], axis=1)

In [103]:
combined_camp_keyword_match_df.to_csv("combined_camp_keyword&variations_df_163519.csv", index=False)

### Adding campaigns category and keywords top categories(In the next Notebook)