In [138]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import re
from tqdm import tqdm
from typing import Optional, Any, List, Tuple, Dict

### High Request volume targetted keywords having less than 20 RR%

In [139]:
# Keywords request response
requests_df = pd.read_csv("395539_keywordRR_12-18July24.csv")
requests_df.shape

(413147, 6)

In [292]:
requests_df.head()

Unnamed: 0,trimmed_keyword,keywords,word_cnt,request,response,RR,lower_search_keyword
0,milk,milk,1,21312,19751,92.675488,milk
1,eggs,eggs,1,20600,19974,96.961165,eggs
2,water,water,1,13566,13219,97.442135,water
3,groundbeef,ground beef,2,12707,3,0.023609,ground beef
4,bread,bread,1,12228,11666,95.403991,bread


In [296]:
requests_df[requests_df['keywords']=='dark chocolate'].value_counts()

trimmed_keyword  keywords        word_cnt  request  response  RR         lower_search_keyword
darkchocolate    dark chocolate  2         590      60        10.169492  dark chocolate          1
dtype: int64

In [142]:
requests_df.columns

Index(['trimmed_keyword', 'keywords', 'word_cnt', 'request', 'response', 'RR'], dtype='object')

In [143]:
# Keyword campaign level data
campaign_keywords = pd.read_csv("Wakefern_keywords_in_campaign_19thJuly.tsv000", sep='\t')
campaign_keywords.shape

(20526, 10)

In [300]:
campaign_keywords.head()

Unnamed: 0,marketplace_client_id,client_id,marketing_campaign_id,text,is_negative,match_type,status_type,bidding_value,bidding_currency,bidding_value_usd
0,395539,10043595,776063,salad dressing maries,t,EXACT,PAUSED,,,
1,395539,10046486,785999,almond milk unsweetened vanilla,f,PHRASE,ACTIVE,1.0,USD,1.0
2,395539,10043810,773748,kleenex tissue,t,PHRASE,ACTIVE,,,
3,395539,10043820,773755,charmin,t,PHRASE,ACTIVE,,,
4,395539,10042466,775235,bel veeda,t,EXACT,ACTIVE,,,


In [301]:
len(campaign_keywords['text'].unique())

10370

In [309]:
campaign_keywords[campaign_keywords['marketing_campaign_id']==847661]

Unnamed: 0,marketplace_client_id,client_id,marketing_campaign_id,text,is_negative,match_type,status_type,bidding_value,bidding_currency,bidding_value_usd
4511,395539,10043598,847661,zip locks,f,EXACT,ACTIVE,1.0,USD,1.0
5990,395539,10043598,847661,90 count ziploc,f,EXACT,ACTIVE,0.7,USD,0.7
6279,395539,10043598,847661,90 ct ziploc,f,EXACT,ACTIVE,0.7,USD,0.7
8512,395539,10043598,847661,90 count ziplock bag,f,EXACT,ACTIVE,0.7,USD,0.7
9748,395539,10043598,847661,siplocks,f,EXACT,ACTIVE,0.7,USD,0.7
10066,395539,10043598,847661,90 count ziplock,f,EXACT,ACTIVE,0.7,USD,0.7
11585,395539,10043598,847661,ziploc bags,f,EXACT,ACTIVE,0.7,USD,0.7
12982,395539,10043598,847661,siplocs,f,EXACT,ACTIVE,0.7,USD,0.7
12983,395539,10043598,847661,siplock,f,EXACT,ACTIVE,0.7,USD,0.7
15128,395539,10043598,847661,ziplocs,f,EXACT,ACTIVE,0.7,USD,0.7


In [146]:
# Filtering the Data

# Step 1: Filter the data where is_negative is 'f'
filtered_df = campaign_keywords[campaign_keywords['is_negative'] == 'f']

# Step 2: Filter the data where status_type is 'ACTIVE'
filtered_df = filtered_df[filtered_df['status_type'] == 'ACTIVE']

# Step 2: Group by 'marketing_campaign_id' and 'text'
grouped_df = filtered_df.groupby(['marketing_campaign_id', 'text'], as_index=False).first()

# Step 3: Drop duplicate combinations (this is automatically handled by the groupby and first combination)
cleaned_camp_df = grouped_df.drop_duplicates(subset=['marketing_campaign_id', 'text'])

In [147]:
cleaned_camp_df.head()

Unnamed: 0,marketing_campaign_id,text,marketplace_client_id,client_id,is_negative,match_type,status_type,bidding_value,bidding_currency,bidding_value_usd
0,771022,all natural lunch meat,395539,10040730,f,EXACT,ACTIVE,0.61,USD,0.61
1,771022,charcuterie,395539,10040730,f,EXACT,ACTIVE,0.61,USD,0.61
2,771022,cold cut,395539,10040730,f,PHRASE,ACTIVE,0.79,USD,0.79
3,771022,cold cut turkey,395539,10040730,f,EXACT,ACTIVE,0.61,USD,0.61
4,771022,cold cuts,395539,10040730,f,EXACT,ACTIVE,0.82,USD,0.82


In [148]:
cleaned_camp_df.shape

(9374, 10)

In [149]:
targetted_keywords = pd.merge(cleaned_camp_df[['marketing_campaign_id', 'text']], requests_df[['keywords', 'request', 'response', 'RR']], left_on='text', right_on='keywords', how='left')
targetted_keywords = targetted_keywords.drop('keywords', axis=1)
targetted_keywords.shape

(9412, 5)

In [150]:
targetted_keywords.head()

Unnamed: 0,marketing_campaign_id,text,request,response,RR
0,771022,all natural lunch meat,,,
1,771022,charcuterie,365.0,310.0,84.931507
2,771022,cold cut,44.0,33.0,75.0
3,771022,cold cut turkey,9.0,6.0,66.666667
4,771022,cold cuts,585.0,525.0,89.74359


In [151]:
len(targetted_keywords['text'].unique()) # 638, 563

7334

In [152]:
len(cleaned_camp_df['text'].unique()) # 638, 563

7334

In [153]:
targetted_keywords.head()

Unnamed: 0,marketing_campaign_id,text,request,response,RR
0,771022,all natural lunch meat,,,
1,771022,charcuterie,365.0,310.0,84.931507
2,771022,cold cut,44.0,33.0,75.0
3,771022,cold cut turkey,9.0,6.0,66.666667
4,771022,cold cuts,585.0,525.0,89.74359


In [154]:
targetted_keywords.isnull().sum()

marketing_campaign_id       0
text                        0
request                  4286
response                 4286
RR                       4286
dtype: int64

In [155]:
targetted_keywords = targetted_keywords.fillna(0)

In [156]:
# High request volume targetted keywords
filtered_targetted_keywords = targetted_keywords[(targetted_keywords['request'] > 10) & (targetted_keywords['RR'] < 20)]
filtered_targetted_keywords.shape

(681, 5)

In [157]:
len(filtered_targetted_keywords['text'].unique())

495

In [158]:
# len(targetted_keywords[(targetted_keywords['RR'] < 40) & (targetted_keywords['request'] > 10)])

In [159]:
filtered_targetted_keywords['request'].sum()    # RR<30: 180371.0, <20:167226.0 total:4027894

257617.0

In [291]:
# filtered_targetted_keywords.to_csv("wakefern_targetted_consied_22ndJuly.csv", index=False)

In [160]:
# Fetch the list of marketing campaign ids
filtered_targetted_keywords['marketing_campaign_id'].unique()

array([771022, 771053, 771066, 771072, 771075, 771099, 771103, 771106,
       771108, 771120, 771124, 771129, 771131, 771139, 771141, 772087,
       772127, 773200, 773709, 773743, 773745, 773752, 773755, 773765,
       773767, 774212, 774221, 774228, 774236, 774680, 774681, 775216,
       775226, 775230, 775232, 775235, 775236, 775238, 775240, 775241,
       775243, 775244, 775246, 775249, 776063, 776590, 779636, 779639,
       779640, 779645, 779646, 779653, 779656, 779659, 779661, 779664,
       780120, 780202, 780223, 780239, 780240, 782726, 782744, 785844,
       785850, 785896, 785898, 785913, 785978, 785982, 785999, 786015,
       786508, 786532, 787317, 792854, 792857, 792873, 792874, 800112,
       800457, 805984, 811991, 817309, 817317, 818893, 818908, 819879,
       820477, 821392, 821399, 821403, 821404, 821476, 822668, 822724,
       830891, 830892, 830895, 830896, 835923, 837782, 847653, 847654,
       847656, 847657, 847659, 847661, 847672, 850970, 852165, 852175,
      

### Campaign-Category Mapping

In [161]:
camp_category = pd.read_csv("wakefern_campaign_SKUs_category_20thJuly.tsv000", sep='\t')
camp_category.shape

  camp_category = pd.read_csv("wakefern_campaign_SKUs_category_20thJuly.tsv000", sep='\t')


(433743, 14)

In [162]:
camp_category.head()

Unnamed: 0,marketplace_client_id,merchant_id,marketing_campaign_id,campaign_id,sku_id,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,e_name
0,395539,P&G Personal Care,852933,604710,00812154036923___523___Shoprite,Grocery,Personal Care,Bath & Body,Body Washes & Foams,,,,,"Native Sea Salt & Cedar Body Wash, 36 fl oz"
1,395539,P&G Personal Care,852933,604710,00037000800910___466___Shoprite,Grocery,Personal Care,Deodorant,Women's Deodorant,,,,,Secret Aluminum Free Real Coconut 48 Hr Deodor...
2,395539,P&G Personal Care,852933,604710,00047400528079___215___Shoprite,Grocery,Personal Care,Shaving,Women's Shave & Hair Removal,Shave Creams & Gels,,,,Gillette Satin Care Sensitive Skin Shave Gel f...
3,395539,P&G Personal Care,852933,604710,00012044056615___122___Shoprite,Grocery,Personal Care,Deodorant,Men's Deodorant,,,,,Old Spice High Endurance Pure Sport Antiperspi...
4,395539,P&G Personal Care,852933,604710,00012044037591___604___Shoprite,Grocery,Personal Care,Deodorant,Men's Deodorant,,,,,Old Spice Men's Deodorant Aluminum-Free Fiji w...


In [163]:
# marketing_df = pd.read_csv("picknpay_mark&os_campaign_ids_16thJuly.tsv000", sep='\t')
# marketing_df.rename(columns={
#     'marketing_campaign_id': 'marketing_campaign_id',
#     'campaign_id': 'os_campaign_id',
#     'client_id': 'client_id'
# }, inplace=True)

# marketing_df.head()

In [164]:
# camp_category_grouped = pd.merge(camp_category, marketing_df[['marketing_campaign_id', 'os_campaign_id']], left_on='campaign_id', right_on='os_campaign_id', how='left')
# camp_category_grouped = camp_category_grouped.drop('os_campaign_id', axis=1)
# camp_category_grouped.head()

In [165]:
#Concatination
camp_category["concat"] = camp_category[['category_l1', 'category_l2', 'category_l3', 'category_l4', 'category_l5', 'category_l6',
       'category_l7', 'category_l8']].apply(lambda row: '>'.join([str(x).strip().lower() for x in row]), axis=1)
camp_category.head()

Unnamed: 0,marketplace_client_id,merchant_id,marketing_campaign_id,campaign_id,sku_id,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,e_name,concat
0,395539,P&G Personal Care,852933,604710,00812154036923___523___Shoprite,Grocery,Personal Care,Bath & Body,Body Washes & Foams,,,,,"Native Sea Salt & Cedar Body Wash, 36 fl oz",grocery>personal care>bath & body>body washes ...
1,395539,P&G Personal Care,852933,604710,00037000800910___466___Shoprite,Grocery,Personal Care,Deodorant,Women's Deodorant,,,,,Secret Aluminum Free Real Coconut 48 Hr Deodor...,grocery>personal care>deodorant>women's deodor...
2,395539,P&G Personal Care,852933,604710,00047400528079___215___Shoprite,Grocery,Personal Care,Shaving,Women's Shave & Hair Removal,Shave Creams & Gels,,,,Gillette Satin Care Sensitive Skin Shave Gel f...,grocery>personal care>shaving>women's shave & ...
3,395539,P&G Personal Care,852933,604710,00012044056615___122___Shoprite,Grocery,Personal Care,Deodorant,Men's Deodorant,,,,,Old Spice High Endurance Pure Sport Antiperspi...,grocery>personal care>deodorant>men's deodoran...
4,395539,P&G Personal Care,852933,604710,00012044037591___604___Shoprite,Grocery,Personal Care,Deodorant,Men's Deodorant,,,,,Old Spice Men's Deodorant Aluminum-Free Fiji w...,grocery>personal care>deodorant>men's deodoran...


In [166]:
# Step 2: Group by 'marketing_campaign_id' and 'text'
camp_category_df = camp_category.groupby(['marketing_campaign_id', 'concat'], as_index=False).first()

# # Step 3: Drop duplicate combinations (this is automatically handled by the groupby and first combination)
# camp_category_df = camp_category_df.drop_duplicates(subset=['marketing_campaign_id', 'concat'])
camp_category_df.shape

(351, 15)

In [167]:
camp_category_df = camp_category_df[['marketing_campaign_id', 'concat']]
camp_category_df.head()

Unnamed: 0,marketing_campaign_id,concat
0,771022,grocery>deli>pre-sliced deli>turkey>nan>nan>na...
1,771053,grocery>deli>pre-sliced deli>italian>nan>nan>n...
2,771053,grocery>deli>salami & italian meats>salami>nan...
3,771066,grocery>deli>pre-sliced deli>italian>nan>nan>n...
4,771072,grocery>meat>bacon & breakfast meats>nan>nan>n...


### Merge campaign targetted keywords with campaign category 

In [168]:
camp_key_cat_df = pd.merge(filtered_targetted_keywords, camp_category_df, left_on='marketing_campaign_id', right_on='marketing_campaign_id', how='inner')
camp_key_cat_df.shape

(2160, 6)

In [169]:
len(filtered_targetted_keywords['marketing_campaign_id'].unique())

131

In [170]:
len(filtered_targetted_keywords['text'].unique())

495

In [171]:
len(camp_key_cat_df['marketing_campaign_id'].unique())

122

In [172]:
len(camp_key_cat_df['text'].unique())

471

In [173]:
camp_key_cat_df.head()

Unnamed: 0,marketing_campaign_id,text,request,response,RR,concat
0,771022,low sodium ham,192.0,4.0,2.083333,grocery>deli>pre-sliced deli>turkey>nan>nan>na...
1,771022,lunch meats,19.0,0.0,0.0,grocery>deli>pre-sliced deli>turkey>nan>nan>na...
2,771022,luncheon meat,43.0,0.0,0.0,grocery>deli>pre-sliced deli>turkey>nan>nan>na...
3,771022,lunchmeat deli,14.0,1.0,7.142857,grocery>deli>pre-sliced deli>turkey>nan>nan>na...
4,771022,meat,1652.0,0.0,0.0,grocery>deli>pre-sliced deli>turkey>nan>nan>na...


In [174]:
camp_key_cat_df.isnull().sum()

marketing_campaign_id    0
text                     0
request                  0
response                 0
RR                       0
concat                   0
dtype: int64

In [175]:
camp_key_cat_df[camp_key_cat_df['text']=='beef']

Unnamed: 0,marketing_campaign_id,text,request,response,RR,concat
80,771120,beef,2152.0,2.0,0.092937,grocery>meat>hot dogs>nan>nan>nan>nan>nan


In [176]:
key_cat_grouped = camp_key_cat_df.groupby(['text', 'concat'], as_index=False).first()

# Step 3: Drop duplicate combinations (this is automatically handled by the groupby and first combination)
key_cat_grouped = key_cat_grouped.drop_duplicates(subset=['text', 'concat'])

In [177]:
key_cat_grouped.head()

Unnamed: 0,text,concat,marketing_campaign_id,request,response,RR
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan,817317,15.0,1.0,6.666667
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan,817317,15.0,1.0,6.666667
2,aha,grocery>beverages>soda>cola>nan>nan>nan>nan,850970,61.0,0.0,0.0
3,aha,grocery>beverages>soda>ginger ale>nan>nan>nan>nan,850970,61.0,0.0,0.0
4,aha,grocery>beverages>soda>lemon - lime>nan>nan>na...,850970,61.0,0.0,0.0


In [178]:
key_cat_grouped.shape

(2013, 6)

In [179]:
targetted_keywords_category_mapp = key_cat_grouped[['text', 'concat']]
targetted_keywords_category_mapp.shape

(2013, 2)

In [180]:
targetted_keywords_category_mapp.head()

Unnamed: 0,text,concat
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan
2,aha,grocery>beverages>soda>cola>nan>nan>nan>nan
3,aha,grocery>beverages>soda>ginger ale>nan>nan>nan>nan
4,aha,grocery>beverages>soda>lemon - lime>nan>nan>na...


In [181]:
len(targetted_keywords_category_mapp['text'].unique())

471

In [182]:
# targetted_keywords_category_mapp.to_csv('targetted_keywords_category_mapp.csv', index=False)

In [183]:
# Adding scores to mappings

In [184]:
# Adding scores to mappings
# s3://os-search-relevancy-data/prod/keyword_category_data_v2_10008513.csv

In [185]:
# redis_mapping_df = pd.read_csv("keyword_category_data_v2_10008513_Redis.csv")
# redis_mapping_df.head()

In [186]:
# # Group by 'keywords' and find the category with the highest 'scores'
# top_categories = redis_mapping_df.loc[redis_mapping_df.groupby('keyword')['count'].idxmax()]
# top_categories.head()

In [187]:
# redis_mapping_df[redis_mapping_df['keyword']=='spin']

In [188]:
# top_categories[top_categories['keyword']=='spin']

In [189]:
# top_categories['top_score'] = top_categories['count'] - (top_categories['count']*0.25) #25% less scores is added

In [190]:
# 1988.8-(1988.8*0.20)

In [191]:
# targetted_keywords_category_mapp_V2 = pd.merge(targetted_keywords_category_mapp, top_categories[['keyword', 'top_score']], left_on='text', right_on='keyword', how='left')
# targetted_keywords_category_mapp_V2.head()

In [192]:
# targetted_keywords_category_mapp_V2[targetted_keywords_category_mapp_V2['text']=='spin']

### Extract the phrase variation of targetted keywords

In [193]:
# Function for the manipulations on the targeted keywords
def targeted_keywords_manipulation(targeted_keywords) -> pd.DataFrame:
    targeted_keywords["lower_tar_keyword"] = (targeted_keywords["text"].str.lower().str.strip())
    targeted_keywords = targeted_keywords.loc[(~(targeted_keywords["lower_tar_keyword"].isna())) & (targeted_keywords["lower_tar_keyword"] != "")]    
    return targeted_keywords.lower_tar_keyword

In [194]:
# Function for the manipulations on the search queries
def search_query_manipulation(search_queries)->Tuple[pd.DataFrame, List[str]]:
    search_queries["lower_search_keyword"] = search_queries["keywords"].str.lower().str.strip()
    search_query_df_v2 = (
        search_queries.groupby(["trimmed_keyword", "lower_search_keyword", "word_cnt"])
                                .agg({"request": "sum", "response": "sum"})
                                .reset_index()
                                .sort_values(by="request", ascending=False)
                                )

    phrase_df = search_query_df_v2.copy()
    phrase_df = phrase_df.rename(
                                columns={
                                    "lower_search_keyword": "phrase_keyword",
                                    "request": "phrase_request",
                                    "response": "phrase_response_cnt",
                                }
                                ).drop(columns=["word_cnt"])

    # Create a list with phrase candidates
    phrase_set_lst = phrase_df["phrase_keyword"].tolist()
    return phrase_df, phrase_set_lst

In [195]:
cleaned_targetted_keywords = targeted_keywords_manipulation(targetted_keywords_category_mapp)
cleaned_targetted_keywords = pd.DataFrame(cleaned_targetted_keywords) # dataset should be in dataframe
cleaned_targetted_keywords.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targeted_keywords["lower_tar_keyword"] = (targeted_keywords["text"].str.lower().str.strip())


Unnamed: 0,lower_tar_keyword
0,8 oclock coffee
1,8 oclock coffee
2,aha
3,aha
4,aha


In [196]:
phrase_df, phrase_set_lst = search_query_manipulation(requests_df)
phrase_set_lst   # This has to be in series

['milk',
 'eggs',
 'water',
 'ground beef',
 'bread',
 'ice cream',
 'butter',
 'chicken',
 'bacon',
 'watermelon',
 'cream cheese',
 'chicken breast',
 'yogurt',
 'olive oil',
 'pasta',
 'lettuce',
 'bananas',
 'coffee',
 'sour cream',
 'cereal',
 'strawberries',
 'potatoes',
 'tomatoes',
 'tide',
 'paper towels',
 'steak',
 'cheese',
 'chips',
 'hot dogs',
 'orange juice',
 'pepsi',
 'tomato',
 'american cheese',
 'toilet paper',
 'ground turkey',
 'rice',
 'shrimp',
 'salad',
 'grapes',
 'broccoli',
 'gatorade',
 'onion',
 'peanut butter',
 'cottage cheese',
 'soda',
 'apples',
 'corn',
 'tuna',
 'star wars oreo',
 'cucumber',
 'sausage',
 'avocado',
 'https://bit.ly/3jnsjeh?r=qr',
 'salmon',
 'almond milk',
 'carrots',
 'blueberries',
 'spinach',
 'coke',
 'garlic',
 'paper plates',
 'seltzer',
 'ketchup',
 'onions',
 'mayo',
 'dawn',
 'mozzarella',
 'sugar',
 'cookies',
 'rolls',
 'ham',
 'peppers',
 'turkey',
 'salsa',
 'oreo',
 'banana',
 'shredded cheese',
 'scott',
 'peaches',

In [197]:
# Function to find the phrase variations
def find_phrase_variations(targetted_keywords: List[str], search_query: List[str], score_cutoff=100):
    targeted_query_len = len(targetted_keywords)
    
    filtered_phrase_set = [query for query in search_query if targetted_keywords in query]
    
    matches = process.extract(targetted_keywords, filtered_phrase_set, scorer=fuzz.token_set_ratio, limit=None)
    score_filtered_lst = [key for key, score, _ in matches if score == score_cutoff]
    
    return score_filtered_lst

In [198]:
from tqdm import tqdm
tqdm.pandas(desc="Finding the subqueries")
cleaned_targetted_keywords.loc[:, "matching_targets"] = cleaned_targetted_keywords["lower_tar_keyword"].progress_apply(lambda x: find_phrase_variations(x, phrase_set_lst))
cleaned_targetted_keywords

Finding the subqueries: 100%|██████████████████████████████████████████████████████| 2013/2013 [03:08<00:00, 10.67it/s]


Unnamed: 0,lower_tar_keyword,matching_targets
0,8 oclock coffee,"[8 oclock coffee, 8 oclock coffee pods, 8 oclo..."
1,8 oclock coffee,"[8 oclock coffee, 8 oclock coffee pods, 8 oclo..."
2,aha,"[aha, aha seltzer, aha water, aha sparkling wa..."
3,aha,"[aha, aha seltzer, aha water, aha sparkling wa..."
4,aha,"[aha, aha seltzer, aha water, aha sparkling wa..."
...,...,...
2008,wymans,"[wymans, wymans frozen, wymans frozen fruit, f..."
2009,yellow mustard,"[yellow mustard, frenchs yellow mustard, yello..."
2010,yellow onion,"[yellow onion, organic yellow onion, diced yel..."
2011,zip locks,"[zip locks, quart zip locks, zip locks bags, z..."


In [199]:
filtered_df = cleaned_targetted_keywords[cleaned_targetted_keywords["matching_targets"].apply(len) == 0]
targeted_matched_keywords = cleaned_targetted_keywords.explode("matching_targets")
print("Pre Shape: ", targeted_matched_keywords.shape)
targeted_matched_keywords = targeted_matched_keywords.drop_duplicates(subset=['lower_tar_keyword', 'matching_targets'])
print("Post Shape: ", targeted_matched_keywords.shape)

Pre Shape:  (211920, 2)
Post Shape:  (48161, 2)


In [200]:
targeted_matched_keywords.head()

Unnamed: 0,lower_tar_keyword,matching_targets
0,8 oclock coffee,8 oclock coffee
0,8 oclock coffee,8 oclock coffee pods
0,8 oclock coffee,8 oclock coffee beans
2,aha,aha
2,aha,aha seltzer


In [201]:
targeted_matched_keywords = pd.merge(targeted_matched_keywords, phrase_df[["phrase_keyword", "phrase_request", "phrase_response_cnt"]], how="inner", left_on="matching_targets", right_on="phrase_keyword")
targeted_matched_keywords = targeted_matched_keywords.drop('phrase_keyword', axis=1)
targeted_matched_keywords['phrase_response_rate'] = targeted_matched_keywords['phrase_response_cnt']*100/ targeted_matched_keywords['phrase_request']
targeted_matched_keywords.head()
# print(
#     "Distinct targeted keywords with phrase candidates: ",
#     targeted_keyword_df_v4.lower_tar_keyword.nunique(),
# )
# targeted_keyword_df_v4.to_csv(
#     f"wakefern_phrase_variations.tsv", sep="\t", index=False)

Unnamed: 0,lower_tar_keyword,matching_targets,phrase_request,phrase_response_cnt,phrase_response_rate
0,8 oclock coffee,8 oclock coffee,15,1,6.666667
1,8 oclock coffee,8 oclock coffee pods,2,0,0.0
2,pods,8 oclock coffee pods,2,0,0.0
3,8 oclock coffee,8 oclock coffee beans,2,2,100.0
4,aha,aha,61,0,0.0


In [202]:
phrase_cat_mapping_df = pd.merge(targeted_matched_keywords, targetted_keywords_category_mapp, left_on='lower_tar_keyword', right_on='text', how='inner')
phrase_cat_mapping_df.shape

(211920, 8)

In [203]:
phrase_cat_mapping_df = phrase_cat_mapping_df.groupby(['matching_targets', 'concat'], as_index=False).first()
phrase_cat_mapping_df.head()

Unnamed: 0,matching_targets,concat,lower_tar_keyword_x,phrase_request,phrase_response_cnt,phrase_response_rate,text,lower_tar_keyword_y
0,$9.99 bowl * basket 2lb beef burgers,grocery>frozen>frozen all natural>nan>nan>nan>...,burgers,1,0,0.0,burgers,burgers
1,$9.99 bowl * basket 2lb beef burgers,grocery>frozen>frozen meat alternatives>nan>na...,burgers,1,0,0.0,burgers,burgers
2,$9.99 bowl * basket 2lb beef burgers,grocery>meat>hot dogs>nan>nan>nan>nan>nan,beef,1,0,0.0,beef,beef
3,%85 meat,grocery>breakfast & cereal>frozen breakfast>br...,meat,1,0,0.0,meat,meat
4,%85 meat,grocery>deli>pre-sliced deli>turkey>nan>nan>na...,meat,1,0,0.0,meat,meat


In [205]:
phrase_cat_mapping_df[phrase_cat_mapping_df['lower_tar_keyword_x']=='steak']

Unnamed: 0,matching_targets,concat,lower_tar_keyword_x,phrase_request,phrase_response_cnt,phrase_response_rate,text,lower_tar_keyword_y


In [206]:
targetted_keywords_category_mapp.columns

Index(['text', 'concat', 'lower_tar_keyword'], dtype='object')

In [None]:
# Handling of Branded keywords
# brand_df = pd.read_csv("brand_search_queries_395539.csv")
# brand_df.shape

In [None]:
# brand_df.head()

In [None]:
# pd.merge(phrase_cat_mapping_df, brand_df[['actual_search_query', 'extracted_brand']], left_on='matching_targets')


### Renameing and append targetted and phrase matched datasets

In [207]:
targetted_keywords_category_mapp = targetted_keywords_category_mapp.drop('lower_tar_keyword', axis=1)

In [209]:
targetted_keywords_category_mapp.columns

Index(['text', 'concat'], dtype='object')

In [213]:
phrase_cat_mapping_df.columns

Index(['text', 'concat'], dtype='object')

In [211]:
# FIlter the phrase matched keywords having RR <50%
phrase_cat_mapping_df = phrase_cat_mapping_df[phrase_cat_mapping_df['phrase_response_rate']<50]
phrase_cat_mapping_df.shape

(152132, 8)

In [212]:
phrase_cat_mapping_df = phrase_cat_mapping_df.drop(['lower_tar_keyword_x', 'phrase_request', 'phrase_response_cnt', 'phrase_response_rate', 'text','lower_tar_keyword_y'], axis=1)
phrase_cat_mapping_df = phrase_cat_mapping_df.rename(columns={'matching_targets':'text', 'concat':'concat'})

In [214]:
targetted_keyword_category_map_final =  pd.concat([targetted_keywords_category_mapp, phrase_cat_mapping_df], axis=0, ignore_index=True)
targetted_keyword_category_map_final.shape

(154145, 2)

In [215]:
targetted_keyword_category_map_final.head()

Unnamed: 0,text,concat
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan
2,aha,grocery>beverages>soda>cola>nan>nan>nan>nan
3,aha,grocery>beverages>soda>ginger ale>nan>nan>nan>nan
4,aha,grocery>beverages>soda>lemon - lime>nan>nan>na...


In [216]:
phrase_cat_mapping_df.head()

Unnamed: 0,text,concat
0,$9.99 bowl * basket 2lb beef burgers,grocery>frozen>frozen all natural>nan>nan>nan>...
1,$9.99 bowl * basket 2lb beef burgers,grocery>frozen>frozen meat alternatives>nan>na...
2,$9.99 bowl * basket 2lb beef burgers,grocery>meat>hot dogs>nan>nan>nan>nan>nan
3,%85 meat,grocery>breakfast & cereal>frozen breakfast>br...
4,%85 meat,grocery>deli>pre-sliced deli>turkey>nan>nan>na...


In [217]:
phrase_cat_mapping_df[phrase_cat_mapping_df['text']=='salmon']

Unnamed: 0,text,concat
163341,salmon,grocery>frozen>frozen meals & sides>seafood en...
163342,salmon,grocery>frozen>frozen seafood>fish>nan>nan>nan...
163343,salmon,grocery>frozen>frozen seafood>shellfish>nan>na...


In [218]:
targetted_keyword_category_map_final[targetted_keyword_category_map_final['text']=='salmon']

Unnamed: 0,text,concat
1626,salmon,grocery>frozen>frozen meals & sides>seafood en...
1627,salmon,grocery>frozen>frozen seafood>fish>nan>nan>nan...
1628,salmon,grocery>frozen>frozen seafood>shellfish>nan>na...
124335,salmon,grocery>frozen>frozen meals & sides>seafood en...
124336,salmon,grocery>frozen>frozen seafood>fish>nan>nan>nan...
124337,salmon,grocery>frozen>frozen seafood>shellfish>nan>na...


In [219]:
# Adding scores to mappings
# s3://os-search-relevancy-data/prod/keyword_category_data_v2_10008513.csv

In [221]:
redis_mapping_df = pd.read_csv("keyword_category_data_v2_395539_Redis_20thJuly.csv")
redis_mapping_df.head()

Unnamed: 0,keyword,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,source,count,tag,advertisable_sku_count
0,eggs,Grocery,Dairy,Eggs & Egg Substitutes,Fresh Eggs,,,,,auto,20256.4,advertisable_skus_present,71429672.0
1,eggs,grocery,dairy,eggs & egg substitutes,egg substitutes,,,,,manual,12886.603,advertisable_skus_present,1186.0
2,eggs,grocery,dairy,eggs & egg substitutes,other,,,,,manual,12758.009,,
3,eggs,grocery,dairy,eggs & egg substitutes,hard cooked eggs,,,,,manual,12475.422,,
4,groundbeef,Grocery,Meat,Beef,,,,,,auto,10872.0,advertisable_skus_present,497226.0


In [222]:
# Group by 'keywords' and find the category with the highest 'scores'
top_categories = redis_mapping_df.loc[redis_mapping_df.groupby('keyword')['count'].idxmax()]
top_categories.head()

Unnamed: 0,keyword,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,source,count,tag,advertisable_sku_count
76855,0,Grocery,Dairy,"Cottage Cheese, Cream Cheese & Spreads",Cottage Cheese,,,,,auto,3.0,advertisable_skus_present,5395.0
75769,0,Grocery,Pantry,Baking,Flour & Meal,,,,,auto,3.0,advertisable_skus_present,7578.0
56893,179900,Grocery,Produce,Fresh Fruit,Peaches,,,,,auto,4.0,,
77284,180207,Grocery,Pantry,Soups & Canned Goods,Canned Tuna & Seafood,Tuna,,,,auto,3.0,advertisable_skus_present,5770.0
77054,180401,Grocery,Produce,Fresh Fruit,Avocados,,,,,auto,3.0,,


In [224]:
redis_mapping_df[redis_mapping_df['keyword']=='groundbeef']

Unnamed: 0,keyword,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,source,count,tag,advertisable_sku_count
4,groundbeef,Grocery,Meat,Beef,,,,,,auto,10872.0000,advertisable_skus_present,497226.0
23,groundbeef,grocery,specialty shops,ebt eligible,meat,beef,,,,manual,7252.7290,,
31,groundbeef,grocery,deli,roast beef & other beef cuts,roast beef,,,,,manual,6408.1445,advertisable_skus_present,3484.0
33,groundbeef,grocery,frozen,frozen meat,frozen beef,,,,,manual,6380.4927,advertisable_skus_present,2121.0
36,groundbeef,grocery,specialty shops,ebt eligible,deli,roast beef & other beef cuts,,,,manual,6323.9106,advertisable_skus_present,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
81419,groundbeef,Grocery,Frozen,Frozen Seafood,Frozen Shrimp,,,,,auto,2.8000,advertisable_skus_present,22878.0
81485,groundbeef,Grocery,Beverages,Tea,Iced Tea,,,,,auto,2.8000,advertisable_skus_present,89982.0
81777,groundbeef,Grocery,Bread & Bakery,Packaged Bread,White,,,,,auto,2.8000,advertisable_skus_present,5676.0
85420,groundbeef,Grocery,Dairy,Packaged Cheese,Cheese Blends,,,,,auto,2.4000,advertisable_skus_present,17970.0


In [230]:
top_categories[top_categories['keyword']=='groundbeef']

Unnamed: 0,keyword,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,source,count,tag,advertisable_sku_count,top_score
4,groundbeef,Grocery,Meat,Beef,,,,,,auto,10872.0,advertisable_skus_present,497226.0,8154.0


In [229]:
top_categories['top_score'] = top_categories['count'] - (top_categories['count']*0.25) #25% less scores is added

In [233]:
targetted_keyword_category_map_final.head()

Unnamed: 0,text,concat,trimmed_keyword
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan,8oclockcoffee
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan,8oclockcoffee
2,aha,grocery>beverages>soda>cola>nan>nan>nan>nan,aha
3,aha,grocery>beverages>soda>ginger ale>nan>nan>nan>nan,aha
4,aha,grocery>beverages>soda>lemon - lime>nan>nan>na...,aha


In [232]:
# As redis data has trimmed keywords to merged with targetted keywords data we need to create trimmed columns in targetted data
targetted_keyword_category_map_final['trimmed_keyword'] = targetted_keyword_category_map_final['text'].str.strip().str.replace(' ', '')

In [234]:
targetted_keywords_category_score_V2 = pd.merge(targetted_keyword_category_map_final, top_categories[['keyword', 'top_score']], left_on='trimmed_keyword', right_on='keyword', how='left')
targetted_keywords_category_score_V2.head()

Unnamed: 0,text,concat,trimmed_keyword,keyword,top_score
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan,8oclockcoffee,8oclockcoffee,375.0
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan,8oclockcoffee,8oclockcoffee,375.0
2,aha,grocery>beverages>soda>cola>nan>nan>nan>nan,aha,aha,8.1
3,aha,grocery>beverages>soda>ginger ale>nan>nan>nan>nan,aha,aha,8.1
4,aha,grocery>beverages>soda>lemon - lime>nan>nan>na...,aha,aha,8.1


In [235]:
targetted_keywords_category_score_V2 = targetted_keywords_category_score_V2.drop('keyword', axis=1)

In [237]:
targetted_keywords_category_score_V2[targetted_keywords_category_score_V2['text']=='ground beef']

Unnamed: 0,text,concat,trimmed_keyword,top_score
69605,ground beef,grocery>meat>hot dogs>nan>nan>nan>nan>nan,groundbeef,8154.0


In [239]:
# targetted_keywords_category_score_V2.drop('keyword', axis=1)
targetted_keywords_category_score_V2 = targetted_keywords_category_score_V2.drop_duplicates(subset=['text', 'concat', 'trimmed_keyword', 'top_score'])

# Replace NaN values in 'top_score' column with a specific value (e.g., 300)
targetted_keywords_category_score_V2['top_score'] = targetted_keywords_category_score_V2['top_score'].fillna(500)

# Replace NaN values with empty string in the entire DataFrame, except 'top_score'
targetted_keywords_category_score_V2 = targetted_keywords_category_score_V2.fillna('')

targetted_keywords_category_score_V2.shape

(152132, 4)

In [240]:
targetted_keywords_category_score_V2.head()

Unnamed: 0,text,concat,trimmed_keyword,top_score
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan,8oclockcoffee,375.0
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan,8oclockcoffee,375.0
2,aha,grocery>beverages>soda>cola>nan>nan>nan>nan,aha,8.1
3,aha,grocery>beverages>soda>ginger ale>nan>nan>nan>nan,aha,8.1
4,aha,grocery>beverages>soda>lemon - lime>nan>nan>na...,aha,8.1


In [241]:
# Handling of Branded keywords
brand_df = pd.read_csv("brand_search_queries_395539.csv")
brand_df.shape

(579493, 3)

In [242]:
brand_df.head()

Unnamed: 0,marketplace_client_id,actual_search_query,extracted_brand
0,395539,*sargento sliced,sargento
1,395539,*kens lite,ken's
2,395539,00 serenita bella primitivo,bella
3,395539,/jolly-rancher-freezer-pops,jolly rancher
4,395539,*black bear,black bear


In [243]:
targetted_keywords_category_score_V2 = pd.merge(targetted_keywords_category_score_V2, brand_df[['actual_search_query', 'extracted_brand']], left_on='text', right_on='actual_search_query', how='left')
targetted_keywords_category_score_V2 = targetted_keywords_category_score_V2.drop('actual_search_query', axis=1)
targetted_keywords_category_score_V2.shape

(152132, 5)

In [244]:
targetted_keywords_category_score_V2.head()

Unnamed: 0,text,concat,trimmed_keyword,top_score,extracted_brand
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan,8oclockcoffee,375.0,
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan,8oclockcoffee,375.0,
2,aha,grocery>beverages>soda>cola>nan>nan>nan>nan,aha,8.1,aha
3,aha,grocery>beverages>soda>ginger ale>nan>nan>nan>nan,aha,8.1,aha
4,aha,grocery>beverages>soda>lemon - lime>nan>nan>na...,aha,8.1,aha


In [258]:
targetted_keywords_category_score_V2.isnull().sum()

text                   0
concat                 0
trimmed_keyword        0
top_score              0
extracted_brand    77235
dtype: int64

In [261]:
# Considering the non-branded keywords only
targetted_keywords_category_score_V3 = targetted_keywords_category_score_V2[targetted_keywords_category_score_V2['extracted_brand'].isna()]
targetted_keywords_category_score_V3.head()

Unnamed: 0,text,concat,trimmed_keyword,top_score,extracted_brand
0,8 oclock coffee,grocery>beverages>coffee>espresso>nan>nan>nan>nan,8oclockcoffee,375.0,
1,8 oclock coffee,grocery>beverages>coffee>ground>nan>nan>nan>nan,8oclockcoffee,375.0,
9,air fryer,grocery>frozen>frozen meat>frozen poultry>nan>...,airfryer,7.5,
10,alfredo,grocery>beverages>drink mixes>nan>nan>nan>nan>nan,alfredo,240.0,
11,alfredo,grocery>beverages>tea>black>nan>nan>nan>nan,alfredo,240.0,


In [263]:
targetted_keywords_category_score_V3.shape

(77235, 5)

In [264]:
# Delimiting the similar category column
targetted_keywords_category_score_V3[['category_l1', 'category_l2', 'category_l3','category_l4','category_l5','category_l6','category_l7','category_l8']] = targetted_keywords_category_score_V3['concat'].str.split(">", expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targetted_keywords_category_score_V3[['category_l1', 'category_l2', 'category_l3','category_l4','category_l5','category_l6','category_l7','category_l8']] = targetted_keywords_category_score_V3['concat'].str.split(">", expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targetted_keywords_category_score_V3[['category_l1', 'category_l2', 'category_l3','category_l4','category_l5','category_l6','category_l7','category_l8']] = targetted_keywords_category_score_V3['concat'].str.split(">", expand=True)
A value 

In [266]:
# Addition of column as per the given format
targetted_keywords_category_score_V3['sku_id'] = '20072024'

# Create a new column with trimmed and space-free keywords
# targetted_keywords_category_score_V2['trimmed_keyword'] = targetted_keywords_category_score_V2['text'].str.strip().str.replace(' ', '')

targetted_keywords_category_score_V3['count'] = targetted_keywords_category_score_V3['top_score']

targetted_keywords_category_score_V3['source'] = 'manual'
targetted_keywords_category_score_V3.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targetted_keywords_category_score_V3['sku_id'] = '20072024'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targetted_keywords_category_score_V3['count'] = targetted_keywords_category_score_V3['top_score']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targetted_keywords_category_score_V3['source'] 

Index(['text', 'concat', 'trimmed_keyword', 'top_score', 'extracted_brand',
       'category_l1', 'category_l2', 'category_l3', 'category_l4',
       'category_l5', 'category_l6', 'category_l7', 'category_l8', 'sku_id',
       'count', 'source'],
      dtype='object')

In [267]:
targetted_keywords_category_score_V3 = targetted_keywords_category_score_V3.drop(['concat', 'extracted_brand'], axis=1)

In [268]:
targetted_keywords_category_score_V3.head()

Unnamed: 0,text,trimmed_keyword,top_score,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,sku_id,count,source
0,8 oclock coffee,8oclockcoffee,375.0,grocery,beverages,coffee,espresso,,,,,20072024,375.0,manual
1,8 oclock coffee,8oclockcoffee,375.0,grocery,beverages,coffee,ground,,,,,20072024,375.0,manual
9,air fryer,airfryer,7.5,grocery,frozen,frozen meat,frozen poultry,,,,,20072024,7.5,manual
10,alfredo,alfredo,240.0,grocery,beverages,drink mixes,,,,,,20072024,240.0,manual
11,alfredo,alfredo,240.0,grocery,beverages,tea,black,,,,,20072024,240.0,manual


In [270]:
targetted_keywords_category_score_V3.columns

Index(['text', 'trimmed_keyword', 'top_score', 'category_l1', 'category_l2',
       'category_l3', 'category_l4', 'category_l5', 'category_l6',
       'category_l7', 'category_l8', 'sku_id', 'count', 'source'],
      dtype='object')

In [271]:
# targetted_keywords_category_score_V2['top_score'] = 
# Rearrange columns
desired_order = [
    'trimmed_keyword', 'text', 'sku_id', 'category_l1', 'category_l2', 'category_l3',
    'category_l4', 'category_l5', 'category_l6', 'category_l7', 'category_l8',
    'count', 'top_score', 'source'
]

targetted_keywords_category_score_V3 = targetted_keywords_category_score_V3[desired_order]
targetted_keywords_category_score_V3.head()

Unnamed: 0,trimmed_keyword,text,sku_id,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,count,top_score,source
0,8oclockcoffee,8 oclock coffee,20072024,grocery,beverages,coffee,espresso,,,,,375.0,375.0,manual
1,8oclockcoffee,8 oclock coffee,20072024,grocery,beverages,coffee,ground,,,,,375.0,375.0,manual
9,airfryer,air fryer,20072024,grocery,frozen,frozen meat,frozen poultry,,,,,7.5,7.5,manual
10,alfredo,alfredo,20072024,grocery,beverages,drink mixes,,,,,,240.0,240.0,manual
11,alfredo,alfredo,20072024,grocery,beverages,tea,black,,,,,240.0,240.0,manual


In [276]:
targetted_keywords_category_score_V3[targetted_keywords_category_score_V3['text']=='salmon']

Unnamed: 0,trimmed_keyword,text,sku_id,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,count,top_score,source
1626,salmon,salmon,20072024,grocery,frozen,frozen meals & sides,seafood entrees,,,,,1617.3,1617.3,manual
1627,salmon,salmon,20072024,grocery,frozen,frozen seafood,fish,,,,,1617.3,1617.3,manual
1628,salmon,salmon,20072024,grocery,frozen,frozen seafood,shellfish,,,,,1617.3,1617.3,manual


In [275]:
targetted_keywords_category_score_V3.tail()

Unnamed: 0,trimmed_keyword,text,sku_id,category_l1,category_l2,category_l3,category_l4,category_l5,category_l6,category_l7,category_l8,count,top_score,source
152127,zwanluncheonmeat,zwan luncheon meat,20072024,grocery,meat,sausages,more sausage,,,,,500.0,500.0,manual
152128,çannédchicken,çannéd chicken,20072024,grocery,dietary & lifestyle,organic,meat,,,,,500.0,500.0,manual
152129,çannédchicken,çannéd chicken,20072024,grocery,frozen,frozen meat,frozen poultry,,,,,500.0,500.0,manual
152130,çannédchicken,çannéd chicken,20072024,grocery,meat,prepared & ready to heat,,,,,,500.0,500.0,manual
152131,çannédchicken,çannéd chicken,20072024,grocery,meat,sausages,more sausage,,,,,500.0,500.0,manual


In [278]:
# targetted_keywords_category_score_V2.to_csv("targetted_and_phrase_keyword_category_mapping_10008513(utf-8).csv", index=False, encoding='utf-8')

In [279]:
# Create an Excel writer object
excel_writer = pd.ExcelWriter('Targetted_and_phrase_keyword_category_mapping_395539(utf-8).xlsx', engine='xlsxwriter')

# Write each DataFrame to a separate sheet in the same Excel file
targetted_keywords_category_score_V3.to_excel(excel_writer, sheet_name='key_cat_mapping', index=False)

# Save the Excel file
excel_writer.save()

  excel_writer.save()


In [280]:
total_keywords = pd.merge(targetted_keywords_category_score_V3, requests_df, left_on='text', right_on='keywords', how='left')
total_keywords.shape

(77294, 21)

In [282]:
print(total_keywords['request'].sum()), print(total_keywords['response'].sum())

1128193.0
85774.0


(None, None)

In [283]:
print(requests_df['request'].sum()), print(requests_df['response'].sum())

3871458
1110709


(None, None)

In [288]:
(85774.0/1128193.0)*100

7.602777184400186

In [285]:
print(f"Our Keywords Contribution in requests {(total_keywords['request'].sum())/(requests_df['request'].sum())*100} and responses {(total_keywords['response'].sum())/(requests_df['res'].sum())*100}")

Our Keywords Contribution in requests 29.141295088310397 and responses 7.722454756376333
