In [1]:
import requests
import json
import pandas as pd

In [2]:
# Notebook display limits: render at most 100 rows and 10 columns of any DataFrame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 10)

### This workbook shows the procedure for getting the rank and link for a list of keywords using the serper.dev Google Search API

In [3]:
def load_api_key(key_path: str='secret/key.txt', key_name: str='api_key') -> None:
    """Read an API key from a text file and publish it as a module-level global.

    Parameters
    ----------
    key_path : str
        Path of a text file whose whole content, minus surrounding
        whitespace, is the key.
    key_name : str
        Name of the global variable to create (coerced with ``str()``).

    Returns
    -------
    None
        The key is deliberately not returned, so calling this as the last
        expression of a cell cannot echo the secret into the notebook output.

    Raises
    ------
    OSError
        If ``key_path`` cannot be opened.
    ValueError
        If the file is empty or holds only whitespace.
    """
    key_name = str(key_name)
    with open(key_path, 'r') as f:
        key_value = f.read().strip()

    # An empty key would otherwise be installed silently and only show up
    # later as rejected API requests.
    if not key_value:
        raise ValueError(f"API key file '{key_path}' is empty")

    globals()[key_name] = key_value

    # Only the variable name is printed, never the key value.
    print(f"API key set to global varible '{key_name}'")

In [4]:
# Load the key from the local secrets file into the global `api_key`, which search_kw() reads for its X-API-KEY header.
load_api_key(key_path='secret/key.txt', key_name='api_key')

API key set to global varible 'api_key'


In [5]:
#keywords = ["swollen ankles", "horny goat weed", "scalp psoriasis"]

# Search settings.
# NOTE(review): in the code visible in this notebook none of these four variables is read:
# search_kw()/get_pages() carry their own default URL literal, and search_kw() sends
# gl="us" / hl="en" with no location or device. Confirm whether a Calgary-localised
# desktop search was intended.
url = "https://google.serper.dev/search"
location = "Calgary,Alberta,Canada"
language = "en"
device = "desktop"

In [6]:
# Performs a search on one keyword and returns the requested page of results.

def search_kw(kw, page=1, url="https://google.serper.dev/search",
              gl="us", hl="en", location=None, timeout=30):
    """POST one search query to the Serper endpoint and return the HTTP response.

    Parameters
    ----------
    kw : str
        Keyword to search for (sent as ``q``).
    page : int
        1-based results page to request.
    url : str
        Search endpoint.
    gl : str
        Value sent as ``gl``; defaults to the previously hard-coded "us".
    hl : str
        Value sent as ``hl``; defaults to the previously hard-coded "en".
    location : str or None
        When given, sent as ``location``; omitted from the payload otherwise,
        so the default request body is unchanged.
        NOTE(review): assumes the endpoint accepts a ``location`` field -
        confirm against the Serper documentation.
    timeout : float
        Seconds to wait for the server before giving up, so one stalled
        connection cannot hang a long batch of queries.

    Returns
    -------
    requests.Response
        The successful response; callers read it with ``response.json()``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of letting the
        failure surface later as a ``KeyError`` while parsing the body.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    query = {
        "q": kw,  # keyword to search for
        "gl": gl,
        "hl": hl,
        "autocorrect": False,
        "page": page,
    }
    if location is not None:
        query["location"] = location

    headers = {
        "X-API-KEY": api_key,  # module-level global set by load_api_key()
        "Content-Type": "application/json",
    }

    response = requests.post(url, headers=headers, data=json.dumps(query), timeout=timeout)
    response.raise_for_status()
    return response

In [7]:
# Prints the ranked organic results contained in the response for one keyword.

def print_response(response):
    """Print the searched keyword, then one line per organic result.

    Parameters
    ----------
    response
        Object whose ``json()`` returns a dict with ``searchParameters['q']``
        and an ``organic`` list of dicts that have ``title`` and ``link``.

    Ranks are 1-based positions within this single response.
    """
    data = response.json()  # parse the body once and reuse it
    print(f"Results for keyword: \"{data['searchParameters']['q']}\"")
    for rank, result in enumerate(data["organic"], 1):
        print(f"Rank: {rank}, Title: {result['title']}, Link: {result['link']}")

In [8]:
# Calls search_kw() for pages 1..page_count of one keyword and returns the responses in a list.

def get_pages(page_count, kw, url="https://google.serper.dev/search"):
    """Fetch the first ``page_count`` result pages for ``kw``.

    Returns a list with one search_kw() result per page, in page order
    (page 1 first). The list is empty when ``page_count`` is below 1.
    """
    last_page = page_count + 1
    return [search_kw(kw, page_number, url) for page_number in range(1, last_page)]

In [9]:
# Extracts kw, page, rank and link from one page of response data and returns them as a DataFrame.

def extract_response(response):
    """Flatten one search response into a DataFrame of ranked links.

    Parameters
    ----------
    response
        Object whose ``json()`` returns a dict with
        ``searchParameters['q']``, ``searchParameters['page']`` and,
        optionally, an ``organic`` list of dicts that have a ``link``.

    Returns
    -------
    pandas.DataFrame
        Columns ``kw``, ``page``, ``rank``, ``link``; one row per organic
        result, with ``rank`` counting from 1 within this page. The frame
        has zero rows when there are no organic results.

    Raises
    ------
    KeyError
        If ``searchParameters`` (or its ``q`` / ``page``) is missing.
    """
    data = response.json()  # parse the body once and reuse it
    params = data['searchParameters']
    q = params['q']
    page = params['page']

    # Build every row first and construct the frame once, instead of
    # enlarging it row by row.
    rows = [
        [q, page, rank, result['link']]
        for rank, result in enumerate(data.get("organic", []), 1)
    ]
    return pd.DataFrame(rows, columns=['kw', 'page', 'rank', 'link'])

In [10]:
# Takes a list of response pages, runs extract_response() on each, renumbers the ranks
# across pages and returns a DataFrame of kw, rank, link.

def collate_pages(pages, max_rank=None):
    """Merge several result pages for one keyword into one ranked table.

    Parameters
    ----------
    pages : iterable
        Responses in page order; each is passed to ``extract_response``.
    max_rank : int or None
        When given, only rows with ``rank <= max_rank`` are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``kw``, ``rank``, ``link``. ``rank`` is renumbered 1..N over
        the concatenated pages, replacing the per-page rank. An empty
        ``pages`` yields an empty frame with those three columns.
    """
    frames = [extract_response(response) for response in pages]
    if not frames:
        # pd.concat() rejects an empty list; return the same schema with no rows.
        return pd.DataFrame(columns=['kw', 'rank', 'link'])

    # One concat for all pages; ignore_index gives the 0..N-1 index used for renumbering.
    df = pd.concat(frames, ignore_index=True)
    df["rank"] = df.index + 1
    if max_rank is not None:
        df = df[df['rank'] <= max_rank]
    # Non-inplace drop: returns a new frame rather than mutating a filtered slice.
    return df.drop(columns=["page"])

In [11]:
# Takes a DataFrame with a keyword column, runs get_pages() + collate_pages() for every
# keyword and returns one combined DataFrame of the results.

def get_links_for_kws(df_in, kw_col=None, page_count=3, max_rank=20):
    """Fetch and collate the top search results for every keyword in ``df_in``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input table holding the keywords.
    kw_col : str or None
        Column to read keywords from. When None, "Keyword" is used if
        present (the name this function originally required), otherwise
        "kw" (the column name of the keyword CSV loaded in this notebook).
    page_count : int
        Result pages requested per keyword; each page costs one API query.
    max_rank : int or None
        Keep only the top ``max_rank`` results per keyword.

    Returns
    -------
    pandas.DataFrame
        The per-keyword frames from ``collate_pages`` stacked in keyword
        order; an empty DataFrame when ``df_in`` has no rows.

    Raises
    ------
    KeyError
        If no keyword column can be found.
    """
    if kw_col is None:
        for candidate in ("Keyword", "kw"):
            if candidate in df_in.columns:
                kw_col = candidate
                break
        else:
            raise KeyError("df_in has neither a 'Keyword' nor a 'kw' column")

    frames = []
    for kw in df_in[kw_col]:
        kw_pages = get_pages(page_count, kw)  # one API query per page
        frames.append(collate_pages(kw_pages, max_rank))

    if not frames:
        return pd.DataFrame()
    # Stack once at the end instead of re-copying the accumulated frame per keyword.
    return pd.concat(frames, ignore_index=True)

In [19]:
def combine_csvs_in_df(files_to_combine):
    """Read several CSV files and stack them into one DataFrame.

    Parameters
    ----------
    files_to_combine : iterable of str
        CSV paths, each with a header row; rows are stacked in this order.

    Returns
    -------
    pandas.DataFrame
        All rows with a fresh 0..N-1 index; an empty DataFrame when no
        files are given.
    """
    frames = [pd.read_csv(file, header=0) for file in files_to_combine]
    if not frames:
        return pd.DataFrame()
    # Stack once instead of re-copying the accumulated frame for every file.
    return pd.concat(frames, ignore_index=True)

In [12]:
# Load the keyword list that get_links_for_kws() will be run over.
df_in = pd.read_csv('data/kw_1_000.csv')
#df_in = df_in.head() # FOR TESTING PURPOSES ONLY BECAUSE I AM LIMITED BY MY QUERY QUOTA OF 1440
#df_in.iloc[400:500]

In [13]:
# Split the keywords into batches by row position. get_links_for_kws() requests 3 pages per
# keyword, so a 400-keyword batch costs 1200 queries and fits inside the 1440-query quota.
df_400 = df_in.iloc[0:400]
df_400_800 = df_in.iloc[400:800]
df_800_1000 = df_in.iloc[800:]

In [15]:
# Peek at the first row of the second batch to check the slice boundary and the column names.
df_400_800.head(1)

Unnamed: 0,kw,SERP features,Volume,KD,CPC,Traffic,url,uid,url_count
400,foods for inflammation,"People also ask, Sitelinks, Top stories, Thumb...",1900,76,0.4,194,https://www.webmd.com/diet/anti-inflammatory-d...,foods for inflammation*https://www.webmd.com/d...,16


In [None]:
# ***DANGER*** FETCHES DATA FROM THE SERPER.DEV API
# REMEMBER YOU ARE LIMITED BY A DAILY QUERY QUOTA OF 1440, SO RUN AT MOST ONE BATCH PER DAY
# (variable names match the df_linked_kw_* names read by the cells below)

#df_linked_kw_400 = get_links_for_kws(df_400) # DOES 1200 QUERIES
#df_linked_kw_400_800 = get_links_for_kws(df_400_800) # DOES 1200 QUERIES
#df_linked_kw_800_1000 = get_links_for_kws(df_800_1000) # DOES 600 QUERIES

In [16]:
def print_dfs_lengths(df_names):
    """Print the length of each named global object and return the sum.

    ``df_names`` is a sequence of variable names. Each name is looked up in
    this module's globals (``KeyError`` if it does not exist) and measured
    with ``len()``. All lookups happen before anything is printed.
    """
    namespace = globals()
    sizes = [len(namespace[name]) for name in df_names]

    for idx in range(len(sizes)):
        print(f"DataFrame {idx+1} ({df_names[idx]}): {sizes[idx]}")

    total_length = sum(sizes)
    print(f"Total length: {total_length}")

    return total_length

In [None]:
# Report the row count of each batch result and the total. print_dfs_lengths() resolves these
# names through globals(), so all three frames must already exist in the kernel.
df_names = ['df_linked_kw_400', 'df_linked_kw_400_800', 'df_linked_kw_800_1000']
print_dfs_lengths(df_names)

In [None]:
# Save each batch result to its own CSV (without the index) so the fetched data survives a
# kernel restart and does not have to be queried again.
df_linked_kw_400.to_csv('data/linked_kw_400.csv', index=False)
df_linked_kw_400_800.to_csv('data/linked_kw_400_800.csv', index=False)
df_linked_kw_800_1000.to_csv('data/linked_kw_800_1000.csv', index=False)

In [None]:
# Per-batch CSVs written above, listed in keyword order for combine_csvs_in_df().
files_to_combine = ['data/linked_kw_400.csv', 'data/linked_kw_400_800.csv', 'data/linked_kw_800_1000.csv']
files_to_combine

In [None]:
# Stack the three batch files into one frame and look at the last rows.
df_linked_kw_final = combine_csvs_in_df(files_to_combine)
df_linked_kw_final.tail()

In [None]:
# Save the combined result to a single CSV (without the index).
df_linked_kw_final.to_csv('data/linked_kw_final.csv', index=False)

In [20]:
# Reload the combined result from CSV, so the cells below can run without the fetch/combine cells above.
df_linked_kw_final = pd.read_csv('data/linked_kw_final.csv', header=0)

In [24]:
# Total number of (kw, rank, link) rows in the combined result.
len(df_linked_kw_final)

19990

In [21]:
# Last rows of the combined result, to confirm the final keyword reaches rank 20.
df_linked_kw_final.tail()

Unnamed: 0,kw,rank,link
19985,bike bag,16,https://builtbyswift.com/
19986,bike bag,17,https://frostriver.com/collections/cycling-bik...
19987,bike bag,18,https://www.basil.com/en/bicycle-bags/
19988,bike bag,19,https://topodesigns.com/products/bike-bag
19989,bike bag,20,https://www.duluthpack.com/collections/bike-bags


In [22]:
# Count result rows per keyword and list the keywords that have fewer than 20 rows.
kw_counts = df_linked_kw_final['kw'].value_counts()
kw_less_than_20 = kw_counts[kw_counts < 20].index.tolist()

In [23]:
# Keywords with fewer than 20 links. Only one is expected: it has just ten links, which would
# explain why the combined frame has 19990 rows rather than a multiple of 20 (TODO confirm).

print(kw_less_than_20)

['xel 3a cardholder cases']
