### This notebook returns the website URLs and their associated Google Search ranks for a list of input keywords, using the serper.dev Google Search API

In [18]:
import requests
import json
import pandas as pd
from typing import List, Optional
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 10)

### Functions

In [19]:
def load_api_key(key_path: str='secret/key.txt', key_name: str='api_key') -> None:
    """
    Reads an API key from a text file and stores it in a module-level global variable.

    Args:
        key_path (str, optional): Path of the text file holding the key. Surrounding
            whitespace (including the trailing newline) is stripped. Defaults to 'secret/key.txt'.
        key_name (str, optional): Name of the global variable to create or overwrite.
            Defaults to 'api_key'.

    Returns:
        None

    Raises:
        OSError: If the key file cannot be opened.
        ValueError: If the key file is empty or contains only whitespace.
    """
    key_name = str(key_name)
    with open(key_path, 'r', encoding='utf-8') as f:
        key_value = f.read().strip()

    # Fail here rather than installing an empty key that would only show up
    # later as a rejected API request.
    if not key_value:
        raise ValueError(f"API key file '{key_path}' is empty")

    globals()[key_name] = key_value

    print(f"API key set to global varible '{key_name}'")

In [20]:
def search_kw(kw: str, page: int = 1, url: str = "https://google.serper.dev/search", timeout: Optional[float] = 30.0) -> requests.Response:
    """
    Searches for a given keyword using the Serper API and returns the response object.

    The API key is read from the global variable `api_key`, which must have been
    set beforehand (for example with `load_api_key`).

    Args:
        kw (str): The keyword to search for.
        page (int, optional): The page number of the search results. Defaults to 1.
        url (str, optional): The serper.dev API endpoint. Defaults to "https://google.serper.dev/search".
        timeout (float, optional): Seconds to wait for the server before giving up.
            Pass None to wait indefinitely. Defaults to 30.0.

    Returns:
        requests.Response: The response object containing the search results.
        The HTTP status is not checked here; callers inspect the response.

    Raises:
        NameError: If the global `api_key` has not been set.
        requests.exceptions.Timeout: If the server does not answer within `timeout`.
    """

    payload = json.dumps({
        "q": kw,  # keyword to search for
        "gl": "us",  # location
        "hl": "en",  # language
        "autocorrect": False,
        "page": page,
    })

    headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }

    # Without a timeout a single stalled connection would block a whole
    # batch of queries indefinitely.
    return requests.post(url, headers=headers, data=payload, timeout=timeout)

In [21]:
def print_response(response: requests.Response) -> None:
    """
    Prints a human-readable listing of the organic results in a serper.dev response.

    The first line shows the searched keyword; each following line shows the
    1-based rank, title and link of one organic result, in response order.

    Args:
        response (requests.Response): The response object containing the search results.

    Returns:
        None
    """

    # Decode the body once and reuse it for both the header and the listing.
    payload = response.json()

    search_term = payload['searchParameters']['q']
    print(f"Results for keyword: \"{search_term}\"")

    for rank, entry in enumerate(payload["organic"], start=1):
        title, link = entry['title'], entry['link']
        print(f"Rank: {rank}, Title: {title}, Link: {link}")

In [22]:
def get_pages(page_count: int, kw: str, url: str = "https://google.serper.dev/search") -> List[requests.Response]:
    """
    Fetches the first `page_count` result pages for a keyword, one API query per page.

    Args:
        page_count (int): How many result pages to request, starting at page 1.
            A value below 1 results in no requests.
        kw (str): The keyword to search for.
        url (str, optional): The Serper API endpoint. Defaults to "https://google.serper.dev/search".

    Returns:
        List[requests.Response]: One response object per page, ordered by page number.
    """

    page_numbers = range(1, page_count + 1)
    return [search_kw(kw, number, url) for number in page_numbers]

In [23]:
def extract_response(response: requests.Response) -> pd.DataFrame:
    """
    Extracts the search results from a Serper API response object and returns them as a pandas DataFrame.

    Args:
        response (requests.Response): The response object containing the search results.

    Returns:
        pd.DataFrame: A DataFrame containing the search results, with columns for the search term (kw),
        page number (page), rank of the result on the page (rank, starting at 1), and the link to the
        result (link). The frame has these columns but no rows when the response lists no organic results.

    Raises:
        KeyError: If the response body has no 'searchParameters' entry, or that entry
            lacks 'q' or 'page'.
    """

    columns = ['kw', 'page', 'rank', 'link']

    # Decode the body once instead of re-parsing it for every field.
    payload = response.json()
    params = payload['searchParameters']
    q = params['q']
    page = params['page']

    # Collect plain rows first and build the frame in one go; appending to
    # a DataFrame row by row copies it on every insert.
    rows = [
        [q, page, rank, result['link']]
        for rank, result in enumerate(payload.get('organic', []), 1)
    ]
    return pd.DataFrame(rows, columns=columns)

In [24]:
def collate_pages(pages: List[requests.Response], max_rank: Optional[int] = None) -> pd.DataFrame:
    """
    Collates the search results from a list of serper.dev API response objects and returns them as a pandas DataFrame.

    The per-page ranks are replaced by one continuous rank (1, 2, 3, ...) that runs
    across all pages in the order they are given.

    Args:
        pages (List[requests.Response]): A list of response objects, where each object contains the search results for a single page.
        max_rank (int, optional): The maximum rank of search results to include in the output DataFrame.
            Defaults to None, which keeps every result.

    Returns:
        pd.DataFrame: A DataFrame containing the collated search results, with columns for the search term (kw),
        rank of the result (rank), and the link to the result (link). An empty `pages` list yields a frame
        with these columns and no rows.
    """

    frames = [extract_response(response) for response in pages]

    # With nothing to concatenate there is no 'page' column to drop, so
    # return the documented schema directly.
    if not frames:
        return pd.DataFrame(columns=["kw", "rank", "link"])

    df = pd.concat(frames, ignore_index=True)

    # ignore_index gives a fresh 0..n-1 index, so index + 1 is the overall rank.
    df["rank"] = df.index + 1
    if max_rank is not None:
        df = df[df['rank'] <= max_rank]

    # Return a new frame rather than dropping in place on a filtered slice.
    return df.drop(columns=["page"])

In [25]:
def get_links_for_kws(df_in: pd.DataFrame, page_count: int = 3, max_rank: Optional[int] = 20) -> pd.DataFrame:
    """
    Gets search results for a list of keywords and returns them as a pandas DataFrame.

    Each keyword costs `page_count` API queries (one per result page).

    Args:
        df_in (pd.DataFrame): A DataFrame containing a column of keywords under the column name "kw".
        page_count (int, optional): Number of result pages to fetch per keyword. Defaults to 3.
        max_rank (int, optional): Keep only results ranked at or above this position for each
            keyword; None keeps everything fetched. Defaults to 20.

    Returns:
        pd.DataFrame: A DataFrame containing the collated search results for each keyword, with columns for the
        search term (kw), rank of the result (rank), and the link to the result (link). When `df_in` has no
        rows, the frame has these columns and no rows.
    """

    # Gather one frame per keyword and concatenate once at the end instead
    # of re-copying the growing result on every iteration.
    frames = [
        collate_pages(get_pages(page_count, kw), max_rank)
        for kw in df_in["kw"]
    ]
    if not frames:
        return pd.DataFrame(columns=["kw", "rank", "link"])
    return pd.concat(frames, ignore_index=True)

In [26]:
def combine_csvs_in_df(files: List[str]) -> pd.DataFrame:
    """
    Combines multiple CSV files into a single DataFrame.

    Args:
        files (List[str]): A list of file paths for the CSV files to be combined. The first
            row of each file is used as its header.

    Returns:
        pd.DataFrame: A DataFrame containing the combined data from all the input CSV files, in the
        order given, with a fresh 0..n-1 index. An empty `files` list yields an empty DataFrame.
    """

    frames = [pd.read_csv(file, header=0) for file in files]

    # pd.concat rejects an empty list, so keep the empty-input result explicit.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [46]:
def print_dfs_lengths(df_names: List[str]) -> Optional[int]:
    """
    Prints the length of each DataFrame in a list of DataFrame names and returns the total length.

    The names are looked up in this module's global variables.

    Args:
        df_names (List[str]): A list of strings representing the names of the DataFrames to be printed.

    Returns:
        Optional[int]: The total length of all the DataFrames, or None (after printing a
        message that says what went wrong) when a name is not defined or an object has no length.
    """

    namespace = globals()

    # Report exactly which names are undefined instead of a generic failure.
    missing = [name for name in df_names if name not in namespace]
    if missing:
        listed = ", ".join(repr(name) for name in missing)
        print(f"There was a problem: no global variable named {listed}")
        return None

    # Get the length of each DataFrame
    try:
        lengths = [len(namespace[name]) for name in df_names]
    except TypeError as exc:
        print(f"There was a problem: {exc}")
        return None

    # Print the length of each DataFrame
    for position, (name, length) in enumerate(zip(df_names, lengths), 1):
        print(f"DataFrame {position} ({name}): {length}")

    # Calculate and print the total length of all the DataFrames
    total_length = sum(lengths)
    print(f"Total length: {total_length}")

    return total_length

### Code Execution Examples

In [28]:
load_api_key(key_path='secret/key.txt', key_name='api_key')

API key set to global varible 'api_key'


In [29]:
# load data/kw_1_000.csv

df_in = pd.read_csv('data/kw_1_000.csv')
print(f"df_in has {len(df_in)} records")
df_in.head(3)

df_in has 1000 records


Unnamed: 0,kw
0,stone for fireplace
1,country french decor
2,infant fever


In [30]:
# example of search for 1 kw and 1 page
response = search_kw(kw='space', page=1)
response

<Response [200]>

In [31]:
print_response(response)

Results for keyword: "space"
Rank: 1, Title: Space.com, Link: https://www.space.com/
Rank: 2, Title: Space news, articles and features | New Scientist, Link: https://www.newscientist.com/subject/space/
Rank: 3, Title: NASA, Link: https://www.nasa.gov/
Rank: 4, Title: Webb Image Release- Webb Space Telescope GSFC/NASA, Link: https://webb.nasa.gov/
Rank: 5, Title: SpaceX, Link: https://www.spacex.com/


In [32]:
#extract response into df

extract_response(response)

Unnamed: 0,kw,page,rank,link
0,space,1,1,https://www.space.com/
1,space,1,2,https://www.newscientist.com/subject/space/
2,space,1,3,https://www.nasa.gov/
3,space,1,4,https://webb.nasa.gov/
4,space,1,5,https://www.spacex.com/


In [33]:
response2 = get_pages(page_count=2, kw='nasa')
response2

[<Response [200]>, <Response [200]>]

In [34]:
collate_pages(response2)

Unnamed: 0,kw,rank,link
0,nasa,1,https://www.nasa.gov/
1,nasa,2,https://www.youtube.com/nasa
2,nasa,3,https://en.wikipedia.org/wiki/NASA
3,nasa,4,https://www.instagram.com/nasa/?hl=en
4,nasa,5,https://twitter.com/NASA
5,nasa,6,https://facebook.com/NASA/
6,nasa,7,https://www.linkedin.com/company/nasa
7,nasa,8,https://spacecenter.org/
8,nasa,9,https://apps.apple.com/us/app/nasa/id334325516
9,nasa,10,https://soundcloud.com/nasa


In [35]:
df_in[:1]

Unnamed: 0,kw
0,stone for fireplace


In [36]:
# attempts to get 20 links per kw

response3 = get_links_for_kws(df_in[:1])
response3

Unnamed: 0,kw,rank,link
0,stone for fireplace,1,https://www.amazon.com/Fireplace-Stone-Veneer/...
1,stone for fireplace,2,https://www.lowes.com/pl/Stone-veneer-Stone-ve...
2,stone for fireplace,3,https://stoneyard.com/choosing-the-perfect-sto...
3,stone for fireplace,4,https://www.eldoradostone.com/imagine/fireplaces/
4,stone for fireplace,5,https://www.pinterest.com/stoneyardcom/natural...
5,stone for fireplace,6,https://www.pinterest.com/buechelstone/design-...
6,stone for fireplace,7,https://www.homedepot.com/b/Flooring-Tile-Natu...
7,stone for fireplace,8,https://firesidehearth.com/products/cultured-s...
8,stone for fireplace,9,https://www.bhg.com/decorating/fireplace/style...
9,stone for fireplace,10,https://www.homedepot.com/b/Flooring-Tile-Natu...


In [37]:
df_2 = df_in.iloc[0:2]
df_linked_kw_out_2 = get_links_for_kws(df_2) # DOES 6 QUERIES (3 pages for each)
df_linked_kw_out_2

Unnamed: 0,kw,rank,link
0,stone for fireplace,1,https://www.amazon.com/Fireplace-Stone-Veneer/...
1,stone for fireplace,2,https://www.lowes.com/pl/Stone-veneer-Stone-ve...
2,stone for fireplace,3,https://stoneyard.com/choosing-the-perfect-sto...
3,stone for fireplace,4,https://www.eldoradostone.com/imagine/fireplaces/
4,stone for fireplace,5,https://www.pinterest.com/stoneyardcom/natural...
5,stone for fireplace,6,https://www.pinterest.com/buechelstone/design-...
6,stone for fireplace,7,https://genstone.com/insights/design-ideas/fau...
7,stone for fireplace,8,https://www.champlainstone.com/projects/firepl...
8,stone for fireplace,9,https://www.thespruce.com/stunning-stone-firep...
9,stone for fireplace,10,https://www.norstoneusa.com/gallery/applicatio...


### Code Execution

In [38]:
load_api_key(key_path='secret/key.txt', key_name='api_key')

API key set to global varible 'api_key'


In [39]:
# load data/kw_1_000.csv

df_in = pd.read_csv('data/kw_1_000.csv')
print(f"df_in has {len(df_in)} records")
df_in.head(3)

df_in has 1000 records


Unnamed: 0,kw
0,stone for fireplace
1,country french decor
2,infant fever


In [40]:
# Split the keywords into batches of 400 or fewer. get_links_for_kws issues
# 3 queries per keyword (one per result page), so a 400-keyword batch uses
# 1200 queries and stays under the 1440-query daily quota.
df_400 = df_in.iloc[0:400]
df_400_800 = df_in.iloc[400:800]
df_800_1000 = df_in.iloc[800:]

In [41]:
df_400_800.head()

Unnamed: 0,kw
400,foods for inflammation
401,ps2 controller
402,eating popcorn
403,benefits of not drinking alcohol
404,japanese tree


In [42]:
# ***DANGER*** GETS DATA FROM THE SERPER.DEV API
# REMEMBER YOU ARE LIMITED BY A DAILY QUERY QUOTA OF 1440, SO RUN AT MOST ONE BATCH PER DAY
# UNCOMMENT TO RUN

#df_linked_kw_400 = get_links_for_kws(df_400) # DOES 1200 QUERIES (3 pages for each)
#df_linked_kw_400_800 = get_links_for_kws(df_400_800) # DOES 1200 QUERIES (3 pages for each)
#df_linked_kw_800_1000 = get_links_for_kws(df_800_1000) # DOES 600 QUERIES (3 pages for each)

In [47]:
df_names = ['df_linked_kw_400', 'df_linked_kw_400_800', 'df_linked_kw_800_1000']
print_dfs_lengths(df_names)

There was a problem


In [None]:
# SAVE LINKED KW DFs TO CSV 
df_linked_kw_400.to_csv('data/linked_kw_400.csv', index=False)
df_linked_kw_400_800.to_csv('data/linked_kw_400_800.csv', index=False)
df_linked_kw_800_1000.to_csv('data/linked_kw_800_1000.csv', index=False)

In [48]:
files = ['data/linked_kw_400.csv', 'data/linked_kw_400_800.csv', 'data/linked_kw_800_1000.csv']
files

['data/linked_kw_400.csv',
 'data/linked_kw_400_800.csv',
 'data/linked_kw_800_1000.csv']

In [49]:
df_linked_kw_final = combine_csvs_in_df(files)
df_linked_kw_final.tail()

Unnamed: 0,kw,rank,link
19985,bike bag,16,https://builtbyswift.com/
19986,bike bag,17,https://frostriver.com/collections/cycling-bik...
19987,bike bag,18,https://www.basil.com/en/bicycle-bags/
19988,bike bag,19,https://topodesigns.com/products/bike-bag
19989,bike bag,20,https://www.duluthpack.com/collections/bike-bags


In [50]:
# save to csv
df_linked_kw_final.to_csv('data/linked_kw_final.csv', index=False)

In [51]:
# reload from csv
df_linked_kw_final = pd.read_csv('data/linked_kw_final.csv', header=0)
print(f"df_linked_kw_final has {len(df_linked_kw_final)} records")
df_linked_kw_final.head(3)

df_linked_kw_final has 19990 records


Unnamed: 0,kw,rank,link
0,side table lamps,1,https://www.target.com/c/table-lamps-lighting-...
1,side table lamps,2,https://www.amazon.com/Side-Table-Lamps/s?k=Si...
2,side table lamps,3,https://www.wayfair.com/lighting/sb1/bedside-t...


In [52]:
len(df_linked_kw_final)

19990

In [53]:
df_linked_kw_final.tail()

Unnamed: 0,kw,rank,link
19985,bike bag,16,https://builtbyswift.com/
19986,bike bag,17,https://frostriver.com/collections/cycling-bik...
19987,bike bag,18,https://www.basil.com/en/bicycle-bags/
19988,bike bag,19,https://topodesigns.com/products/bike-bag
19989,bike bag,20,https://www.duluthpack.com/collections/bike-bags


In [54]:
# Count the result rows stored for each keyword, then list the keywords
# that ended up with fewer than 20 rows.
kw_counts = df_linked_kw_final['kw'].value_counts()
kw_less_than_20 = kw_counts[kw_counts < 20].index.tolist()

In [55]:
# this is the one with only ten links

print(kw_less_than_20)

['xel 3a cardholder cases']
