In [22]:
import requests
import json
import pandas as pd

In [23]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 10)

In [24]:
# fetch api key from file

with open('secret/key.txt', 'r') as file:
    api_key = file.read()
#api_key

In [25]:
#keywords = ["swollen ankles", "horny goat weed", "scalp psoriasis"]

url = "https://google.serper.dev/search"
location = "Calgary,Alberta,Canada"
language = "en"
device = "desktop"

In [26]:
# peforms a serch on one kw and returns the Xth page of relults

def search_kw(kw, page=1, url="https://google.serper.dev/search"):
    payload = json.dumps({
      "q": kw, # keyword to search for
      "gl": "us", # location
      "hl": "en", # language
      "autocorrect": False,
      "page": page
    })

    headers = {
      "X-API-KEY": api_key,
      "Content-Type": "application/json"
    }

    response = requests.request("POST", url=url, headers=headers, data=payload)
    return(response)

In [27]:
# runs search_kw over multiple pages and returns them in a list

def get_pages(page_count, kw, url="https://google.serper.dev/search"):
    pages = list()
    for page in range(1, page_count+1):
        pages.append(search_kw(kw, page, url))
    return pages

In [28]:
# extract kw, page, rank, and link from 1 page of response info and return df

def extract_response(response):
    df = pd.DataFrame(columns = ['kw', 'page', 'rank','link'])
    q = response.json()['searchParameters']['q']
    page = response.json()['searchParameters']['page']
    for rank, result in enumerate(response.json()["organic"], 1):
        link = result['link']
        df.loc[len(df)] = [q, page, rank, link]
    return(df)

In [29]:
# takes a list of pages of response, iterates extract_response(), extracts and renumbers responses, returns df of ky, rank, link

def collate_pages(pages, max_rank=None):
    df = pd.DataFrame()
    for response in pages:
        data = extract_response(response)
        df = pd.concat([df, data], ignore_index=True)
    df = df.reset_index(drop=True)
    df["rank"] = df.index + 1
    if max_rank is not None:
        df = df[df['rank'] <= max_rank]
    df.drop(["page"], axis=1, inplace=True)
    return df

In [30]:
# takes df containing "Keywords" and iterates collate_pages() returns df of kw

def get_links_for_kws(df_in):
    df_out = pd.DataFrame()
    keywords = df_in["Keyword"]
    for kw in keywords:
        kw_pages = get_pages(3, kw) # get 3 pages (<= 30 records) for each kw
        df = collate_pages(kw_pages, 20) # colate pages, limit to top 20 records.
        df_out = pd.concat([df_out, df], ignore_index=True)
    return(df_out)

In [31]:
# load kw
df_in = pd.read_csv('data/kw_1_000.csv')
#df_in = df_in.head() # FOR TESTING PURPOSES ONLY BECAUSE I AM LIMITED BY MY QUERY QUOTA OF 1440
#df_in.iloc[400:500]

In [50]:
df_400 = df_in.iloc[0:400]
df_400_800 = df_in.iloc[400:800]
df_800_1000 = df_in.iloc[800:]

In [51]:
# ***DANGER*** GETS DATA FROM SERPER.API
# REMEMBER YOU ARE LIMITED BY A DAILY QUERY QUOTA OF 1440

#df_out_400 = get_links_for_kws(df_400) # DOES 1200 QUERIES
#df_out_400_800 = get_links_for_kws(df_400_800) # DOES 1200 QUERIES
#df_out_800_1000 = get_links_for_kws(df_800_1000) # DOES 600 QUERIES

In [46]:
# SAVE kw_first_400.csv

df_out_400.to_csv('data/kw_first_400_out.csv', index=False)

In [47]:
# SAVE kw_400_800.csv

df_out_400_800.to_csv('data/kw_400_800_out.csv', index=False)

In [60]:

df_out_800_1000.head()

Unnamed: 0,kw,rank,link
0,elderberry bush,1,https://www.fs.usda.gov/wildflowers/plant-of-t...
1,elderberry bush,2,https://www.thespruce.com/what-is-elderberry-h...
2,elderberry bush,3,https://www.thespruce.com/american-elderberry-...
3,elderberry bush,4,https://www.starkbros.com/products/berry-plant...
4,elderberry bush,5,https://gardenerspath.com/plants/fruit/best-el...


In [53]:
# SAVE kw_800_1000

df_out_800_1000.to_csv('data/kw_800_1000_out.csv', index=False)

In [71]:
files_to_combine = ['data/kw_first_400_out.csv', 'data/kw_800_1000_out.csv'] # temp version
#files_to_combine = ['data/kw_first_400_out.csv', 'data/kw_400_800_out.csv', 'data/kw_800_1000_out.csv'] # final version
files_to_combine

['data/kw_first_400_out.csv', 'data/kw_800_1000_out.csv']

In [81]:
def combine_csv_files(files_to_combine):
    column_names = ['kw', 'rank', 'link']
    #combined_df = pd.DataFrame(columns=column_names)
    combined_df = pd.DataFrame(columns=column_names)
    for file in files_to_combine:
        df = pd.read_csv(file, header=None)
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    return combined_df

In [82]:
df_final = combine_csv_files(files_to_combine)
df_final

Unnamed: 0,0,1,2
0,kw,rank,link
1,side table lamps,1,https://www.target.com/c/table-lamps-lighting-...
2,side table lamps,2,https://www.amazon.com/Side-Table-Lamps/s?k=Si...
3,side table lamps,3,https://www.wayfair.com/lighting/sb1/bedside-t...
4,side table lamps,4,https://www.ikea.com/us/en/cat/table-lamps-10732/
...,...,...,...
11987,bike bag,16,https://builtbyswift.com/
11988,bike bag,17,https://frostriver.com/collections/cycling-bik...
11989,bike bag,18,https://www.basil.com/en/bicycle-bags/
11990,bike bag,19,https://topodesigns.com/products/bike-bag


In [83]:
df_final.to_csv('data/kw_final.csv', index=False)

In [92]:
df_final = pd.read_csv('data/kw_final.csv', header=None, names=['kw', 'rank', 'link'])

In [93]:
df_final

Unnamed: 0,kw,rank,link
0,side table lamps,1,https://www.target.com/c/table-lamps-lighting-...
1,side table lamps,2,https://www.amazon.com/Side-Table-Lamps/s?k=Si...
2,side table lamps,3,https://www.wayfair.com/lighting/sb1/bedside-t...
3,side table lamps,4,https://www.ikea.com/us/en/cat/table-lamps-10732/
4,side table lamps,5,https://www.crateandbarrel.com/lighting/table-...
...,...,...,...
11985,bike bag,16,https://builtbyswift.com/
11986,bike bag,17,https://frostriver.com/collections/cycling-bik...
11987,bike bag,18,https://www.basil.com/en/bicycle-bags/
11988,bike bag,19,https://topodesigns.com/products/bike-bag


In [94]:
kw_counts = df_final['kw'].value_counts()
kw_less_than_20 = kw_counts[kw_counts < 20].index.tolist()

In [97]:
print(kw_less_than_20)

['xel 3a cardholder cases']


In [100]:
df_x = pd.DataFrame({'Keyword': ['xel 3a cardholder cases']})
df_x

Unnamed: 0,Keyword
0,xel 3a cardholder cases


In [101]:
def get_links_for_kws2(df_in):
    df_out = pd.DataFrame()
    keywords = df_in["Keyword"]
    for kw in keywords:
        kw_pages = get_pages(4, kw) # get 3 pages (<= 30 records) for each kw
        df = collate_pages(kw_pages, 20) # colate pages, limit to top 20 records.
        df_out = pd.concat([df_out, df], ignore_index=True)
    return(df_out)

In [102]:
get_links_for_kws2(df_x)

Unnamed: 0,kw,rank,link
0,xel 3a cardholder cases,1,https://www.amazon.co.jp/-/en/Notebook-Smartph...
1,xel 3a cardholder cases,2,https://www.uspto.gov/sites/default/files/trad...
2,xel 3a cardholder cases,3,https://www.infrastructurereportcard.org/wp-co...
3,xel 3a cardholder cases,4,https://www.ojp.gov/pdffiles1/Digitization/472...
4,xel 3a cardholder cases,5,http://www.stern.nyu.edu/~adamodar/pc/datasets...
5,xel 3a cardholder cases,6,https://www.walmart.com/sitemap_itp_03_3426.xm...
6,xel 3a cardholder cases,7,https://www.knightcraft.com/wp-content/uploads...
7,xel 3a cardholder cases,8,https://www.sec.gov/Archives/edgar/data/147126...
8,xel 3a cardholder cases,9,https://web.mit.edu/adamrose/Public/googlelist
9,xel 3a cardholder cases,10,https://coronatestzentrum-celle.de/army-gtrac....


In [105]:
p1 = search_kw('xel 3a cardholder cases', page=1, url="https://google.serper.dev/search")

In [107]:
p2 = search_kw('xel 3a cardholder cases', page=2, url="https://google.serper.dev/search")

In [35]:
# prints the response for 1 keyword

def print_response(response):
    print (f"Results for keyword: \"{response.json()['searchParameters']['q']}\"")
    for rank, result in enumerate(response.json()["organic"], 1):
        print(f"Rank: {rank}, Title: {result['title']}, Link: {result['link']}")

In [106]:
print_response(p1)

Results for keyword: "xel 3a cardholder cases"
Rank: 1, Title: Pixel 3a XL Sumato Case, 3aXL, Card Holder, Gift ... - Amazon.co.jp, Link: https://www.amazon.co.jp/-/en/Notebook-Smartphone-Horizontal-Function-Magnetic/dp/B081Q1HZFW
Rank: 2, Title: [DOC] Case - USPTO, Link: https://www.uspto.gov/sites/default/files/trademarks/Table_of_Fraud_Cases.doc
Rank: 3, Title: [PDF] UTAH's INFRASTRUCTURE REPORT CARD G.P.A. C+, Link: https://www.infrastructurereportcard.org/wp-content/uploads/2015/02/UTAH-REPORT-CARD-BROCHURE-2.18.15-FINAL.pdf
Rank: 4, Title: [PDF] consumer credit protection act! - Office of Justice Programs, Link: https://www.ojp.gov/pdffiles1/Digitization/47227NCJRS.pdf
Rank: 5, Title: [XLS] highpastepsgrowth.xls - NYU Stern, Link: http://www.stern.nyu.edu/~adamodar/pc/datasets/invphil/highpastepsgrowth.xls
Rank: 6, Title: Untitled, Link: https://www.walmart.com/sitemap_itp_03_3426.xml.gz
Rank: 7, Title: [PDF] PCI DSS Compliance for HP NonStop Servers - Knightcraft Technology, Lin

In [108]:
print_response(p2)

Results for keyword: "xel 3a cardholder cases"
Rank: 1, Title: Army Gtrac - Coronatestzentrum Celle, Link: https://coronatestzentrum-celle.de/army-gtrac.html
Rank: 2, Title: Untitled, Link: http://www.all-inkl.com/?sek=link&open=out&u=neyspedevamglosat.gq/143wwwall-inklcomalbm651


In [109]:
df_x2 = extract_response([p1, p2])

AttributeError: 'list' object has no attribute 'json'