In [None]:
'''Install the necessary libraries required for crawling google pages'''
!pip install urllib3
!pip install bs4

In [None]:
'''Importing necessary libraries with exception handling'''
try:
    from urllib.parse import quote
except ImportError:
    print("No module named 'quote' found")
try:
    from bs4 import BeautifulSoup
except ImportError:
    print("No module named 'bs4' found")
try:
    import time
except ImportError:
    print("No module named 'time' found")
try:
    import json
except ImportError:
    print("No module named 'json' found")
try:
    import os
except ImportError:
    print("No module named 'os' found")
try:
    import requests
except ImportError:
    print("No module named 'requests' found")

In [None]:
def search(search_term, num_results=10, lang="en", proxy=None, timeout=30):
    '''
    Run a Google web search and scrape the result links with their descriptions
    plus the Knowledge Panel blocks (when present) out of the returned HTML.

    :param search_term: contains the query string
    :param num_results: number of results to fetch (the request asks Google
                        for ``num_results + 1`` entries)
    :param lang: language code sent as the ``hl`` URL parameter
    :param proxy: optional proxy URL; it is used for both http and https targets
    :param timeout: seconds to wait for Google before ``requests`` raises a
                    timeout error (prevents the crawl from hanging forever)

    :result: dictionary {query:str, top_urls:list[dict], context:list}
             - top_urls entries are {"url": str, "description": str}
             - context holds the bs4 elements found by ``find_all`` (not plain text)
             On HTTP 429 the dictionary is returned with both lists empty.
    :raises: whatever ``response.raise_for_status()`` / ``requests.get`` raise
             for other HTTP errors, connection problems or timeouts
    '''
    ### Define the user agent for crawling the google.com
    usr_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    page_links = {"query": search_term, "top_urls": [], "context": []}

    ### Make the url to fetch search results
    escaped_search_term = quote(search_term)
    google_url = 'https://www.google.com/search?q={}&num={}&hl={}&oq={}'.format(escaped_search_term, num_results+1, lang, escaped_search_term)
    print(google_url)

    ### Route the request through the proxy, if one was given.
    # requests selects the proxy by the scheme of the *target* URL, not of the
    # proxy itself. The target here is always https, so the proxy has to be
    # registered under "https" (and "http" for completeness); otherwise an
    # "http://..." proxy would silently be bypassed.
    proxies = {"http": proxy, "https": proxy} if proxy else None

    ### Response from google search results
    response = requests.get(google_url, headers=usr_agent, proxies=proxies, timeout=timeout)
    if response.status_code == 429:
        # Rate limited: honour Retry-After when it is a usable number of
        # seconds, then give up on this query with an empty result.
        retry_after = response.headers.get("Retry-After")
        try:
            time.sleep(int(retry_after))
        except (TypeError, ValueError):
            # Header missing, given as an HTTP date, or negative: skip the wait.
            pass
        return page_links
    response.raise_for_status()
    # Fixed pause after every successful request to keep the crawl rate low.
    time.sleep(3)

    ### Define beautifulsoup object for extracting content
    soup = BeautifulSoup(response.text, 'html.parser')

    ### Div to find the Google's Knowledge Panel
    # NOTE(review): the class names / ids below come from Google's markup at the
    # time of writing and will stop matching when that markup changes.
    context_div = soup.find('div', attrs={'class': 'I6TXqe'})
    if context_div is not None:
        desc_div = context_div.find('div', attrs={'id': 'kp-wp-tab-overview'})
        if desc_div is not None:
            page_links["context"] = desc_div.find_all('div', attrs={'class': 'wDYxhc'})

    ### Parse every organic result block and store it in the dictionary
    for result in soup.find_all('div', attrs={'class': 'g'}):

        ### Parse the description (empty string when the block has none)
        desc = result.find('div', attrs={'class': 'IsZvec'})
        description = desc.text if desc is not None else ""

        ### Parse the urls; blocks without both a link and a title are skipped
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            page_links["top_urls"].append({"url": link['href'], "description": description})

    return page_links

In [None]:
### Setup the input file to read the company names and do google search
input_path = "../input/googleset12/"
set_no = 12
file_ = input_path + "set-"+ str(set_no) +".json"

### Define google search parameters
n_links = 10
language = "en"
proxy = None

### Number of leading queries to skip (the output file is opened in append
### mode, so an interrupted run can be continued by raising this value)
start_index = 0

### Check the input *file* itself: the folder may exist while the JSON file
### is missing, and it is the file that gets opened below
if os.path.isfile(file_):

    ### Load company information before touching the output file, so a broken
    ### input does not leave an empty output file behind
    print("Processing: "+file_)
    with open(file_, "r") as fp:
        data = json.load(fp)

    ### Output file for writing dict output (one str(dict) per line)
    with open("./set-"+ str(set_no) +".txt", "a") as txt_fp:

        ### For each query do google search; c_count is the 1-based position
        ### of the query in the input
        for c_count, (cin, query) in enumerate(data.items(), start=1):
            if c_count <= start_index:
                continue
            query = query.title() # apply title case (each word capitalised) to query

            ### Fetch results by google search
            results = search(query, num_results = n_links, lang = language, proxy = proxy)
            print("Count: "+str(c_count)+"\tQuery: "+ query)
            print("Top_urls: "+str(len(results['top_urls']))+"\tContext: "+str(len(results['context'])))

            ### Write dict output to file
            # NOTE(review): this is the Python repr of the dict (including the
            # scraped bs4 elements), not JSON - confirm downstream readers expect that
            txt_fp.write(str(results)+"\n")
else:
    print(file_ + " doesn't exist")

