https://github.com/abenassi/Google-Search-API/blob/master/requirements.py

In [None]:
import os
import re
import nltk
import datetime
import numpy as np
from PIL import Image
from google import google
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords

nltk.download('stopwords')
%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Création du dossier wordcloud_images

In [None]:
# Make sure the output folder for the generated images exists.
# exist_ok=True replaces the separate existence check, which could race with
# another process creating the folder between the check and the creation.
os.makedirs('wordcloud_images', exist_ok=True)

## Functions

In [None]:
def get_search_results(keyword, num_page=1):
    """Return the description of every Google search result for *keyword*.

    Args:
        keyword: Query string handed to ``google.search``.
        num_page: Second positional argument of ``google.search``
            (presumably the number of result pages to fetch -- confirm
            against the library documentation).

    Returns:
        A list with the ``description`` attribute of each result, in the
        order the search returned them.
    """
    hits = google.search(keyword, num_page)
    return [hit.description for hit in hits]

In [None]:
def get_clean_string_from_search(keyword,
                                 num_page,
                                 additionnal_stop_words_list=None,
                                 lang="french"):
    """Search for *keyword* and return the result text as cleaned words.

    The descriptions returned by ``get_search_results`` are concatenated,
    stripped of punctuation, lower-cased and split into words. Stop words
    are then removed and the remaining words are joined with single spaces.

    Args:
        keyword: Search query. Its own words are treated as stop words so
            they do not dominate the output.
        num_page: Forwarded to ``get_search_results``.
        additionnal_stop_words_list: Extra words to drop, matched
            case-insensitively. Defaults to ``["\\xa0"]`` when omitted.
        lang: Language name passed to ``nltk.corpus.stopwords.words``.

    Returns:
        A single space-separated string of the remaining lower-case words.
    """
    # Use None as the sentinel so no mutable list is shared between calls.
    if additionnal_stop_words_list is None:
        additionnal_stop_words_list = ["\xa0"]

    # Get results of the search
    search_results = get_search_results(keyword, num_page)

    # Transform the corpus into one long string. Missing (None/empty)
    # descriptions are skipped so the join cannot fail on them.
    long_string = " ".join(text for text in search_results if text)

    # Replace punctuation by spaces, then normalise the case.
    long_string_clean = re.sub(r'[^\w\s]', ' ', long_string).lower()

    # split() without argument cuts on any run of whitespace (spaces, tabs,
    # newlines, non-breaking spaces) and never yields empty strings, so no
    # manual collapsing of repeated spaces is needed.
    search_words_list = long_string_clean.split()

    # Build the stop-word set once; the text is lower-cased, so the keyword
    # and the extra stop words must be lower-cased too in order to match.
    stop_words = set(stopwords.words(lang))
    stop_words.update(extra.lower() for extra in additionnal_stop_words_list)
    stop_words.update(keyword.lower().split())

    # Drop stop words in a single pass.
    search_words_list_stop = [word for word in search_words_list
                              if word not in stop_words]

    return " ".join(search_words_list_stop)

In [None]:
def get_file_path(keyword, num_page, folder="./wordcloud_images"):
    """Build a timestamped PNG path for the word cloud of *keyword*.

    The file name is ``<keyword with spaces as underscores>_p<num_page>_
    <month_day_year_HHhMMmSSs>.png`` and is joined onto *folder*. The
    timestamp comes from the current local time. Nothing is created on disk.
    """
    slug = keyword.replace(" ", "_")
    timestamp = datetime.datetime.now().strftime("%m_%d_%Y_%Hh%Mm%Ss")
    file_name = "{}_p{}_{}.png".format(slug, num_page, timestamp)
    return os.path.join(folder, file_name)

In [None]:
def make_wordcloud(keyword,
                   num_page,
                   mask,
                   additionnal_stop_words_list,
                   lang="french"):
    """Build a word cloud from the search results of *keyword* and save it.

    Args:
        keyword: Search query; also used to name the output file.
        num_page: Forwarded to the search and used in the output file name.
        mask: File name of a mask image located in the ``mask`` folder of
            the current working directory, or a falsy value for no mask.
        additionnal_stop_words_list: Extra words to exclude from the cloud.
        lang: Stop-word language forwarded to
            ``get_clean_string_from_search``.

    Returns:
        The generated ``WordCloud`` object. The image is also written as a
        PNG to the path returned by ``get_file_path``.
    """
    # Forward lang explicitly; otherwise the callee silently falls back to
    # its own default and this function's lang argument has no effect.
    long_string_clean = get_clean_string_from_search(keyword,
                                                     num_page,
                                                     additionnal_stop_words_list,
                                                     lang=lang)
    file_path = get_file_path(keyword, num_page)

    mask_array = None
    if mask:
        mask_path = os.path.join(os.getcwd(), "mask", mask)
        # Close the image file as soon as its pixels are copied to an array.
        with Image.open(mask_path) as mask_image:
            mask_array = np.array(mask_image)

    # Create a WordCloud object
    wordcloud = WordCloud(background_color="white",
                          max_words=5000,
                          contour_width=3,
                          contour_color='steelblue',
                          width=2400,
                          height=1200,
                          mask=mask_array)

    # Generate a word cloud
    wordcloud.generate(long_string_clean)

    # Make sure the destination folder exists before writing the image.
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the word cloud to disk
    wordcloud.to_file(file_path)

    return wordcloud

In [None]:
def plot_wordcloud(cloud):
    """Show *cloud* in a new matplotlib figure with the axes hidden.

    Args:
        cloud: Object exposing ``to_array()`` (the value returned by
            ``make_wordcloud``); the array is drawn with ``plt.imshow``.
    """
    pixels = cloud.to_array()

    # figsize=(20, 10) gives a wide 2:1 canvas for the image.
    plt.figure(figsize=(20, 10))
    plt.axis("off")
    plt.imshow(pixels)
    plt.show()

In [None]:
def display_wordcloud(word, additionnal_stop_words_list, num_page=1, mask=None,
                      lang="french"):
    """Build the word cloud for *word* and display it.

    Args:
        word: Search query, passed to ``make_wordcloud`` as ``keyword``.
        additionnal_stop_words_list: Extra words to exclude from the cloud.
        num_page: Forwarded to ``make_wordcloud``.
        mask: Optional mask image file name, forwarded to ``make_wordcloud``.
        lang: Forwarded to the ``lang`` parameter of ``make_wordcloud``.
            Defaults to ``"french"``, the same default ``make_wordcloud``
            declares, so existing calls are unaffected.
    """
    cloud = make_wordcloud(keyword=word,
                           num_page=num_page,
                           additionnal_stop_words_list=additionnal_stop_words_list,
                           mask=mask,
                           lang=lang)

    plot_wordcloud(cloud)

In [None]:
# Word cloud for the query "xebia" with num_page=2; "publicis" and "sapient"
# are excluded in addition to the non-breaking space.
display_wordcloud(word="xebia",
                 additionnal_stop_words_list=["\xa0", "publicis", "sapient"],
                 num_page=2)

In [None]:
# Word cloud for the query "publicis sapient engineering" with num_page=2;
# "xebia" is excluded in addition to the non-breaking space.
display_wordcloud(word="publicis sapient engineering",
                 additionnal_stop_words_list=["\xa0", "xebia"],
                 num_page=2)

In [None]:
# Word cloud for the query "cabinet conseil intelligence artificielle"
# with num_page=2 and no extra stop words beyond the non-breaking space.
display_wordcloud(word="cabinet conseil intelligence artificielle",
                 additionnal_stop_words_list=["\xa0"],
                 num_page=2)

In [None]:
# Word cloud for the query "reinforcement learning france" with num_page=2.
# NOTE(review): the query is mostly English but no lang is passed here, so
# the French default declared by make_wordcloud applies -- confirm intent.
display_wordcloud(word="reinforcement learning france",
                 additionnal_stop_words_list=["\xa0"],
                 num_page=2)

In [None]:
# Word cloud for the query "natural language processing en france"
# with num_page=2 and no extra stop words beyond the non-breaking space.
display_wordcloud(word="natural language processing en france",
                 additionnal_stop_words_list=["\xa0"],
                 num_page=2)