# スクレイピング関数の定義

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import itertools
import time
from tqdm import notebook as tqdm
import numpy as np
import plotly.express as px
import random

def get_html_doc(_url, timeout=30):
    """Fetch a Google search result page and return its raw body.

    Args:
        _url: Query-string fragment (for example ``"q=%22AI%22&tbs=..."``)
            that is appended after the fixed ``gl`` and ``hl`` parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body as returned by ``requests`` (``Response.content``).

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    BASE_URL = "https://www.google.com/search"

    # Country: US -> "us", Japan -> "jp", UK -> "uk", France -> "fr"
    GL_URL = "gl=jp"

    # Interface language: English -> "en", Japanese -> "ja", German -> "de", ...
    HL_URL = "hl=en"

    HEADERS = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }
    url = BASE_URL + "?" + GL_URL + "&" + HL_URL + "&" + _url
    print(url)
    # The whole body is read right away, so streaming mode is not needed.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    html_doc = response.content
    # Debug output: dump the raw page so that unexpected responses are visible.
    print(html_doc)
    return html_doc

def find_result_stats(_html_doc):
    """Return the ``<div id="result-stats">`` element of a result page.

    Args:
        _html_doc: HTML markup of the search result page.

    Returns:
        The matching element. When the page contains no such element, a
        synthetic ``<div id="result-stats">About 0 results</div>`` element is
        returned instead, so the caller can always read ``.text``.
    """
    soup = BeautifulSoup(_html_doc, "html.parser")
    tag = soup.find("div", {"id": "result-stats"})
    if tag is None:
        # Build a real element rather than returning a bare string; a string
        # has no `.text` attribute and would break callers that expect a tag.
        tag = soup.new_tag("div", id="result-stats")
        tag.string = "About 0 results"
    print("tag=")
    print(tag)
    return tag

def extract_hit_count(_text):
    """Parse the hit count out of a result-stats string.

    Accepts texts such as ``"About 1,234,000 results (0.45 seconds)"``,
    ``"1 result"`` or ``"Page 2 of about 1,230 results"``.

    Args:
        _text: The result-stats text.

    Returns:
        The hit count as an ``int`` (thousands separators removed).

    Raises:
        ValueError: If no "<number> result(s)" phrase is present.
    """
    # Capture only the comma-grouped number directly in front of
    # "result"/"results". The look-behind rejects a match that starts in the
    # middle of a number (e.g. the "234" of "1.234").
    match = re.search(r"(?<![\d.,])(\d[\d,]*)\s+results?", _text)
    if match is None:
        raise ValueError("no hit count found in {!r}".format(_text))
    return int(match.group(1).replace(",", ""))

def get_search_hit_count(keyword, year):
    """Return the number of search hits for an exact phrase within one year.

    Args:
        keyword: Phrase to search for; it is wrapped in double quotes
            (``%22``) in the query.
        year: Calendar year; the search is limited to 1/1 through 12/31 of
            that year.

    Returns:
        The hit count as an ``int``. Any failure while fetching or parsing is
        printed and reported as ``0`` (best effort).
    """
    from urllib.parse import quote_plus

    # Keyword query. quote_plus turns spaces into "+" and escapes characters
    # such as "&", "#", "+" or '"' that would otherwise corrupt the query.
    q_url = "q=%22" + quote_plus(keyword) + "%22"
    # Date range: cdr:1,cd_min:1/1/<year>,cd_max:12/31/<year> (URL-encoded)
    tbs_url = "tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F" + str(year) + "%2Ccd_max%3A12%2F31%2F" + str(year)
    try:
        html_doc = get_html_doc(q_url + "&" + tbs_url)
        stats = find_result_stats(html_doc)
        # The stats may be a parsed element or a plain string; accept both.
        stats_text = stats if isinstance(stats, str) else stats.text
        hit_count = extract_hit_count(stats_text)
    except Exception as e:
        # Deliberate best effort: one failed query must not abort a long run.
        print("例外args:", e.args)
        hit_count = 0
    return hit_count

def get_search_hit_countlist(_keyword_list, _fromyear, _toyear):
    """Collect hit counts for every keyword/year combination.

    Args:
        _keyword_list: Keywords to search for.
        _fromyear: First year of the range (inclusive).
        _toyear: Last year of the range (inclusive).

    Returns:
        A ``pandas.DataFrame`` with the columns ``keyword``, ``year`` and
        ``hit_count``, one row per combination, in iteration order.
    """
    columns = ["keyword", "year", "hit_count"]

    year_list = list(range(_fromyear, _toyear + 1))
    print(year_list)
    for_list = list(itertools.product(_keyword_list, year_list))

    # Collect rows in a list and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for keyword, year in tqdm.tqdm(for_list):
        print(keyword, year)
        hit_count = get_search_hit_count(keyword, year)
        rows.append((keyword, year, hit_count))
        # Randomised pause between requests: 20 s plus four draws of 0-9 s.
        _sleeptime = 20 + sum(random.randrange(10) for _ in range(4))
        print("wait " + str(_sleeptime) + "[sec]")
        time.sleep(_sleeptime)

    return pd.DataFrame(rows, columns=columns)

# Google ドライブのマウント

In [None]:
# Mount Google Drive into the Colab runtime
import os

from google.colab import drive

drive.mount('/content/drive')

# Path to the "Colab Notebooks" folder, relative to the working directory
_colab_dir = "./drive/MyDrive/Colab Notebooks"

# Show the current working directory
print(os.getcwd())

# List the folder's entries to confirm that it is accessible
print(os.listdir(_colab_dir))

# メイン処理

## スクレイピング関数の実行・DataFrameの取得

In [None]:
# ---------------
# Search conditions
# ---------------
# Keywords to search for
p_keyword_list = ["AI", "ビッグデータ", "ディープラーニング", "gpgpu", "cuDNN"]

# First and last year of the search range
p_from, p_to = 2015, 2020

# ---------------
# Run the scraper and collect the results as a DataFrame
# ---------------
df = get_search_hit_countlist(
    _keyword_list=p_keyword_list,
    _fromyear=p_from,
    _toyear=p_to,
)

## DataFrameの可視化

In [None]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot

# Line chart of hit counts per year, one line per keyword, on a log-scaled y axis
fig = px.line(
    df,
    x="year",
    y="hit_count",
    color="keyword",
    log_y=True,
)
fig.show()

## 可視化結果をhtmlファイルとして保存

In [None]:
# Save the figure as a timestamped HTML file in the Colab Notebooks folder
import datetime

# Do not bind this value to the name `time`: that would replace the `time`
# module imported at the top of the notebook, and the scraping loop's
# `time.sleep(...)` would fail if that cell were run again afterwards.
_now = datetime.datetime.now()
_filename = _now.strftime('%Y%m%d%H%M%S')
plot(fig, filename=_colab_dir + "/" + _filename + ".html", auto_open=False)