In [253]:
import json
import time
import webbrowser, os
import xml.etree.ElementTree as ET

import numpy as np
import pytrends
import requests
from pytrends.request import TrendReq
from tqdm.notebook import tqdm

In [203]:
def get_trends(term, years, pytrends=None, delay=5):
    """Return the peak Google Trends score of each requested year.

    Parameters
    ----------
    term : list of str
        Keyword list handed to ``build_payload``; the notebook passes a
        length-1 list containing the string to be searched.
    years : list of str
        Years to be searched. The first and last entries bound the queried
        timeframe, so the list is expected to be in ascending order.
    pytrends : object, optional
        Client exposing ``build_payload`` and ``interest_over_time``. A new
        ``TrendReq`` is created on demand when omitted, instead of building
        one at function-definition time and sharing it between calls.
    delay : float, optional
        Seconds to sleep after the request so Google does not block the
        client. Defaults to 5.

    Returns
    -------
    list of int
        One value per entry of ``years``: the highest score among the
        complete (non-partial) rows dated in that year, or 0 when the
        response holds no complete row for it.
    """
    if not years:
        return []
    if pytrends is None:
        pytrends = TrendReq()

    timeframe = years[0] + '-01-01 ' + years[-1] + '-12-31'

    pytrends.build_payload(term, timeframe=timeframe)
    df = pytrends.interest_over_time()

    # wait so google doesnt block me
    if delay:
        time.sleep(delay)

    if df.empty:
        return [0] * len(years)

    # remove incomplete entries, then keep only the score column(s)
    complete = df[~df['isPartial'].astype(bool)].drop(columns='isPartial')

    # Bucket rows by the calendar year of their timestamp rather than cutting
    # the array into equal slices: years may hold different numbers of rows
    # once partial rows are gone.
    per_row_max = complete.max(axis=1)
    yearly_max = per_row_max.groupby(per_row_max.index.year).max().to_dict()

    return [int(yearly_max.get(int(year), 0)) for year in years]

In [204]:
def get_dblp(term, years, timeout=30):
    """Return DBLP publication hit counts per year, scaled to 0-100.

    Parameters
    ----------
    term : list of str
        A length 1 array containing a string to be searched.
    years : list of str
        An array of strings containing years to be searched.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list of int
        One value per year: the hit count as a rounded percentage of the
        largest yearly count (like Google Trends). All zeros when no year
        has any hit; empty when ``years`` is empty.

    Raises
    ------
    requests.HTTPError
        If DBLP answers with an error status.
    """
    # setup query variables
    api_base = 'https://dblp.org/search/publ/api'
    search_term = '?q=' + term[0].replace(' ', '+') + '+'

    hits = []
    for year in years:
        year_term = 'year:' + year + ':'
        # make request and extract total number of hits
        response = requests.get(api_base + search_term + year_term, timeout=timeout)
        response.raise_for_status()
        total = ET.fromstring(response.text).find('./hits').attrib['total']
        hits.append(int(total))

    # normalize on 0-100; guard the division when nothing was found
    peak = max(hits, default=0)
    if peak == 0:
        return [0] * len(hits)
    return [round(count / peak * 100) for count in hits]

In [205]:
def scrape_data(terms, years):
    """Collect the Google Trends and DBLP series for every search term.

    For each entry of ``terms`` (shown with a progress bar) ``get_trends`` is
    called first and ``get_dblp`` second, both with the same ``years``.

    Returns a ``(trends_res, dblp_res)`` tuple of lists, each holding one
    per-term result in the order of ``terms``.
    """
    # one (trends, dblp) pair per term; tuple items are evaluated left to right
    pairs = [
        (get_trends(term, years), get_dblp(term, years))
        for term in tqdm(terms)
    ]
    trends_res = [trends for trends, _ in pairs]
    dblp_res = [dblp for _, dblp in pairs]
    return trends_res, dblp_res

In [206]:
# Each search term is wrapped in its own length-1 list.
terms = [['machine learning'], ['artificial intelligence']]
# The years 2010 through 2020 inclusive, as strings.
years = [str(year) for year in range(2010, 2021)]

# Fetch both series for every term.
trends_data, dblp_data = scrape_data(terms, years)
# Element-wise Google Trends score minus DBLP score, as nested Python lists.
difference = (np.asarray(trends_data) - np.asarray(dblp_data)).tolist()

  0%|          | 0/2 [00:00<?, ?it/s]

In [269]:
# Show the raw per-term series: Google Trends on the first line, DBLP on the second.
print(trends_data, dblp_data, sep='\n')

[[9, 12, 12, 14, 18, 28, 46, 80, 91, 97, 100], [36, 44, 35, 32, 38, 40, 53, 95, 100, 97, 95]]
[[20, 8, 10, 11, 14, 18, 24, 47, 53, 79, 100], [17, 17, 17, 20, 18, 19, 19, 24, 41, 68, 100]]


In [335]:
def get_chart1_dataset(i):
    """Build the two chart-1 series for the term at position ``i``.

    Reads row ``i`` of the module-level ``trends_data`` and ``dblp_data`` and
    returns ``[trends_dset, dblp_dset]``: two dicts holding a label, the
    values cast to ``int``, a border colour, a background colour and a fill
    flag. The Google Trends series is red, the DBLP series blue.
    """
    def as_series(label, values, border, background):
        # key order matches the layout written to the data file
        return {
            'label': label,
            'data': [int(value) for value in values],
            'borderColor': border,
            'backgroundColor': background,
            'fill': True
        }

    return [
        as_series("Google Trends", trends_data[i], 'rgba(255,0,0,1)', 'rgba(255,0,0,0.5)'),
        as_series("DBLP", dblp_data[i], 'rgba(0,0,255,1)', 'rgba(0,0,255,0.5)'),
    ]

In [336]:
# Chart 1: one entry per search term, pairing its Google Trends and DBLP series.
chart1 = []

# enumerate supplies the row index that get_chart1_dataset needs,
# replacing a hand-maintained counter.
for i, term in enumerate(terms):
    data = {
        'name': term[0],
        'labels': years,
        'datasets': get_chart1_dataset(i)
    }
    chart1.append(data)

# Mode 'w' truncates html/data.js, so this cell starts the file afresh and
# writes the chart-1 payload as a JavaScript constant.
with open('html/data.js', 'w', encoding='utf-8') as f:
    f.write('const data_c1 = ')
    json.dump(chart1, f, ensure_ascii=False, indent=4)
    f.write(';\n')

In [337]:
# Chart 2: one entry per search term holding the Trends-minus-DBLP difference.
chart2 = []

# enumerate supplies the row index into `difference`,
# replacing a hand-maintained counter.
for i, term in enumerate(terms):
    data = {
        'name': term[0],
        'labels': years,
        'datasets': [{
            'label': term[0],
            'data': [int(x) for x in difference[i]],
            'borderColor': 'rgba(255,0,255,1)',
            'backgroundColor': 'rgba(255,0,255,0.5)',
            # fill relative to the zero line: blue above it, red below it
            'fill': {
                'above': 'blue',
                'below': 'red',
                'target': {'value': 0},
            },
        }],
    }
    chart2.append(data)

In [338]:
# Append the chart-2 payload to html/data.js as a JavaScript constant.
with open('html/data.js', 'a', encoding='utf-8') as f:
    f.write('const data_c2 = ' + json.dumps(chart2, ensure_ascii=False, indent=4) + ';\n')

In [339]:
# Element-wise Google Trends score plus DBLP score, as nested Python lists.
sum_score = (np.asarray(trends_data) + np.asarray(dblp_data)).tolist()

In [340]:
def get_chart3_dataset():
    """Build one chart-3 dataset per search term from the summed scores.

    Reads the module-level ``terms`` and ``sum_score``. Each dataset is a
    dict with the term as ``label``, that term's row of ``sum_score`` cast to
    ``int`` as ``data``, and a ``backgroundColor`` taken from a fixed
    five-colour palette.

    Returns
    -------
    list of dict
        Datasets in the order of ``terms``.
    """
    colors = ['#301A4B', '#6DB1BF', '#FFEAEC', '#F39A9D', '#3F6C51']
    datasets = []
    for i, term in enumerate(terms):
        datasets.append({
            'label': term[0],
            'data': [int(x) for x in sum_score[i]],
            # Cycle through the whole palette; taking the modulus by
            # len(colors) - 1 would never reach the last colour.
            'backgroundColor': colors[i % len(colors)],
        })
    return datasets

In [341]:

# Chart 3: a single entry that holds one summed-score dataset per term.
chart3 = []

data = {
    # The entry is named after the last search term. It is looked up from
    # `terms` directly so this cell does not depend on a loop variable left
    # bound by another cell.
    # NOTE(review): the chart covers every term - confirm that naming it
    # after the last one is intended.
    'name': terms[-1][0],
    'labels': years,
    'datasets': get_chart3_dataset()
}
chart3.append(data)

In [342]:
# Append the chart-3 payload to html/data.js as a JavaScript constant.
with open('html/data.js', 'a', encoding='utf-8') as f:
    f.write('const data_c3 = ' + json.dumps(chart3, ensure_ascii=False, indent=4) + ';\n')

In [332]:
# Exchange the first two axes so that the leading index selects the year
# and the second index selects the term.
trends_data_yearly = np.moveaxis(trends_data, 0, 1)
dblp_data_yearly = np.moveaxis(dblp_data, 0, 1)