In [1]:
import pytrends
import webbrowser, os
import time
import requests
import numpy as np
import json

import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
from pytrends.request import TrendReq

In [2]:
def get_trends(term, years, pytrends=None, delay=5):
    '''
    Return the highest Google Trends score of a search term for each year.

    term: a length 1 array containing a string to be searched
    years: an array of strings containing years to be searched; the request
        spans years[0]-01-01 through years[-1]-12-31
    pytrends: optional TrendReq-like client (needs build_payload and
        interest_over_time); when omitted, one TrendReq() is created on first
        use and reused by later calls
    delay: seconds to sleep after the request (default 5)

    returns: a list with one int per entry of years, holding the maximum score
        of the complete (non-partial) rows dated in that year, or 0 when there
        are no such rows
    '''
    if not years:
        return []

    # A default of TrendReq() in the signature is evaluated once, when the
    # function is defined. Build the shared client on first use instead.
    if pytrends is None:
        pytrends = getattr(get_trends, '_default_client', None)
        if pytrends is None:
            pytrends = get_trends._default_client = TrendReq()

    timeframe = years[0] + '-01-01 ' + years[-1] + '-12-31'
    pytrends.build_payload(term, timeframe=timeframe)
    df = pytrends.interest_over_time()

    # wait so google doesnt block me
    time.sleep(delay)

    # nothing came back for this term: report a flat zero score
    if df.empty:
        return [0 for _ in years]

    # remove incomplete entries, then keep only the score column(s)
    df = df.drop(df[df.isPartial].index)
    scores = df.drop('isPartial', axis=1)

    # Group rows by the calendar year of their index instead of cutting the
    # table into len(years) equal chunks: equal chunks raise when the row count
    # is not divisible and mislabel rows when years differ in row count.
    # assumes the index is a DatetimeIndex - TODO confirm against pytrends
    row_years = scores.index.year
    peaks = []
    for year in years:
        in_year = scores[row_years == int(year)].to_numpy()
        peaks.append(int(in_year.max()) if in_year.size else 0)
    return peaks

In [3]:
def get_dblp(term, years, session=None, timeout=30):
    '''
    Return DBLP publication hit counts of a term per year, scaled to 0-100.

    term: a length 1 array containing a string to be searched
    years: an array of strings containing years to be searched
    session: optional object with a requests-style get(url, timeout=...)
        method (e.g. a requests.Session); the requests module is used when
        omitted
    timeout: seconds to wait for each HTTP response (default 30)

    returns: a list with one int per entry of years, where the year with the
        most hits is 100; all zeros when no year has any hit
    raises: whatever raise_for_status() raises on a non-success HTTP status
    '''
    # setup query variables and result array
    api_base = 'https://dblp.org/search/publ/api'
    search_term = '?q=' + term[0].replace(' ', '+') + '+'
    http = requests if session is None else session

    hits = []
    for year in years:
        year_term = 'year:' + year + ':'
        # make request and extract total number of hits
        response = http.get(api_base + search_term + year_term, timeout=timeout)
        # fail on an HTTP error status instead of parsing an error body
        response.raise_for_status()
        total = ET.fromstring(response.text).find('./hits').attrib['total']
        hits.append(int(total))

    # normalize on 0-100 (like google trends); guard against dividing by zero
    # when there are no years or no hits at all
    max_hits = max(hits, default=0)
    if max_hits == 0:
        return [0 for _ in hits]
    return [round(count / max_hits * 100) for count in hits]

In [4]:
def scrape_data(terms, years):
    '''
    Collect the Google Trends and DBLP series of every term.

    terms: an array of length 1 arrays, each containing a search string
    years: an array of strings containing years to be searched

    returns: (trends_res, dblp_res), two lists with one entry per term, in the
        order of terms
    '''
    # per term: query Google Trends first, then DBLP, with a progress bar
    collected = [
        (get_trends(entry, years), get_dblp(entry, years))
        for entry in tqdm(terms)
    ]
    trends_res = [pair[0] for pair in collected]
    dblp_res = [pair[1] for pair in collected]
    return trends_res, dblp_res

In [5]:
def get_chart1_dataset(i, trends_data, dblp_data):
    '''
    Build the two chart-1 datasets (Google Trends and DBLP) of one term.

    i: index of the term inside trends_data and dblp_data
    trends_data: per-term sequences of Google Trends scores
    dblp_data: per-term sequences of DBLP scores

    returns: [trends dataset dict, dblp dataset dict], values cast to int
    '''
    # (label, values, border colour, fill colour) of each series, in output order
    series = (
        ("Google Trends", trends_data[i], 'rgba(255,0,0,1)', 'rgba(255,0,0,0.5)'),
        ("DBLP", dblp_data[i], 'rgba(0,0,255,1)', 'rgba(0,0,255,0.5)'),
    )
    return [
        {
            'label': label,
            'data': list(map(int, values)),
            'borderColor': border,
            'backgroundColor': background,
            'fill': True,
        }
        for label, values, border, background in series
    ]

In [6]:
def processC1(terms, years, trends_data, dblp_data):
    '''
    Build the chart-1 entries: one per term, comparing Trends and DBLP.

    terms: an array of length 1 arrays, each containing a search string
    years: an array of year strings, used as the labels of every entry
    trends_data, dblp_data: per-term score sequences, in the order of terms

    returns: a list of dicts with the keys 'name', 'labels' and 'datasets'
    '''
    return [
        {
            'name': term[0],
            'labels': years,
            'datasets': get_chart1_dataset(position, trends_data, dblp_data),
        }
        for position, term in enumerate(terms)
    ]

In [7]:
def processC2(terms, years, difference):
    '''
    Build the chart-2 entries: one per term, plotting its difference series.

    terms: an array of length 1 arrays, each containing a search string
    years: an array of year strings, used as the labels of every entry
    difference: per-term sequences of differences, in the order of terms

    returns: a list of dicts with the keys 'name', 'labels' and 'datasets'
    '''
    def build_entry(position, term):
        # one single-dataset chart for the term found at this position
        title = term[0]
        series = {
            'label': title,
            'data': [int(value) for value in difference[position]],
            'borderColor': 'rgba(255,0,255,1)',
            'backgroundColor': 'rgba(255,0,255,0.5)',
            # fill relative to the zero line: blue above it, red below it
            'fill': {
                'above': 'blue',
                'below': 'red',
                'target': {'value': 0},
            },
        }
        return {'name': title, 'labels': years, 'datasets': [series]}

    return [build_entry(position, term) for position, term in enumerate(terms)]

In [8]:
def get_chart3_dataset(terms, sum_score):
    '''
    Build the chart-3 datasets: one coloured series per term.

    terms: an array of length 1 arrays, each containing a search string
    sum_score: per-term sequences of summed scores, in the order of terms

    returns: a list of dicts with the keys 'label', 'data' (values cast to
        int) and 'backgroundColor'
    '''
    colors = ['#301A4B', '#6DB1BF', '#FFEAEC', '#F39A9D', '#3F6C51']
    return [
        {
            'label': term[0],
            'data': [int(x) for x in sum_score[i]],
            # Cycle through the whole palette. The previous modulus of
            # len(colors)-1 never reached the last colour, so the fifth term
            # got the same colour as the first.
            'backgroundColor': colors[i % len(colors)],
        }
        for i, term in enumerate(terms)
    ]

In [9]:
def processC3(terms,years, sum_score):
    '''
    Build the chart-3 data: a single chart holding one series per term.

    terms: an array of length 1 arrays, each containing a search string
    years: an array of year strings, used as the labels
    sum_score: per-term sequences of summed scores, in the order of terms

    returns: a length 1 list holding a dict with the keys 'labels' and
        'datasets'
    '''
    combined = {
        'labels': years,
        'datasets': get_chart3_dataset(terms, sum_score),
    }
    return [combined]

In [10]:
def get_colors(number_list):
    '''
    Pick one colour per number: a red shade when it is below zero, a blue
    shade otherwise.

    number_list: a sequence of numbers

    returns: a list of hex colour strings, aligned with number_list

    The numbers are visited in ascending order and each sign takes the next
    shade of its palette, wrapping around after five shades.
    '''
    # both palettes start with their strongest shade
    reds = ['#DC1C13', '#EA4C46', '#F07470', '#F1959B', '#F6BDC0']
    blues = ['#0000FF', '#1F1FFF', '#4949FF', '#7879FF', '#A3A3FF']
    used_reds = 0
    used_blues = 0
    shades = [''] * len(number_list)

    # NOTE(review): ascending order gives the most negative value the
    # strongest red, but the smallest non-negative value the strongest blue -
    # confirm that is the intended shading.
    for slot in np.argsort(number_list):
        if not number_list[slot] < 0:
            shades[slot] = blues[used_blues % len(blues)]
            used_blues += 1
            continue
        shades[slot] = reds[used_reds % len(reds)]
        used_reds += 1

    return shades

In [11]:
def processC4(terms, years, difference_yearly):
    '''
    Build the chart-4 entries: one per year, comparing all terms.

    terms: an array of length 1 arrays, each containing a search string
    years: an array of years; each is converted with str() for the labels
    difference_yearly: per-year sequences of differences, one value per term

    returns: a list of dicts with the keys 'name', 'labels' and 'datasets'
    '''
    def build_entry(position, year):
        # the differences of every term in this year, coloured by get_colors
        title = str(year)
        return {
            'name': title,
            'labels': np.asarray(terms).flatten().tolist(),
            'datasets': [{
                'label': title,
                'data': [int(value) for value in difference_yearly[position]],
                'backgroundColor': get_colors(difference_yearly[position]),
            }],
        }

    return [build_entry(position, year) for position, year in enumerate(years)]

In [12]:
def process_data(terms, years, trends_data, dblp_data):
    '''
    Turn the scraped scores into the data of the four charts.

    terms: an array of length 1 arrays, each containing a search string
    years: an array of strings containing the searched years
    trends_data, dblp_data: per-term score sequences, in the order of terms

    returns: (chart1, chart2, chart3, chart4)
    '''
    # element-wise trends - dblp and trends + dblp, per term and year
    difference = np.subtract(trends_data, dblp_data).tolist()
    sum_score = np.add(trends_data, dblp_data).tolist()
    # swap the first two axes so each row holds one year instead of one term
    difference_yearly = np.asarray(difference).swapaxes(0, 1)

    return (
        processC1(terms, years, trends_data, dblp_data),
        processC2(terms, years, difference),
        processC3(terms, years, sum_score),
        processC4(terms, years, difference_yearly),
    )

In [13]:
def write_to_json(chart1, chart2, chart3, chart4, path='html/data.js'):
    '''
    Write the four chart structures to a JavaScript file as constants.

    chart1..chart4: JSON-serializable chart data
    path: output file (default 'html/data.js'); missing parent directories
        are created

    The file holds one "const data_cN = <json>;" statement per chart, named
    data_c1 to data_c4, each JSON value indented by 4 spaces.
    '''
    # open() does not create directories, so make sure the target one exists
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    named_charts = (
        ('data_c1', chart1),
        ('data_c2', chart2),
        ('data_c3', chart3),
        ('data_c4', chart4),
    )
    with open(path, 'w', encoding='utf-8') as f:
        for name, chart in named_charts:
            f.write('const ' + name + ' = ')
            json.dump(chart, f, ensure_ascii=False, indent=4)
            f.write(';\n')

In [14]:
# Ask for the search terms. Every term is wrapped in its own length 1 list,
# the shape the functions above expect. Blank entries (e.g. from a trailing
# comma) are dropped; if nothing usable is left, the defaults are used.
user_terms = input("Enter a List of comma-separated Search Terms (empty for default):")
terms = [[x.strip()] for x in user_terms.split(',') if x.strip()]
if not terms:
    terms = [['machine learning'],['artificial intelligence'],['robotics'],['natural language processing'],['networking']]

# Ask for the years, kept as strings. Blank entries are dropped; if nothing
# usable is left, the default is 2010 through 2020.
user_years = input("Enter a List of comma-separated years (empty for default):")
years = [x.strip() for x in user_years.split(',') if x.strip()]
if not years:
    years = [str(2010+i) for i in range(11)]

Enter a List of comma-ceparated Search Terms (empty for default):
Enter a List of comma-separated years (empty for default):


In [15]:
# Announce the flat list of terms, then query Google Trends and DBLP for them.
print("Collecting Data For: " + str(np.asarray(terms).ravel().tolist()) + "...")
trends_data, dblp_data = scrape_data(terms, years)

# Convert the raw scores into the four chart structures and write them out
# as JavaScript constants.
print("Processing into Javascript...")
chart1, chart2, chart3, chart4 = process_data(terms, years, trends_data, dblp_data)
write_to_json(chart1, chart2, chart3, chart4)

Collecting Data For: ['machine learning', 'artificial intelligence', 'Robotics', 'natural language processing', 'Networking']...


  0%|          | 0/5 [00:00<?, ?it/s]

Processing into Javascript...
