
Commit

Add information from Google Analytics to the index.
satamas committed Aug 4, 2017
1 parent 17b8a6f commit 25dc1a0
Showing 4 changed files with 83 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -33,3 +33,4 @@ grammar.xml
pdf/kotlin-docs.pdf
pdf/tmp.html
*.pyc
google-credentials.json
2 changes: 2 additions & 0 deletions Dockerfile
@@ -1,5 +1,7 @@
FROM python:3

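# The service-account key for the Analytics Reporting API is baked into the
# image; KEY_FILE_LOCATION is expected to point at this path at runtime.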
COPY google-credentials.json /secrets/google-credentials.json

RUN pip install --no-cache-dir virtualenv;

RUN export DEBIAN_FRONTEND=noninteractive; \
3 changes: 2 additions & 1 deletion requirements.txt
@@ -7,4 +7,5 @@ beautifulsoup4==4.5.1
xmltodict==0.10.2
geocoder==1.20.0
ruamel.yaml==0.13.11
algoliasearch==1.12.0
algoliasearch==1.12.0
google-api-python-client==1.6.2
85 changes: 78 additions & 7 deletions src/search.py
@@ -1,12 +1,75 @@
import os
from os import path
from typing import Dict

from algoliasearch import algoliasearch
from algoliasearch.index import Index
from bs4 import BeautifulSoup, Tag
from flask import current_app as app
from googleapiclient.discovery import build, Resource
from oauth2client.service_account import ServiceAccountCredentials

from src.api import get_api_page

root_folder = path.dirname(path.dirname(__file__))


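# Builds an authenticated Analytics Reporting API v4 client from the
# service-account key referenced by KEY_FILE_LOCATION (read-only scope);
# the Dockerfile above presumably puts the key at /secrets/google-credentials.json.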
def initialize_analyticsreporting() -> Resource:
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        os.environ['KEY_FILE_LOCATION'], scopes='https://www.googleapis.com/auth/analytics.readonly')
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics


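# Requests unique pageviews for the last 30 days of kotlinlang.org traffic,
# excluding page paths that contain a query string ("!@?" = "does not contain ?"),
# sorted by views in descending order, up to 10000 rows.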
def get_report(analytics: Resource) -> Dict:
    return analytics.reports().batchGet(
        body={
            "reportRequests":
                [
                    {
                        "viewId": "85132606",
                        "samplingLevel": "LARGE",
                        "filtersExpression": "ga:hostname==kotlinlang.org;ga:pagePath!@?",
                        "pageSize": 10000,
                        "orderBys": [
                            {
                                "fieldName": "ga:uniquePageviews",
                                "sortOrder": "DESCENDING"
                            }
                        ],
                        "dateRanges":
                            [
                                {
                                    "startDate": "30daysAgo",
                                    "endDate": "yesterday"
                                }
                            ],
                        "metrics":
                            [
                                {
                                    "expression": "ga:uniquePageviews",
                                    "alias": ""
                                }
                            ],
                        "dimensions":
                            [
                                {
                                    "name": "ga:pagePath"
                                }
                            ]
                    }
                ]
        }).execute()


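# Flattens the batchGet response into a {page path: unique pageviews} mapping
# keyed by the ga:pagePath dimension.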
def get_page_views_statistic() -> Dict[str, int]:
    page_views = {}
    analytics = initialize_analyticsreporting()
    report = get_report(analytics)
    for row in report["reports"][0]["data"]["rows"]:
        page_views[row["dimensions"][0]] = int(row['metrics'][0]["values"][0])
    return page_views


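# Algolia client, configured from the SEARCH_USER / SEARCH_KEY environment variables.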
def get_client():
    return algoliasearch.Client(os.environ['SEARCH_USER'], os.environ['SEARCH_KEY'])
@@ -44,8 +107,6 @@ def group_small_content_pats(content_parts, start_index=0):

def get_valuable_content(page_content):
    content = []
    if isinstance(page_content, basestring):
        page_content = BeautifulSoup(page_content, "html.parser")
    for child in page_content.children:
        if not isinstance(child, Tag):
            continue
@@ -61,7 +122,7 @@
    return content


def get_page_index_objects(content, url, page_path, title, type, description=None):
def get_page_index_objects(content, url, page_path, title, type, page_views: int, description=None):
    index_objects = []
    for ind, page_part in enumerate(get_valuable_content(content)):
        page_info = {
@@ -70,7 +131,8 @@ def get_page_index_objects(content, url, page_path, title, type, description=None):
            'content': page_part,
            'title': title,
            'type': type,
            'description': description
            'description': description,
            'pageViews': page_views
        }
        if description is not None:
            page_info['description'] = description
@@ -81,10 +143,15 @@ def get_page_index_objects(content, url, page_path, title, type, description=None):


def build_search_indices(site_structure, pages):
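    # Fetch the pageview statistics once per index build; pages that never
    # appeared in the Analytics report fall back to zero views.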
    page_views_statistic = get_page_views_statistic()
    index_objects = []
    for url, endpoint in site_structure:
        if (not url.endswith('.html')) and (not url.endswith('/')):
            continue
        if url in page_views_statistic:
            page_views = page_views_statistic[url]
        else:
            page_views = 0
        page_path = get_page_path_from_url(url)
        if endpoint == 'page':
            page = pages.get(page_path)
@@ -104,6 +171,7 @@ def build_search_indices(site_structure, pages):
                page_path,
                page.meta['title'],
                type,
                page_views,
                description
            )
        elif endpoint == "api_page":
@@ -120,7 +188,8 @@ def build_search_indices(site_structure, pages):
                url,
                page_path,
                page_info['title'],
                "Standard Library"
                "Standard Library",
                page_views
            )
        elif endpoint in ["coroutines_alias", "events_redirect", "community_redirect"]:
            continue
@@ -142,15 +211,17 @@ def build_search_indices(site_structure, pages):
                'type': 'Page',
                'title': title,
                'url': url,
                'content': ''
                'content': '',
                'pageViews': page_views
            })
        else:
            index_objects += get_page_index_objects(
                content,
                url,
                page_path,
                title,
                "Page"
                "Page",
                page_views
            )
    index = get_index()
    index.add_objects(index_objects)
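
A minimal sketch of how the new reporting path could be exercised on its own, assuming KEY_FILE_LOCATION points at a valid service-account key with read access to view 85132606; the smoke_test wrapper below is illustrative and not part of this commit:

import os

from src.search import get_page_views_statistic


def smoke_test():
    # Illustrative default: the path the Dockerfile copies the key to.
    os.environ.setdefault('KEY_FILE_LOCATION', '/secrets/google-credentials.json')
    page_views = get_page_views_statistic()
    # Show the ten most-viewed paths; sort locally rather than relying on
    # the dict preserving the report's descending order.
    for path, views in sorted(page_views.items(), key=lambda kv: kv[1], reverse=True)[:10]:
        print(views, path)


if __name__ == '__main__':
    smoke_test()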
