In [None]:
from api import *

## Getting Started

1. Enter keys in config-demo.py and rename the file to config.py
2. Not every key is needed for every API service; only fill in the keys for the services you use.
3. You can set PROXY_ENPOINT to '' to bypass using a proxy.  Otherwise, it can point to a rendering proxy such as Rendertron (e.g. 'https://my-project.appspot.com/render/<URL\>' )


## GSC Service

In [10]:
# Search Console property to query and how far back to look.
gsc_property = 'https://adaptpartners.com'
days_back = 30

# gscservice.get_site_data parameters
#
# Positional:
#   clienturl: (str) The domain URL property name in Google Search Console.
#   days_back: (int) How many days of history to pull.
#
# Keyword:
#   thresholdtype: (str) 'click' or 'impression'. Default: impression
#   threshold: (int) Keep pulling, daily, until fewer than this number of either clicks or impressions. Default: 1
#   poslimit: (int) Omit results above this limit. Default: None
#   country: (str) Country. Default: usa
#   output_fn: (str) Name of the output file. If not set, a unique name will be chosen.

# NOTE(review): the recorded run printed "Reloading Existing: data/demo2.csv", so an
# existing output file appears to be reused instead of re-querying -- confirm in gscservice.
df = gscservice.get_site_data(gsc_property, days_back, output_fn="demo2.csv")
df.head()

Reloading Existing: data/demo2.csv


Unnamed: 0,clicks,clientID,ctr,impressions,month,page,position,query
0,2.0,https://adaptpartners.com,0.666667,3.0,2018-10,https://adaptpartners.com/,1,adapt partners
1,2.0,https://adaptpartners.com,0.4,5.0,2018-10,https://adaptpartners.com/job/political-journa...,2,political internships
2,1.0,https://adaptpartners.com,0.052632,19.0,2018-10,https://adaptpartners.com/technical-seo/python...,4,google search console api
3,0.0,https://adaptpartners.com,0.0,11.0,2018-10,https://adaptpartners.com/,13,adapt
4,0.0,https://adaptpartners.com,0.0,1.0,2018-10,https://adaptpartners.com/,14,adapt agency


## URL Extraction

In [11]:

# Fetch a single page and show what the extractor returned for it.
url = 'https://adaptpartners.com/technical-seo/python-notebooks-connect-to-google-search-console-api-and-extract-data/'
# extract_url_data returns a pair: the extracted text and a mapping of page details
# (this notebook reads the 'title', 'h1' and 'html' keys from it).
text, infos = extract_url_data(url)
# print('\n') emits two newlines: the literal one plus print's own line ending.
print('\n')
print("Title:", infos['title'])
print("H1:", infos['h1'])
print("Extracted Text:", text)

loading url https://adaptpartners.com/technical-seo/python-notebooks-connect-to-google-search-console-api-and-extract-data/
Extracted 509 words.


Title: Python Notebooks: Connect to Google Search Console API and Extract Data
H1: ['Python Notebooks: Connect to Google Search Console API and Extract Data']
Extracted Text: On this post we want to show you an easy way that you can use Python notebooks to connect to Google’s Search Console API. After connecting to the API, you will be able to do several interesting things.

The first thing you need is to create a new Oauth Credential in Google Developers Console and select “Other” as type. Google provides detailed information on how to set this up here.

After completing these steps you’ll have a CLIENT_ID and CLIENT_SECRET that you will need to use in this notebook in order to connect to Google Search Console. Dominic Woodman’s post in Moz’s blog shows easy step by step instructions on how you can set this up on Google.

Google provides in

## Watson API

In [2]:
# Fetch the page in this cell (it does not reuse `infos` from any other cell).
url = 'https://adaptpartners.com/technical-seo/python-notebooks-connect-to-google-search-console-api-and-extract-data/'
text, infos = extract_url_data(url)
# The Watson helpers below are given the page's HTML, not the extracted text.
html = infos['html']

# Result of the entities helper, printed as plain text.
print('\n Entities:')
print(watsonservice.watson_entities(html))

# Result of the keywords helper, printed as plain text.
print('\n Keywords:')
print(watsonservice.watson_keywords(html))

loading url https://adaptpartners.com/technical-seo/python-notebooks-connect-to-google-search-console-api-and-extract-data/
Extracted 509 words.

 Entities:
   count                                     disambiguation  relevance  \
0      9  {'subtype': ['AcademicInstitution', 'AwardPres...   0.876669   
1      1                                                NaN   0.169787   

              text     type  
0           Google  Company  
1  Dominic Woodman   Person  

 Keywords:
    relevance                       text
0    0.984217      Google Search Console
1    0.715410  Google Developers Console
2    0.653734         Search Console API
3    0.573839        Search Console data
4    0.562398         interesting things
5    0.562259       new Oauth Credential
6    0.528074            shows easy step
7    0.474809           Python notebooks
8    0.447963            Dominic Woodman
9    0.443203                   easy way
10   0.434548       detailed information
11   0.426069          ste

## SEMRush

In [None]:
# Uses this library: https://github.com/storerjeremy/python-semrush
# See the README for implementation details.

# Domain to look up and the SEMrush regional database to query.
domain = 'adaptpartners.com'
database = 'us'
# Both values are passed positionally to the service wrapper; the result is previewed with .head().
ranks = semrushservice.domain_organic(domain, database)
ranks.head()

## Google Analytics

In [7]:
# Built on https://github.com/debrouwere/google-analytics -- querying guide:
# https://github.com/debrouwere/google-analytics/wiki/Querying

# Short alias for the Google Analytics service wrapper.
ga = gaservice

# Account, web property and profile (view) all carry the same display name here.
ga_account = ga_webproperty = ga_profile = "Adapt Partners"

profile = ga.get_profile(
    account=ga_account,
    webproperty=ga_webproperty,
    profile=ga_profile,
)

# Daily query built with days=-5, then rendered with the two requested metrics as a DataFrame.
query = profile.core.query.daily(days=-5)
query.metrics('sessions', 'pageviews').as_dataframe()


Unnamed: 0,date,sessions,pageviews
0,2018-11-07,59,88
1,2018-11-08,55,66
2,2018-11-09,38,46
3,2018-11-10,23,34
4,2018-11-11,22,27


## Dataset

In [3]:
# Create dataset from Google Search Console data for usage in Machine Learning projects.

import dataset

gsc_property = 'https://adaptpartners.com'
days_back = 30

# Same call as in the GSC section; with output_fn="demo2.csv" the recorded run
# reported reloading the existing data/demo2.csv.
df = gscservice.get_site_data(gsc_property, days_back, output_fn="demo2.csv")

# Inputs: ranking position and impressions. Target: clicks.
# Double brackets keep both selections as DataFrames rather than Series.
features = df[['position','impressions']]
labels = df[['clicks']]

# NOTE(review): drop_last=True presumably discards the final partial batch
# (the recorded run shows 6115 samples -> 191 batches of 32) -- confirm in dataset.load_pandas.
data_loader = dataset.load_pandas(features, labels, batch_size=32, shuffle=True, drop_last=True)

Reloading Existing: data/demo2.csv
# training samples: 6115
# batches: 191


## BERT

In [None]:
# Uses BERT embeddings of the Search Console queries as additional features
# for machine learning projects.

import dataset

gsc_property = 'https://adaptpartners.com'
days_back = 30

# Load the Search Console data under the same output file name used in the earlier sections.
df = gscservice.get_site_data(gsc_property, days_back, output_fn="demo2.csv")

# Baseline numeric inputs and the target column; double brackets keep both as DataFrames.
features = df[['position','impressions']]
labels = df[['clicks']]

def apply_embed(row):
    """Spread ``row['embedding']`` into one ``e_<i>`` entry per element.

    Each element of the embedding sequence is stored on ``row`` under the key
    ``'e_0'``, ``'e_1'``, ... in order. The original ``'embedding'`` entry is
    left in place.

    Args:
        row: A mapping-like object that supports item access and assignment
            and holds an iterable under ``'embedding'``.

    Returns:
        The same ``row`` object, updated in place.
    """
    for position, component in enumerate(row['embedding']):
        row['e_{}'.format(position)] = component
    return row

# Imported explicitly: `pd` is otherwise only defined if `from api import *` happens to export it.
import pandas as pd

# Run the BERT helper over the "query" column of the Search Console frame.
# It returns a loader and a DataFrame; the code below reads 'embedding',
# 'linex_index' and 'tokens' columns from that frame.
data_loader_bert, df_bert = dataset.load_bert_df(input_df=df, input_row="query")

# Expand each embedding into e_0..e_n columns, then drop the raw BERT columns.
# NOTE(review): 'linex_index' is presumably the column name emitted by the BERT
# feature extractor as-is -- verify before "correcting" the spelling.
df_bert_embed = df_bert.apply(apply_embed,axis=1).drop(columns=['embedding','linex_index','tokens'])

# Both indexes are reset so the column-wise concat aligns rows by position.
# NOTE(review): this assumes df_bert has one row per row of df, in the same order -- confirm.
features = pd.concat([features.reset_index(drop =True), df_bert_embed.reset_index(drop =True)], axis=1)

data_loader = dataset.load_pandas(features, labels, batch_size=32, shuffle=True, drop_last=True)

Reloading Existing: data/demo2.csv


11/29/2018 08:25:28 - INFO - dataset.bert_ds -   device: cpu n_gpu: 1 distributed training: False
11/29/2018 08:25:29 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\jroak\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
11/29/2018 08:25:29 - INFO - dataset.bert_ds -   *** Example ***
11/29/2018 08:25:29 - INFO - dataset.bert_ds -   unique_id: 0
11/29/2018 08:25:29 - INFO - dataset.bert_ds -   tokens: [CLS] adapt partners [SEP]
11/29/2018 08:25:29 - INFO - dataset.bert_ds -   input_ids: 101 15581 5826 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11/29/