In [56]:
import pandas as pd
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials

In [57]:
import pprint
import csv

In [58]:
KEY_FILE_LOCATION = 'client_secrets_v4.json'

In [59]:
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']

In [60]:
credentials = ServiceAccountCredentials.from_json_keyfile_name(
      KEY_FILE_LOCATION, SCOPES)

In [61]:
analytics = build('analyticsreporting', 'v4', credentials=credentials)

In [62]:
vids = ['124490020']
r_names = ['users_sessions','traffic','devices','sessions']
r_params = {'users_sessions':{'metrics':['ga:users','ga:sessions','ga:sessionDuration','ga:hits'],
                              'dims':['ga:dateHour','ga:userType']},
            'traffic':{'metrics':[],
                       'dims':['ga:date','ga:source','ga:referralPath','ga:keyword']},
            'devices':{'metrics':[],
                       'dims':['ga:date','ga:deviceCategory','ga:browser','ga:browserVersion','ga:operatingSystem']},
            'sessions':{'metrics':['ga:entrances','ga:exits','ga:uniquePageviews','ga:avgTimeOnPage','ga:pageviews'],
                        'dims':['ga:date','ga:pagePath','ga:hostname']}}

In [63]:
def runReports(startDate,endDate):
    for vid in vids:
        for name in r_names:
            p = r_params.get(name)
            metric_get = []
            dim_get = []
            for metric in p.get('metrics'):
                metric_get.append({'expression': metric})
            for dim in p.get('dims'):
                dim_get.append({'name':dim})
            body = {
                'reportRequests': [
                    {'viewId': vid,
                    'dateRanges':[{'startDate': startDate, 'endDate': endDate}],
                    'samplingLevel': 'LARGE',
                    'metrics': metric_get,
                    'dimensions': dim_get,
                    'page_size': 
                    }]}
            response = analytics.reports().batchGet(body=body).execute()
            total_data = []
            for i, report in enumerate(response.get('reports', [])):
                rows = report.get('data', {}).get('rowCount', [])
                if (rows > 1000):
                    total_data.append(report.get('data', {}).get('rows', []))
                    token = report.get('nextPageToken', {})
                    body['reportRequests'][0]['pageToken'] = token
                    pprint.pprint(body)

SyntaxError: invalid syntax (<ipython-input-63-b5a6ca944bb6>, line 19)

In [None]:
#runReports('2018-01-01','2018-12-31')

In [64]:
start_date = '2016-01-01'
end_date = '2016-12-31'
response = analytics.reports().batchGet(
    body={
        'reportRequests': [
            {'viewId': '124490020',
            'dateRanges': [{'startDate': start_date,'endDate': end_date}],
            'samplingLevel': 'LARGE',
            'metrics': [{'expression': 'ga:users'},
                        {'expression': 'ga:sessions'},
                        {'expression': 'ga:sessionDuration'},
                        {'expression':'ga:hits'},
                       ],
            'dimensions': [{'name':'ga:date'},
                           {'name':'ga:hour'},
                           {'name':'ga:userType'}],
             'pageSize':100000
            },
            {'viewId': '124490020',
            'dateRanges': [{'startDate': start_date,'endDate': end_date}],
            'samplingLevel': 'LARGE',
            'metrics': [{'expression': 'ga:sessions'}],
            'dimensions': [{'name':'ga:date'},
                           {'name':'ga:source'},
                           {'name':'ga:referralPath'},
                           {'name':'ga:keyword'}],
             'pageSize':100000
            },
            {'viewId': '124490020',
            'dateRanges': [{'startDate': start_date,'endDate': end_date}],
            'samplingLevel': 'LARGE',
            'metrics': [{'expression': 'ga:sessions'}],
            'dimensions': [{'name':'ga:date'},
                           {'name':'ga:deviceCategory'},
                           {'name':'ga:browser'},
                           {'name':'ga:browserVersion'},
                           {'name':'ga:operatingSystem'}],
             'pageSize':100000
            },
            {'viewId': '124490020',
            'dateRanges': [{'startDate': start_date,'endDate': end_date}],
            'samplingLevel': 'LARGE',
            'metrics': [{'expression':'ga:entrances'},
                        {'expression':'ga:exits'},
                        {'expression':'ga:uniquePageviews'},
                        {'expression':'ga:avgTimeOnPage'},
                        {'expression':'ga:pageviews'},
                       ],
            'dimensions': [{'name':'ga:date'},
                           {'name':'ga:hostname'},
                           {'name':'ga:pagePathLevel1'},
                           {'name':'ga:pagePathLevel2'},
                           {'name':'ga:pagePathLevel3'}
                           ],
             'pageSize':100000
            }
        ]}).execute()

In [65]:
def getDimensions(report):
    columnHeader = report.get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    dim_no = len(dimensionHeaders)
    data = report.get('data', {}).get('rows', [])
    print(report.get('data', {}).get('rowCount', []))
    df_rows = []
    for d in data:
        dimensions = d['dimensions']
        dict_row = {}
        for i in range(dim_no):
            dict_row[dimensionHeaders[i]] = dimensions[i]
        df_rows.append(dict_row)
        
    df = pd.DataFrame(df_rows,columns=dimensionHeaders)
    return df

In [66]:
def getMetrics(report):
    columnHeader = report.get('columnHeader', {})
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    metricHeadersList = []
    for metric in metricHeaders:
        metricHeadersList.append(metric['name'])
    met_no = len(metricHeadersList)
    data = report.get('data', {}).get('rows', [])
    df_rows = []
    for d in data:
        metrics = d['metrics'][0]['values']
        dict_row = {}
        for i in range(met_no):
            dict_row[metricHeadersList[i]] = metrics[i]
        df_rows.append(dict_row)
        
    df = pd.DataFrame(df_rows,columns=metricHeadersList)
    return df

In [67]:
reports = []
for i, report in enumerate(response.get('reports', [])):
    dims = getDimensions(report)
    mets = getMetrics(report)
    df = pd.merge(dims,mets,how="left",left_index=True,right_index=True)
    final_df = df
    reports.append(final_df)

[]
[]
[]
[]


In [68]:
users_sessions_report = reports[0]
traffic_report = reports[1]
devices_report = reports[2]
pages_report = reports[3]

In [69]:
users_sessions_report.shape

(0, 7)

In [70]:
traffic_report.shape

(0, 5)

In [71]:
devices_report.shape

(0, 6)

In [72]:
pages_report.shape

(0, 10)

In [73]:
users_sessions_report.head()

Unnamed: 0,ga:date,ga:hour,ga:userType,ga:users,ga:sessions,ga:sessionDuration,ga:hits


In [74]:
users_sessions_report.columns = ['date','hour','user_type','users','sessions','session_duration','hits']

In [75]:
traffic_report.head()

Unnamed: 0,ga:date,ga:source,ga:referralPath,ga:keyword,ga:sessions


In [76]:
traffic_report.columns = ['date','referral_source','referral_path','keyword','sessions']

In [77]:
devices_report.head()

Unnamed: 0,ga:date,ga:deviceCategory,ga:browser,ga:browserVersion,ga:operatingSystem,ga:sessions


In [78]:
devices_report.columns = ['date','device_cat','browser','browser_version','op_system','sessions']

In [79]:
pages_report.head()

Unnamed: 0,ga:date,ga:hostname,ga:pagePathLevel1,ga:pagePathLevel2,ga:pagePathLevel3,ga:entrances,ga:exits,ga:uniquePageviews,ga:avgTimeOnPage,ga:pageviews


In [80]:
pages_report.columns = ['date','hostname','page_path_1','page_path_2','page_path_3','entrances','exits','unique_pageviews','avg_time_on_page','pageviews']

In [81]:
users_sessions_report['date'] = pd.to_datetime(users_sessions_report['date'])
traffic_report['date'] = pd.to_datetime(traffic_report['date'])
devices_report['date'] = pd.to_datetime(devices_report['date'])
pages_report['date'] = pd.to_datetime(pages_report['date'])

In [82]:
users_sessions_report.to_csv('~/Code/docker-airflow/data/prod/portal_users_sessions_2016_datasd.csv',index=False,date_format="%Y-%m-%d")

In [83]:
traffic_report.to_csv('~/Code/docker-airflow/data/prod/portal_traffic_sources_2016_datasd.csv',index=False,date_format="%Y-%m-%d")

In [84]:
devices_report.to_csv('~/Code/docker-airflow/data/prod/portal_devices_platforms_2016_datasd.csv',index=False,date_format="%Y-%m-%d")

In [85]:
pages_report.to_csv('~/Code/docker-airflow/data/prod/portal_pages_2016_datasd.csv',index=False,date_format="%Y-%m-%d")