In [1]:
import pandas as pd
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials

In [2]:
import csv
import pprint
import warnings

In [3]:
# Relative path to the service-account JSON key file that is loaded when the
# credentials are built.
KEY_FILE_LOCATION = 'client_secrets_v4.json'

In [4]:
# OAuth scopes requested for the credentials: read-only Google Analytics access.
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']

In [5]:
# Build service-account credentials from the JSON key file, limited to SCOPES.
# NOTE(review): oauth2client is deprecated upstream in favour of google-auth —
# consider migrating when this notebook is next revised.
credentials = ServiceAccountCredentials.from_json_keyfile_name(
      KEY_FILE_LOCATION, SCOPES)

In [6]:
# Client for the Analytics Reporting API v4, authorized with the credentials above.
analytics = build('analyticsreporting', 'v4', credentials=credentials)

In [7]:
# Google Analytics view IDs known to this notebook. Only view_ids[1] is queried
# by the batchGet request below; which site each ID maps to is not recorded
# here — TODO confirm and document.
view_ids = ['124490020','83386397']

In [9]:
# Reporting window (inclusive), in the YYYY-MM-DD form the API expects.
start_date = '2019-01-01'
end_date = '2019-12-31'


def _report_request(view_id, start, end, metrics, dimensions):
    """Build one ReportRequest dict for the batchGet body.

    All requests in this notebook share the same view, date range, sampling
    level and page size; only the metric expressions and dimension names vary.

    Args:
        view_id: Google Analytics view ID to query.
        start: first day of the date range, 'YYYY-MM-DD'.
        end: last day of the date range, 'YYYY-MM-DD'.
        metrics: list of metric expressions, e.g. ['ga:sessions'].
        dimensions: list of dimension names, e.g. ['ga:date'].

    Returns:
        dict in the shape of a Reporting API v4 ReportRequest.
    """
    return {
        'viewId': view_id,
        'dateRanges': [{'startDate': start, 'endDate': end}],
        'samplingLevel': 'LARGE',
        'metrics': [{'expression': expression} for expression in metrics],
        'dimensions': [{'name': name} for name in dimensions],
        # Largest page the API serves in one response; anything beyond this
        # needs follow-up requests using the returned nextPageToken.
        'pageSize': 100000,
    }


# (metrics, dimensions) for each report. The order matters: downstream cells
# pick the parsed reports out by position. batchGet takes at most 5 requests.
report_specs = [
    # 0: users / sessions by hour and user type
    (['ga:users', 'ga:sessions', 'ga:sessionDuration', 'ga:hits'],
     ['ga:date', 'ga:hour', 'ga:userType']),
    # 1: traffic sources
    (['ga:sessions'],
     ['ga:date', 'ga:source', 'ga:referralPath', 'ga:keyword']),
    # 2: devices and platforms
    (['ga:sessions'],
     ['ga:date', 'ga:deviceCategory', 'ga:browser', 'ga:browserVersion',
      'ga:operatingSystem']),
    # 3: pages
    (['ga:entrances', 'ga:exits', 'ga:uniquePageviews', 'ga:avgTimeOnPage',
      'ga:pageviews', 'ga:users'],
     ['ga:date', 'ga:hostname', 'ga:pagePathLevel1', 'ga:pagePathLevel2',
      'ga:pagePathLevel3']),
    # 4: events
    (['ga:totalEvents', 'ga:users'],
     ['ga:date', 'ga:eventCategory', 'ga:eventAction', 'ga:eventLabel']),
]

response = analytics.reports().batchGet(
    body={
        'reportRequests': [
            _report_request(view_ids[1], start_date, end_date, metrics, dimensions)
            for metrics, dimensions in report_specs
        ]
    }).execute()

In [10]:
def getDimensions(report):
    """Extract the dimension values of one report into a DataFrame.

    Args:
        report: one entry of the batchGet response's 'reports' list (a dict).

    Returns:
        pandas.DataFrame with one column per name in
        report['columnHeader']['dimensions'] and one row per entry of
        report['data']['rows']. Missing header or row sections yield an
        empty frame.

    Side effects:
        Prints the report's 'rowCount' (or [] when it is absent).
    """
    header_names = report.get('columnHeader', {}).get('dimensions', [])
    payload = report.get('data', {})
    print(payload.get('rowCount', []))

    records = []
    for row in payload.get('rows', []):
        values = row['dimensions']
        # Values are positional: the i-th value belongs to the i-th header.
        records.append({name: values[pos] for pos, name in enumerate(header_names)})

    return pd.DataFrame(records, columns=header_names)

In [11]:
def getMetrics(report):
    """Extract the metric values of one report into a DataFrame.

    Only the first entry of each row's 'metrics' list is read.

    Args:
        report: one entry of the batchGet response's 'reports' list (a dict).

    Returns:
        pandas.DataFrame with one column per metric header entry name and one
        row per entry of report['data']['rows']. Values are kept exactly as
        they appear in the response. Missing header or row sections yield an
        empty frame.
    """
    header_entries = (
        report.get('columnHeader', {})
        .get('metricHeader', {})
        .get('metricHeaderEntries', [])
    )
    metric_names = [entry['name'] for entry in header_entries]

    records = []
    for row in report.get('data', {}).get('rows', []):
        values = row['metrics'][0]['values']
        # Values are positional: the i-th value belongs to the i-th metric.
        records.append({name: values[pos] for pos, name in enumerate(metric_names)})

    return pd.DataFrame(records, columns=metric_names)

In [12]:
# Parse every report of the batchGet response into one DataFrame (dimension
# columns followed by metric columns), keeping the response order.
reports = []
for i, report in enumerate(response.get('reports', [])):
    report_data = report.get('data', {})

    # Only one page per report is fetched. A nextPageToken means the API has
    # more rows than were returned, so the frame would be silently incomplete.
    if report.get('nextPageToken'):
        warnings.warn(
            'Report {} is truncated: rowCount={} but only the first page was '
            'fetched (nextPageToken={}).'.format(
                i, report_data.get('rowCount'), report['nextPageToken']))

    # samplesReadCounts is only populated when the API answered from a sample.
    if report_data.get('samplesReadCounts'):
        warnings.warn(
            'Report {} is based on sampled data (samplesReadCounts={}).'.format(
                i, report_data['samplesReadCounts']))

    dims = getDimensions(report)
    mets = getMetrics(report)
    # Both frames are built from the same rows in the same order, so joining
    # on the positional index lines each metric row up with its dimensions.
    report_df = pd.merge(dims, mets, how="left", left_index=True, right_index=True)
    reports.append(report_df)

7490
34319
21455
82590
192


In [13]:
# Give each parsed report a descriptive name. The indexes are positional and
# must stay in step with the order of the reportRequests in the batchGet body.
users_sessions_report = reports[0]
traffic_report = reports[1]
devices_report = reports[2]
pages_report = reports[3]
events_report = reports[4]

In [14]:
# Sanity check: expect 7 columns (3 dimensions + 4 metrics).
users_sessions_report.shape

(7490, 7)

In [15]:
# Sanity check: expect 5 columns (4 dimensions + 1 metric).
traffic_report.shape

(34319, 5)

In [16]:
# Sanity check: expect 6 columns (5 dimensions + 1 metric).
devices_report.shape

(21455, 6)

In [17]:
# Sanity check: expect 11 columns (5 dimensions + 6 metrics).
pages_report.shape

(82590, 11)

In [18]:
# Preview the raw ga:* column names and their order before renaming.
users_sessions_report.head()

Unnamed: 0,ga:date,ga:hour,ga:userType,ga:users,ga:sessions,ga:sessionDuration,ga:hits
0,20190101,0,New Visitor,749,749,26554.0,1066
1,20190101,0,Returning Visitor,144,144,5979.0,158
2,20190101,1,New Visitor,764,735,19595.0,1066
3,20190101,1,Returning Visitor,115,115,8328.0,231
4,20190101,2,New Visitor,605,605,3098.0,634


In [19]:
# Friendly column names, assigned by position. Source order: ga:date, ga:hour,
# ga:userType, ga:users, ga:sessions, ga:sessionDuration, ga:hits.
users_sessions_report.columns = [
    'date',
    'hour',
    'user_type',
    'users',
    'sessions',
    'session_duration',
    'hits',
]

In [20]:
# Preview the raw ga:* column names and their order before renaming.
traffic_report.head()

Unnamed: 0,ga:date,ga:source,ga:referralPath,ga:keyword,ga:sessions
0,20190101,(direct),(not set),(not set),21871
1,20190101,10news.com,/,(not set),29
2,20190101,10news.com,/news/city-launches-app-to-help-customers-moni...,(not set),14
3,20190101,agency.governmentjobs.com,//sandiego/default.cfm,(not set),43
4,20190101,agency.governmentjobs.com,/sandiego/default.cfm,(not set),115


In [21]:
# Friendly column names, assigned by position. Source order: ga:date,
# ga:source, ga:referralPath, ga:keyword, ga:sessions.
traffic_report.columns = [
    'date',
    'referral_source',
    'referral_path',
    'keyword',
    'sessions',
]

In [22]:
# Preview the raw ga:* column names and their order before renaming.
devices_report.head()

Unnamed: 0,ga:date,ga:deviceCategory,ga:browser,ga:browserVersion,ga:operatingSystem,ga:sessions
0,20190101,desktop,Chrome,49.0.2623.112,Windows,29
1,20190101,desktop,Chrome,54.0.2840.71,Windows,14
2,20190101,desktop,Chrome,57.0.2987.133,Windows,14
3,20190101,desktop,Chrome,58.0.2988.0,Windows,14
4,20190101,desktop,Chrome,63.0.3239.132,Windows,14


In [23]:
# Friendly column names, assigned by position. Source order: ga:date,
# ga:deviceCategory, ga:browser, ga:browserVersion, ga:operatingSystem,
# ga:sessions.
devices_report.columns = [
    'date',
    'device_cat',
    'browser',
    'browser_version',
    'op_system',
    'sessions',
]

In [24]:
# Preview the raw ga:* column names and their order before renaming.
pages_report.head()

Unnamed: 0,ga:date,ga:hostname,ga:pagePathLevel1,ga:pagePathLevel2,ga:pagePathLevel3,ga:entrances,ga:exits,ga:uniquePageviews,ga:avgTimeOnPage,ga:pageviews,ga:users
0,20190101,apps.sandiego.gov,/citizenacct/,/reset/,/doReset.do,0,0,14,3.589905362776025,317,14
1,20190101,apps.sandiego.gov,/citizenacct/,/reset/,/setupHint.do,0,0,14,14.483290488431876,389,14
2,20190101,apps.sandiego.gov,/citizenacct/,/reset/,/toMain.do,0,0,14,2.121212121212121,231,14
3,20190101,apps.sandiego.gov,/citizenacct/,/signup/,/addUser.do,0,14,14,17.208333333333332,86,14
4,20190101,www.sandiego.gov,/airports/,/brown/,/brownfaq,0,14,14,0.0,14,14


In [25]:
# Friendly column names, assigned by position. Source order: ga:date,
# ga:hostname, ga:pagePathLevel1..3, ga:entrances, ga:exits,
# ga:uniquePageviews, ga:avgTimeOnPage, ga:pageviews, ga:users.
pages_report.columns = [
    'date',
    'hostname',
    'page_path_1',
    'page_path_2',
    'page_path_3',
    'entrances',
    'exits',
    'unique_pageviews',
    'avg_time_on_page',
    'pageviews',
    'users',
]

In [26]:
# Preview the raw ga:* column names and their order before renaming.
events_report.head()

Unnamed: 0,ga:date,ga:eventCategory,ga:eventAction,ga:eventLabel,ga:totalEvents,ga:users
0,20190102,ShareThis,facebook,https://www.sandiego.gov/blog/learn-bicycle-re...,1,1
1,20190102,ShareThis,facebook,https://www.sandiego.gov/blog/trash-pick-up-sc...,1,1
2,20190102,ShareThis,twitter,https://www.sandiego.gov/stadium/eventscalendar,1,1
3,20190104,ShareThis,linkedin,https://www.sandiego.gov/attractions,1,1
4,20190107,ShareThis,facebook,https://www.sandiego.gov/blog/find-art-share-art,1,1


In [27]:
# Friendly column names, assigned by position. Source order: ga:date,
# ga:eventCategory, ga:eventAction, ga:eventLabel, ga:totalEvents, ga:users.
events_report.columns = [
    'date',
    'event_category',
    'event_action',
    'event_label',
    'total_events',
    'users',
]

In [28]:
# The date column arrives as compact YYYYMMDD strings (e.g. '20190101').
# Pin the format explicitly rather than relying on pandas' format inference,
# so a malformed value raises instead of being parsed some other way.
for frame in (users_sessions_report, traffic_report, devices_report,
              pages_report, events_report):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y%m%d')

In [29]:
# Export users/sessions without the index; dates are written as YYYY-MM-DD.
users_sessions_report.to_csv(
    '~/Code/docker-airflow/data/prod/main_users_sessions_2019_datasd.csv',
    index=False,
    date_format="%Y-%m-%d",
)

In [30]:
# Export traffic sources without the index; dates are written as YYYY-MM-DD.
traffic_report.to_csv(
    '~/Code/docker-airflow/data/prod/main_traffic_sources_2019_datasd.csv',
    index=False,
    date_format="%Y-%m-%d",
)

In [31]:
# Export devices/platforms without the index; dates are written as YYYY-MM-DD.
devices_report.to_csv(
    '~/Code/docker-airflow/data/prod/main_devices_platforms_2019_datasd.csv',
    index=False,
    date_format="%Y-%m-%d",
)

In [32]:
# Export pages without the index; dates are written as YYYY-MM-DD.
pages_report.to_csv(
    '~/Code/docker-airflow/data/prod/main_pages_2019_datasd.csv',
    index=False,
    date_format="%Y-%m-%d",
)

In [33]:
# Export events without the index; dates are written as YYYY-MM-DD.
events_report.to_csv(
    '~/Code/docker-airflow/data/prod/main_events_2019_datasd.csv',
    index=False,
    date_format="%Y-%m-%d",
)