In [None]:
# Run this cell if your environment does not have the googleapiclient module.
%pip install google-api-python-client


In [1]:
'''This notebook creates a column type dictionary for the Tascomi ingestion process.'''

import json
import boto3
from googleapiclient.discovery import build
from google.oauth2 import service_account


In [2]:
def get_secret(secret_name: str, region_name: str) -> str:
    """Fetch a secret from AWS Secrets Manager and return it as text.

    Args:
        secret_name: Identifier of the secret, passed to the API as ``SecretId``.
        region_name: AWS region used to create the Secrets Manager client.

    Returns:
        The ``SecretString`` field when the response contains one; otherwise
        the ``SecretBinary`` payload decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If a binary payload is not valid UTF-8.
        Any exception raised by the boto3 client call propagates unchanged.
    """
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in response:
        return response['SecretString']

    # UTF-8 is a superset of ASCII: every payload the previous ASCII decode
    # accepted still decodes to the same text, while payloads containing
    # non-ASCII characters no longer raise UnicodeDecodeError.
    return response['SecretBinary'].decode('utf-8')
    

In [3]:
# ID of the Google Sheets workbook to read; passed to the Sheets API as `spreadsheetId`.
google_sheet_id = '1ZZwWHSoudBgN9j0jV6ZrNZKgXYMOm7ObWTWLT3Xg8Rw'
# Scope applied to the service-account credentials: read-only access to spreadsheets.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
# ARN of the AWS Secrets Manager secret whose value is parsed as the Google
# service-account JSON. NOTE(review): despite the name, this is the secret's
# identifier (passed to get_secret as SecretId), not key material.
secret_key = 'arn:aws:secretsmanager:eu-west-2:120038763019:secret:sheets-credential-sandbox-20220211181906651800000002-b9jt5Y'
# AWS region used to create the Secrets Manager client.
secret_region = "eu-west-2"


In [4]:
# Retrieve the secret text from AWS Secrets Manager.
# NOTE(review): presumably relies on AWS credentials already configured in
# the environment running this notebook — confirm.
awsSecret = get_secret(secret_key, secret_region)
# The secret text is JSON; parse it into the dict used as service-account info.
googleSheetsJsonCredentials = json.loads(awsSecret)
googleSheetsCredentials = service_account.Credentials.from_service_account_info(googleSheetsJsonCredentials)
# Derive credentials that carry the scopes listed in SCOPES.
googleSheetsScopedCredentials = googleSheetsCredentials.with_scopes(SCOPES)
# Build the Google Sheets v4 API client; cache_discovery=False turns off
# googleapiclient's caching of the discovery document.
service = build(
    'sheets',
    'v4',
    credentials=googleSheetsScopedCredentials,
    cache_discovery=False)

In [5]:
# Fetch the workbook metadata to discover every sheet (tab) it contains.
gsheet_workbook = service.spreadsheets().get(spreadsheetId=google_sheet_id).execute()
# Default to an empty list so a response without 'sheets' yields no rows
# instead of failing when the loop tries to iterate over None.
sheets = gsheet_workbook.get('sheets', [])

# Accumulates every data row from every sheet as: <sheet cells...> + [sheet title].
rows = []
for sheet in sheets:
    title = sheet['properties']['title']
    data = service.spreadsheets().values().get(
        spreadsheetId=google_sheet_id,
        range=title,
        majorDimension='ROWS'
    ).execute()

    # The Sheets API leaves the 'values' field out of the response when the
    # requested range holds no data, so an empty tab must not raise KeyError.
    sheet_values = data.get('values', [])

    # The first row of each sheet is skipped (presumably a header row — confirm).
    # Each remaining row is copied with the sheet title appended, so the API
    # response itself is left unmodified.
    rows.extend(row + [title] for row in sheet_values[1:])

# Type names as written in the sheet -> type names used in the output dictionary.
# Types not listed here are kept as they are.
type_map = {
    'real': 'float',
    'double precision': 'double',
    'bigint': 'long',
    'timestamp without time zone': 'timestamp',
}

# Only columns whose (translated) type is in this list end up in the dictionary.
types_to_include = ['integer', 'timestamp', 'boolean', 'float', 'double', 'long']

# Each usable row has the shape [column, type, ..., sheet title]: the title was
# appended last, so it is always the final element. Rows with fewer than three
# elements have no column name or no type (blank sheet rows, or trailing empty
# cells, which the Sheets API drops); they are skipped rather than raising
# IndexError or having the sheet title misread as the type.
# New lists are built, so the entries of `rows` are not modified.
rows_types_replaced = [
    [row[0], type_map.get(row[1], row[1]), *row[2:]]
    for row in rows
    if len(row) >= 3
]

rows_for_col_dict = [
    row for row in rows_types_replaced
    if row[1] in types_to_include
]

# Group column names as {type: {sheet title: [column, ...]}}.
result = {}
for row in rows_for_col_dict:
    # Index explicitly instead of unpacking three values so that rows with
    # extra cells between the type and the title do not raise ValueError.
    column, column_type, table = row[0], row[1], row[-1]
    result.setdefault(column_type, {}).setdefault(table, []).append(column)

result_json = json.dumps(result, indent=2)


In [6]:
# Print the JSON dictionary wrapped in square brackets, one piece per line.
print('[', ' ' + result_json, ']', sep='\n')


[
 {
  "integer": {
    "user_team_map": [
      "creation_user_id",
      "id",
      "last_updated_by",
      "user_id",
      "user_team_id"
    ],
    "user_teams": [
      "creation_user_id",
      "id",
      "last_updated_by"
    ],
    "committee_application_map": [
      "application_id",
      "committee_id",
      "creation_user_id",
      "id",
      "last_updated_by",
      "load_order"
    ],
    "applications": [
      "admin_officer_id",
      "agent_id",
      "agricultural_holding_involved_owner_details",
      "agricultural_land_use_months",
      "agricultural_land_use_years",
      "applicant_id",
      "application_source_id",
      "application_stage",
      "application_type_id",
      "approved_units",
      "breeam_level_id",
      "breeam_status_id",
      "car_difference",
      "car_total_existing",
      "car_total_proposed",
      "certificate_b_owner_details",
      "certificate_c_owner_details",
      "certificate_of_compliance_b_owner_details",
      "