In [1]:
# import necessary packages
import os
import re
import string
import time
from datetime import date, datetime, timezone,timedelta
import logging
from logging.handlers import TimedRotatingFileHandler
import io

import joblib
import numpy as np
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from dotenv import load_dotenv
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
from boxsdk import Client, JWTAuth

from queryrunner_client import Client as qr_client
import config
import sendgrid

load_dotenv(".env")

sg = sendgrid.SendGridAPIClient(config.API_KEY)

In [2]:
# Set up logger
logger = logging.getLogger("main")

# set logging level : INFO, DEBUG, WARNING or ERROR
logger.setLevel(logging.DEBUG)

# Create TimedRotatingFileHandler with log file name
# It will create a new log file each day at midnight
handler = TimedRotatingFileHandler(config.LOGFILE_INCA, when="midnight", interval=1)

# This is the format in which logs will be displayed in log file
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# assign the formatter and suffix to file_handler object
# suffix will be added to each file
handler.setFormatter(formatter)
handler.suffix = "%Y%m%d"


# add the handler to logger
logger.addHandler(handler)
start_time = time.time()
logger.debug('New Job has started ....')

2023-08-22 10:49:42,799 main DEBUG New Job has started ....


In [3]:
current_date = date.today()

In [4]:
HIVE_QUERY = f"""
with target_merchants as (
    with us_states as (
      select
        geofence_uuid,
        name as state_name,
        simplified_shape as simplified_shape_wkt,
        areasqmeters,
        countrycode
      from
        map_geofences.geofences_mbi_admin
      where
        namespace = 'canonical-mbi-admin-1'
        and (
          countrycode = 'US'
          or countrycode = 'CA'
        )
    ),
    us_counties as (
      select
        geofence_uuid,
        name as county_name,
        simplified_shape as simplified_shape_wkt,
        areasqmeters,
        countrycode
      from
        map_geofences.geofences_mbi_admin
      where
        (
          namespace = 'canonical-mbi-admin-4'
          and countrycode = 'US'
        )
        or (
          namespace = 'canonical-mbi-admin-5'
          and countrycode = 'CA'
        )
    )
    ----------------------------
    ----------------------------
    select
      dm.uuid
    from
      eds.dim_merchant dm
      left join us_states st on st_contains(
        st.simplified_shape_wkt,
        st_point(dm.longitude, dm.latitude)
      )
      left join us_counties co on st_contains(
        co.simplified_shape_wkt,
        st_point(dm.longitude, dm.latitude)
      )
      where
        st.geofence_uuid is not null or
        co.geofence_uuid is not null
    group by
      1
)

, allinfo as (
select
  ext.entity_uuid,
  ent.external_id,
  ext.catalog_uuid,
  coalesce(ext.product.name, raw_ext.product.name) as product_name,
  coalesce(ext.product.description, raw_ext.product.description) as product_description,
  ext.product.merchant_category_path as published_merchant_category_path,
  raw_ext.product.merchant_category_path as raw_merchant_category_path,
  ext.tax_labels,
  dm.uuid as store_uuid,
  dm.store_name,
  dm.establishment_type as establishment_type,
  dm.uber_merchant_type as uber_merchant_type,
  FROM_ISO8601_TIMESTAMP(ext.updated_at) as updated_at
from
  inca.dim_catalog c
  cross join unnest(used_by_stores) as t(store_uuid)
  join eds.dim_merchant dm on dm.uuid = t.store_uuid
  join target_merchants tm on tm.uuid = dm.uuid
  and dm.parent_chain_uuid not in (
    '99db4bf9-629f-4b75-a34c-e6628dc66f40' --Exclude Walmart Canada eff 4/18/2023. Discussion that CS mapping more accurate than ML; immediately stop bleeding for ML incorrect tagging for 4/20 launch.
  )
  join inca.extension_main ext on
    ext.catalog_uuid = c.catalog_uuid
    and ext.source = 'published'
    
    and ext.tax_labels is null
    
    and ext.entity_type = 'PRODUCT'
  join inca.extension_main raw_ext on
    raw_ext.catalog_uuid = c.catalog_uuid
    and raw_ext.entity_uuid = ext.entity_uuid
    and raw_ext.source = 'raw'
    
    and raw_ext.tax_labels is null
    
    and raw_ext.entity_type = 'PRODUCT'
  join inca.entity_main ent on
    ext.catalog_uuid = ent.catalog_uuid
    and ent.entity_uuid = ext.entity_uuid
    and ent.source = 'published'
WHERE
    1 = 1
    and FROM_ISO8601_TIMESTAMP(ext.updated_at) > now() - interval '{config.DAYS_BACK}' day
    
)

SELECT
    store_uuid as merchant_uuid
    , store_name
    , catalog_uuid
    , entity_uuid
    , product_name
    , product_description
    , establishment_type
    , uber_merchant_type
    , published_merchant_category_path
    , raw_merchant_category_path
    , tax_labels
    , updated_at
FROM allinfo limit 900000

"""

In [5]:
def preprocess_text(message):

    # stopwords
    stpwrd = nltk.corpus.stopwords.words('english')
    # initialize lemmatizing 
    lemmatizer = WordNetLemmatizer()
    # lowering input
    message = message.lower()
    # removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message)
    # removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd])
    # lemmatizing the text
    message = " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    # removing hyperlinks
    message = re.sub(r'http\S+', ' ', message)
    return message

In [6]:
model = joblib.load(config.MODEL_PATH)
# establish a connection for BOX
boxconfig = JWTAuth.from_settings_file(config.TAX_CONFIG)
client = Client(boxconfig)





# dedup version

In [7]:
def dedup(df, sub_folder):
    # combining input features to a single column 
    df['combined_text'] = df[['product_name','product_description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
    # apply data preprocessing steps on the combined_text column
    df['processed_text'] = df['combined_text'].map(lambda s:preprocess_text(s))
    df['product_name_lower'] = df['product_name'].str.lower()
    df['product_description_lower'] = df['product_description'].str.lower()
    df['establishment_type_lower'] = df['establishment_type'].str.lower()
    # removing duplicates
    dedup_df = df.drop_duplicates(subset=['product_name_lower','product_description_lower','establishment_type_lower'],ignore_index=True)
    logger.info("Incoming dedup data size: {}".format(dedup_df.shape))
    print(dedup_df.shape)
    # getting ml predictions
    dedup_df['ml_prediction'] = model.predict(dedup_df['processed_text'])
    
    # combining cat name and integer column to get a single output column
    dedup_df[['ml_predicted_cat_name','ml_predicted_integer']] = dedup_df['ml_prediction'].str.split(':', expand=True)
    
    # getting confidence score
    dedup_df['ml_predicted_conf_score'] = np.round_(1/(1+np.exp(-np.round_(np.max(model.decision_function(dedup_df['processed_text'].values), axis=1), decimals=2))),decimals=2)
    # dropping rest of the columns
    # dedup_df.drop(['merchant_uuid','entity_uuid','source','merchant_type_analytics','external_id','time_stamp','tax_labels','tax_label_ts','processed_text','combined_text','ml_prediction'], inplace=True, axis=1)
    dedup_df.drop(['merchant_uuid','store_name','entity_uuid','uber_merchant_type','published_merchant_category_path',\
                   'raw_merchant_category_path','tax_labels','updated_at','processed_text','combined_text','ml_prediction','product_name_lower','product_description_lower','establishment_type_lower'], inplace=True, axis=1)
    
    datetime_str = datetime.now().strftime('%d%b%Y_%H%M%S')
    filename_inca_data = config.FILENAME_DEDUP+"_"+datetime_str+".csv"
    
    print(filename_inca_data)
    inca_path_data_dedup = os.path.join(config.DATA_PATH_INCA_DEDUP, filename_inca_data)
    
    # saving dedup file 
    dedup_df.to_csv(inca_path_data_dedup, index=False)
    FILENAME_DEDUP = config.FILENAME_DEDUP+"_"+datetime_str+".csv"
    items = sub_folder.get_items()
    flag = False
    logger.debug('Uploading dedup file with ml prediction data in BOX....')

    for item in items:
        # if file name exists, update the content
        if item.name == FILENAME_DEDUP:
            chunked_uploader = client.file(item.id).get_chunked_uploader(inca_path_data_dedup)
            updated_file = chunked_uploader.start()
            print(f'Dedup File "{updated_file.name}" updated to Box with file ID {updated_file.id}')
            logger.debug(f'Dedup File "{updated_file.name}" updated to Box with file ID {updated_file.id}')
            flag = True
            break

    # if file doesn not exists upload the file
    if not flag:
        uploaded_file = sub_folder.upload(inca_path_data_dedup)
        print('Dedup File "{0}" has been uploaded'.format(uploaded_file.name))
        logger.debug(f'Dedup File "{uploaded_file.name}" uploaded to Box with file ID {uploaded_file.id}')
    return None

# Original version

In [8]:
def raw(df,sub_folder):
    logger.info("Incoming raw data size: {}".format(df.shape))
    # combining input features to a single column 
    df['combined_text'] = df[['product_name','product_description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
    # apply data preprocessing steps on the combined_text column
    df['processed_text'] = df['combined_text'].map(lambda s:preprocess_text(s))
    # getting the prediction
    df['ml_prediction'] = model.predict(df['processed_text'])
    # combining cat name and integer column to get a single output column
    df[['ml_predicted_cat_name','ml_predicted_integer']] = df['ml_prediction'].str.split(':', expand=True)
    # getting confidence score
    df['ml_predicted_conf_score'] = np.round_(1/(1+np.exp(-np.round_(np.max(model.decision_function(df['processed_text'].values), axis=1), decimals=2))),decimals=2)
    # dropping irrelevent columns
    df.drop(['combined_text','processed_text','ml_prediction'], inplace=True, axis=1)
    datetime_str = datetime.now().strftime('%d%b%Y_%H%M%S')
    filename_inca_data = config.FILENAME_RAW+"_"+datetime_str+".csv"
    print(filename_inca_data)
    inca_path_data_raw = os.path.join(config.DATA_PATH_INCA_RAW, filename_inca_data) 
    # saving output in a csv
    df.to_csv(inca_path_data_raw, index=False)
    FILENAME_RAW = config.FILENAME_RAW+"_"+datetime_str+".csv"
    items = sub_folder.get_items()
    flag = False
    logger.debug('Uploading Raw file with ml prediction data in BOX....')

    for item in items:
        # if file name exists, update the content
        if item.name == FILENAME_RAW:
            chunked_uploader = client.file(item.id).get_chunked_uploader(inca_path_data_raw)
            updated_file = chunked_uploader.start()
            print(f'Raw File "{updated_file.name}" updated to Box with file ID {updated_file.id}')
            logger.debug(f'Raw File "{updated_file.name}" updated to Box with file ID {updated_file.id}')
            flag = True
            break

    # if file doesn not exists upload the file
    if not flag:
        uploaded_file = sub_folder.upload(inca_path_data_raw)
        print('Raw File "{0}" has been uploaded'.format(uploaded_file.name))
        logger.debug(f'Raw File "{uploaded_file.name}" uploaded to Box with file ID {uploaded_file.id}')
    return None

In [9]:
try:
    qr = qr_client(user_email=config.USER_EMAIL)
    cursor = qr.execute('presto', query=HIVE_QUERY, pii=False, timeout=10800)
    end_time = time.time()
    interval = time.strftime('%H:%M:%S', time.gmtime(end_time-start_time))
    logger.debug('Hive data load into dataframe {}'.format(interval))
    df = cursor.to_pandas()
    datetime_str_const = datetime.now().strftime('%d%b%Y_%H%M%S')
    folder = client.folder(folder_id=config.OUTPUT_FOLDER_ID_INCA)
    subfolder_naming_format = 'INCA_QB_'+datetime_str_const
    sub_folder = folder.create_subfolder(name=subfolder_naming_format)
    if len(df) > 0:
        dedup(df,sub_folder)
        raw(df,sub_folder) 
        message_success = {"personalizations": [{
    "to": [{
            "email": "phegde1@ext.uber.com"
        }, {
            "email": "ssethu@ext.uber.com"
        },
           {
            "email": " amoham117@ext.uber.com"
        },
            {
            "email": "pyagat@ext.uber.com"
        },
            {
            "email": "rsujay@ext.uber.com"
        },
            {
            "email": " vbanap@ext.uber.com"
        },
            {
            "email": "rkunta@ext.uber.com"
        },
            {
            "email": " iacoe-email-notification-service@uber.pagerduty.com"
        }],
    "cc": [ {
            "email": "sshaik26@ext.uber.com"
        },
        {
            "email": "spothi1@ext.uber.com"
        },
        {
            "email": "dampol@ext.uber.com"
        },
        {
            "email": "vvemav@ext.uber.com"
        },
        {
            "email": "anevre@ext.uber.com"
        },
        {
            "email": "jghosh2@ext.uber.com"
        }],

            'subject': 'TaxML_INCA Daily Run ML Job Sucess Notification'
        }
    ],
                           
    "from": {
    "email": "ia-coe-support-group@uber.com"
},
    'content': [
        {
            'type': 'text/html',
            "value": "<html><head></head><body>Hello!<br><br>TaxML INCA ML Job  uploaded file in this location(https://uber.app.box.com/folder/"+sub_folder.id+") successfully.<br><br>You can reach out to ia-coe-support-group@uber.com for any questions or concerns.<br><br><br>Thanks,<br><b>Intelligent Automation COE Support Group<b></body></html>"

        }
]
}

        response = sg.client.mail.send.post(request_body=message_success)
        logger.debug('Taxml automated prediction email notification has been sent successfully...')
    else:
        logger.error('No Data Fetched from hive db...')
        message_bussiness_exception = {
    'personalizations': [
        {
    "to": [{
            "email": "phegde1@ext.uber.com"
        }, {
            "email": "ssethu@ext.uber.com"
        },
           {
            "email": " amoham117@ext.uber.com"
        },
            {
            "email": "pyagat@ext.uber.com"
        },
            {
            "email": "rsujay@ext.uber.com"
        },
            {
            "email": " vbanap@ext.uber.com"
        },
            {
            "email": "rkunta@ext.uber.com"
        },
            {
            "email": " iacoe-email-notification-service@uber.pagerduty.com"
        }],
     "cc": [ {
            "email": "sshaik26@ext.uber.com"
        },
        {
            "email": "spothi1@ext.uber.com"
        },
        {
            "email": "dampol@ext.uber.com"
        },
        {
            "email": "vvemav@ext.uber.com"
        },
        {
            "email": "anevre@ext.uber.com"
        },
        {
            "email": "jghosh2@ext.uber.com"
        }],
            'subject': 'TaxML_INCA Daily Run ML Job Failure Notification :: BussinessException'
        }
    ],
    'from': {
        'email': 'ia-coe-support-group@uber.com'
    },
    'content': [
        {
            'type': 'text/html',
            "value": "<html><head></head><body>Hello!<br><br>TaxML_INCA ML Job has been failed with the below error: <br><br> No Data Present in Hive DB.<br><br>Please reach Out To ia-coe-support-group@uber.com For any questions Or concerns.<br><br><br>Thanks,<br><b>Intelligent Automation COE Support Group<b></body></html>"
  
        }
    ]
}

        response=sg.client.mail.send.post(request_body=message_bussiness_exception)
except Exception as e:
    logger.error('Exception details::{}'.format(e), exc_info=True)
    message_technical_exception = {
    'personalizations': [
        {
    "to": [{
            "email": "phegde1@ext.uber.com"
        }, {
            "email": "ssethu@ext.uber.com"
        },
           {
            "email": " amoham117@ext.uber.com"
        },
            {
            "email": "pyagat@ext.uber.com"
        },
            {
            "email": "rsujay@ext.uber.com"
        },
            {
            "email": " vbanap@ext.uber.com"
        },
            {
            "email": "rkunta@ext.uber.com"
        },
            {
            "email": " iacoe-email-notification-service@uber.pagerduty.com"
        }],
     "cc": [ {
            "email": "sshaik26@ext.uber.com"
        },
        {
            "email": "spothi1@ext.uber.com"
        },
        {
            "email": "dampol@ext.uber.com"
        },
        {
            "email": "vvemav@ext.uber.com"
        },
        {
            "email": "anevre@ext.uber.com"
        },
        {
            "email": "jghosh2@ext.uber.com"
        }],
            'subject': 'TaxML_INCA Daily Run ML Job Failure Notification :: TechnicalException'
        }
    ],

    'from': {
        'email': 'ia-coe-support-group@uber.com'
    },
    'content': [
        {
            'type': 'text/html',
            "value": "<html><head></head><body>Hello!<br><br>TaxML_INCA ML Job has been failed with the below error: <br><br>Error Message: "+str(e)+".<br><br>Please reach Out To ia-coe-support-group@uber.com For any questions Or concerns.<br><br><br>Thanks,<br><b>Intelligent Automation COE Support Group<b></body></html>"
  
        }
    ]
}
    response = sg.client.mail.send.post(request_body=message_technical_exception)
finally:
    end_time = time.time()
    interval = time.strftime('%H:%M:%S', time.gmtime(end_time-start_time))
    logger.debug('Total program execution time for ml prediction TAXML INCA {}\n\n\n\n'.format(interval))

08/22/2023 10:49:43 AM Send empty tier_metadata {} to Queryrunner V2.
08/22/2023 10:49:43 AM [93m [Polling] fd0d8d26-0d26-41f8-9f7a-5624623ff556 [0m
08/22/2023 10:49:43 AM [93m [Status] created [0m
08/22/2023 10:49:44 AM [93m [Status] finished auth check [0m
08/22/2023 10:49:45 AM [93m [Status] started waiting to execute [0m
08/22/2023 10:50:51 AM [93m [Status] started execution [0m
08/22/2023 10:59:47 AM [93m [Status] completed success [0m
08/22/2023 10:59:47 AM [92m [Query Success] completed success [0m
2023-08-22 10:59:47,403 main DEBUG Hive data load into dataframe 00:10:04
2023-08-22 11:08:01,153 main INFO Incoming dedup data size: (87858, 17)


(87858, 17)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

Input_Deduped_22Aug2023_110804.csv


2023-08-22 11:08:05,031 main DEBUG Uploading dedup file with ml prediction data in BOX....
2023-08-22 11:08:10,683 main DEBUG Dedup File "Input_Deduped_22Aug2023_110804.csv" uploaded to Box with file ID 1286872556898
2023-08-22 11:08:10,692 main INFO Incoming raw data size: (900000, 17)


Dedup File "Input_Deduped_22Aug2023_110804.csv" has been uploaded
Input_RawData_22Aug2023_111642.csv


2023-08-22 11:16:56,059 main DEBUG Uploading Raw file with ml prediction data in BOX....
2023-08-22 11:17:57,196 main DEBUG Raw File "Input_RawData_22Aug2023_111642.csv" uploaded to Box with file ID 1286890780380


Raw File "Input_RawData_22Aug2023_111642.csv" has been uploaded


2023-08-22 11:17:57,526 main DEBUG Taxml automated prediction email notification has been sent successfully...
2023-08-22 11:17:57,527 main DEBUG Total program execution time for ml prediction TAXML INCA 00:28:14




