# MCE Refresh Newsletter

In [None]:
import os, ujson
from collections import OrderedDict
from datetime import datetime, timedelta
from tchannel import thrift
from tchannel.sync import TChannel
from pyspark.sql.functions import col
import StringIO

In [None]:
#user_dir = '/mnt/cephfs/hadoop-compute/phoenix/'.format(os.environ['USER'])
#os.chdir(user_dir)
#print(user_dir)
# print os.listdir(user_dir)
# Remember the directory the notebook kernel is running in; `cwd` is
# referenced again by the cell that loads the Postmaster Thrift IDL.
cwd = os.path.abspath(os.curdir)
print(cwd)

## Postmaster service

In [None]:
# Load the Postmaster Thrift IDL so that its structs and service methods can be
# used from Python. postmaster.thrift is an IDL file; see
# https://engdocs.uberinternal.com/postmaster/get_started.html for where to get it.
#
# The path is anchored to `cwd` explicitly. The previous
# `"postmaster.thrift".format(cwd)` had no `{}` placeholder, so `cwd` was
# silently ignored and the file was resolved relative to whatever the process
# working directory happened to be at call time.
postmaster_service = thrift.load(
    path=os.path.join(cwd, "postmaster.thrift"),
    service="postmaster",
)

# Alternative peer discovery via the Hyperbahn hosts file, kept for reference:
# with open("/etc/uber/hyperbahn/hosts.json") as f:
#     known_peers = ujson.load(f)
#     print known_peers
# tchannel = TChannel(name="places", known_peers=known_peers)

# Create the synchronous TChannel client used to issue Thrift calls; it is
# pointed at a single peer at 127.0.0.1:5437.
tchannel = TChannel(name="places", known_peers=["127.0.0.1:5437"])

In [None]:
# spark.sql("SELECT * FROM rawdata.kafka_hp_places_workflow_controller_pe_se_comparison_nodedup").show()
# spark.sparkContext.getConf().getAll()


## Query for refreshed places last week

In [None]:
# Reporting window: the seven days ending today. The query below selects
# datestr values with `> start_date` and `<= end_date`.
current = datetime.now()
last_week = current - timedelta(days=7)

# One row per distinct (comparison, edit task, country) combination for places
# refreshed with new content during the window.
#
# NOTE(review): T is LEFT JOINed, but the WHERE clause filters on T.msg.* and
# T.datestr, which discards every row where T did not match -- the join
# effectively behaves as an INNER JOIN. If comparisons without an edit task
# are supposed to be reported, those predicates belong in the ON clause.
# Confirm the intent before changing, since it alters every reported number.
refresh_query = """
    SELECT DISTINCT
        W.datestr,
        W.msg.channel,
        W.msg.se_id,
        W.msg.pe_id,
        W.msg.has_name_diff,
        W.msg.has_address_location_diff,
        W.msg.has_status_diff,
        T.msg.edit_task_id,
        T.msg.is_new_task,
        W.msg.provider,
        P.country AS country
    FROM
        rawdata.kafka_hp_places_workflow_controller_pe_se_comparison_nodedup W
    LEFT JOIN
      rawdata_user.kafka_hp_places_workflow_controller_edit_task_from_change_request_nodedup T
    ON
        W.msg.request_uuid = T.msg.request_uuid
    JOIN
        (
            SELECT 
                countrycode As country,
                placeuuid AS pe_id
            FROM
                map_creation.mce_places_build
            WHERE
                buidversion in (SELECT max(buidversion) FROM map_creation.mce_places_build)
        ) P
    ON
      W.msg.pe_id = P.pe_id
   WHERE
        T.msg.is_prod_run = true AND
        T.datestr > '{start_date}' AND 
        T.datestr <=  '{end_date}' AND
        W.datestr > '{start_date}' AND 
        W.datestr <=  '{end_date}' AND
        W.msg.has_new_content = true
""".format(start_date=last_week.strftime("%Y-%m-%d"), end_date=current.strftime("%Y-%m-%d"))

# Uncomment to inspect the generated SQL:
# print refresh_query

# This DataFrame feeds many separate Spark actions further down (counts,
# show, collects). Without caching, each action re-executes the full join and
# may see different source data, so the figures in a single email could
# disagree with each other. Cache it so the query is evaluated once.
refreshed_place_df = spark.sql(refresh_query).cache()
# refreshed_place_df.show()

## Place curation tasks

In [None]:
total = refreshed_place_df.count()

uniq_curation_task_count = refreshed_place_df \
    .where(col('edit_task_id').isNotNull()) \
    .select('edit_task_id') \
    .distinct() \
    .count()

new_curation_task_count = refreshed_place_df \
    .filter(refreshed_place_df['is_new_task'] == True) \
    .select('edit_task_id') \
    .distinct() \
    .count()

print 'Total refreshed places: ', total
print 'Unique curation task: ', uniq_curation_task_count
print 'New curation task: ', new_curation_task_count

## Refreshed place channel breakdown

In [None]:
# Row count per channel, sorted by channel name.
channel_count_df = (
    refreshed_place_df
    .groupBy('channel')
    .count()
    .sort('channel')
)

channel_count_df.show()

In [None]:
channel_count_dict = channel_count_df.rdd.map(lambda row: row.asDict()).collect()
print channel_count_dict

## Provider channel refresh summary

In [None]:
provider_refresh_count = refreshed_place_df.filter(refreshed_place_df["channel"] == 'PROVIDER').count()
name_diff_count = refreshed_place_df.filter(refreshed_place_df["has_name_diff"] == True).count()
address_diff_count = refreshed_place_df.filter(refreshed_place_df["has_address_location_diff"] == True).count()
status_diff_count = refreshed_place_df.filter(refreshed_place_df["has_status_diff"] == True).count()
print 'Provider refresh count: ', provider_refresh_count
print 'Name diff count: ', name_diff_count
print 'Address diff count: ', address_diff_count
print 'Status diff count: ', status_diff_count

## Provider channel refresh breakdown

In [None]:
# PROVIDER-channel rows counted per (country, provider, is_new_task),
# sorted by the same three columns.
provider_breakdown_df = (
    refreshed_place_df
    .where(col("channel") == 'PROVIDER')
    .groupBy('country', 'provider', 'is_new_task')
    .count()
    .sort('country', 'provider', 'is_new_task')
)

# Collected to the driver as a list of dicts for the email rendering step.
provider_breakdown_dict = [row.asDict() for row in provider_breakdown_df.collect()]

# provider_breakdown_df.show()

## Other channel refresh

In [None]:
# Row count per channel for every channel other than PROVIDER,
# sorted by channel name.
other_refresh_count_df = (
    refreshed_place_df
    .where(col("channel") != 'PROVIDER')
    .groupBy('channel')
    .count()
    .sort('channel')
)

In [None]:
other_refresh_count_dict = other_refresh_count_df.rdd.map(lambda row: row.asDict()).collect()
print other_refresh_count_dict

## Other channel breakdowns

In [None]:
# Non-provider rows counted per (country, channel, is_new_task),
# sorted by the same three columns.
other_breakdown_df = (
    refreshed_place_df
    .where(col("channel") != 'PROVIDER')
    .groupBy('country', 'channel', 'is_new_task')
    .count()
    .sort('country', 'channel', 'is_new_task')
)

# Collected to the driver as a list of dicts for the email rendering step.
other_breakdown_dict = [row.asDict() for row in other_breakdown_df.collect()]

# Email notification

In [None]:
# Assemble the newsletter HTML into an in-memory buffer. The document is
# written top to bottom in fragments: header and weekly totals, the provider
# section (summary + breakdown table), the other-channels section
# (summary + breakdown table), and finally the closing tags.
refresh_email_content = StringIO.StringIO()
today_text = current.strftime('%Y-%m-%d')

# Header and weekly totals.
# The first total is the refreshed-place row count (`total`), so it is labelled
# "Total refreshed places" -- matching the notebook output above -- rather than
# "Total curation tasks", which mislabelled the figure.
top_text = '''
    <!DOCTYPE html>
    <html>
        <head>
            <meta charset="utf-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Places MCE Refresh of Week from {week_start} to {week_end}</title>
            <meta name="description" content="Places MCE Refresh Weekly Newsletter">
        </head>
        <body>
            <em>This is an autogenerated email on MCE refresh for week from {week_start} to {week_end}</em>

            <div>
                <h2>Weekly refresh totals</h2>
                <div>Total refreshed places: <b>{refreshed_place_total}</b></div>
                <div>Unique curation tasks: <b>{uniq_curation_task_count}</b></div>
                <div>New curation tasks: <b>{new_curation_task_count}</b></div>
            </div>
'''.format(week_start=last_week.strftime("%Y-%m-%d"),
           week_end=today_text,
           refreshed_place_total=total,
           uniq_curation_task_count=uniq_curation_task_count,
           new_curation_task_count=new_curation_task_count)
refresh_email_content.write(top_text)

# Provider channel refresh: opens the section <div>, closed after the
# breakdown table below.
refresh_email_content.write('''
            <div>
                <h2>Provider Refresh</h2>
''')

# Provider channel refresh summary
provider_summary_text = '''
                <div>
                    <h3>Summary</h3>
                    <table>
                        <tr><td>Total: </td><td>{}</td></tr>
                        <tr><td>Name diff: </td><td>{}</td></tr>
                        <tr><td>Address diff: </td><td>{}</td></tr>
                        <tr><td>Status diff: </td><td>{}</td></tr>
                    </table>
                </div>
                '''.format(provider_refresh_count, name_diff_count, address_diff_count, status_diff_count)
refresh_email_content.write(provider_summary_text)

# Provider channel refresh breakdowns start
refresh_email_content.write('''
                <div>
                    <h3>Breakdowns</h3>
                    <div>
                        <table>
                            <tr>
                                <th>Country</th><th>Provider</th><th>Task Type</th><th>Count</th>
                            </tr>''')

# One table row per (country, provider, is_new_task) group.
for row in provider_breakdown_dict:
    refresh_email_content.write('''
                           <tr>
                                <td>{country}</td><td>{provider}</td><td>{task_type}</td><td>{count}</td>
                           </tr>
                '''.format(country=row['country'],
                           provider=row['provider'],
                           task_type='New' if row['is_new_task'] else 'Updated',
                           count=row['count']))

# Provider channel refresh end: closes the table, its two wrapper <div>s and
# the provider section <div>.
refresh_email_content.write('''
                        </table>
                    </div>
                </div>
            </div>''')


# Other channel refresh: opens the section <div> (closed by the bottom text)
# and the per-channel summary table.
refresh_email_content.write('''
            <div>
            <h2>Other Channels Refresh</h2>
                <div>
                    <h3>Summary</h3>
                    <div>
                        <table>
                            <tr><th>Channel</th><th>Count</th></tr>''')

# One table row per non-provider channel.
for row in other_refresh_count_dict:
    refresh_email_content.write('''
                           <tr><td>{channel}</td><td>{count}</td></tr>
                '''.format(channel=row['channel'], count=row['count']))


# Other channel refresh breakdowns start (also closes the summary table).
refresh_email_content.write('''
                        </table>
                    </div>
                </div>
                <div>
                    <h3>Breakdowns</h3>
                    <div>
                        <table>
                            <tr>
                                <th>Country</th><th>Channel</th><th>Task Type</th><th>Count</th>
                            </tr>''')

# One table row per (country, channel, is_new_task) group.
for row in other_breakdown_dict:
    refresh_email_content.write('''
                           <tr>
                                <td>{country}</td><td>{channel}</td><td>{task_type}</td><td>{count}</td>
                           </tr>
                '''.format(country=row['country'],
                           channel=row['channel'],
                           task_type='New' if row['is_new_task'] else 'Updated',
                           count=row['count']))


# Other channel refresh breakdowns end
refresh_email_content.write('''
                        </table>
                    </div>
                </div>''')

# Bottom text: closes the other-channels section <div>, then body and html.
refresh_email_content.write('''
            </div>
        </body>
    </html>''')

In [None]:
# print refresh_email_content.getvalue()

In [None]:
subject_arg = current.strftime('Places MCE Refresh Email of Week %Y-%m-%d')

rawEmail_struct = postmaster_service.RawEmail(subject = subject_arg, richBody = refresh_email_content.getvalue())
content_arg = postmaster_service.Content(rawEmail = rawEmail_struct)

to_arg = postmaster_service.Recipient(emailAddress = 'maps-places@uber.com')
cc_arg = [postmaster_service.Recipient(emailAddress = 'maps-places-pgms@uber.com')]

recipients_struct = postmaster_service.Recipients(to = to_arg, cc = cc_arg)

fromEmail_arg = 'noreply@uber.com'
messageType_arg = 'internal'
request_struct = postmaster_service.Request(fromEmail = fromEmail_arg, recipients = recipients_struct, content = content_arg, messageType=messageType_arg)

future = tchannel.thrift(postmaster_service.Postmaster.testEmail(request = request_struct))
print future.result(timeout = 30000).body