In [1]:
import sys
import os 
import pandas as pd
import numpy as np
from datetime import datetime
import csv
import shutil
import pymysql
import pymongo
from pymongo import MongoClient
import sqlalchemy

### Mysql Connection

In [434]:
client=MongoClient('mongodb://localhost:27017')
db = client.raxdb
faresheet = "CXfaresheets_new"

In [435]:
# version checking
sqlalchemy.__version__ 

'1.2.7'

In [436]:
# connecting:
from sqlalchemy import create_engine
path = 'mysql+pymysql://jialuc:test@localhost/genie'
engine = create_engine(path, echo=True)

In [437]:
conn = engine.connect()

2018-08-15 16:09:23,722 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'sql_mode'
2018-08-15 16:09:23,723 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 16:09:24,240 INFO sqlalchemy.engine.base.Engine SELECT DATABASE()
2018-08-15 16:09:24,241 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 16:09:24,247 INFO sqlalchemy.engine.base.Engine show collation where `Charset` = 'utf8' and `Collation` = 'utf8_bin'
2018-08-15 16:09:24,248 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 16:09:24,275 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS CHAR(60)) AS anon_1
2018-08-15 16:09:24,276 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 16:09:24,285 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS CHAR(60)) AS anon_1
2018-08-15 16:09:24,286 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 16:09:24,289 INFO sqlalchemy.engine.base.Engine SELECT CAST('test collated returns' AS CHAR CHARACTER SET utf8) COLLATE utf8_bin AS anon_1
2018-08-15 16

### Functions 

In [457]:
def get_comm_per(conn, country):
    # get total count
    s = text("select count(*) from cx_dw.comm_sales_tkt where rpt_cntry_cd=:country ")
    result = conn.execute(s, country=country).fetchone()
    total = result[0]
    country = "'"+country+"'"
    print("Total number of tickets: ", total)
    # prepare statement
    stmt = "select comm_rate, eff_comm_rate, count(*), count(*)/"+str(total)+ """ as percentage from cx_dw.comm_sales_tkt where rpt_cntry_cd=""" + country + " group by comm_rate, eff_comm_rate;"
    comm_per = pd.read_sql(stmt, con=conn)
    comm_per.sort_values(by="count(*)", inplace=True)
    return comm_per

In [438]:
# test function with  
comm_per = get_comm_per(conn, 'CN')

2018-08-15 16:09:27,833 INFO sqlalchemy.engine.base.Engine select count(*) from cx_dw.comm_sales_tkt where rpt_cntry_cd=%(country)s 
2018-08-15 16:09:27,834 INFO sqlalchemy.engine.base.Engine {'country': 'CN'}
Total number of tickets:  799586
2018-08-15 16:09:42,634 INFO sqlalchemy.engine.base.Engine DESCRIBE `select eff_comm_rate, count(*), count(*)/799586 as percentage from cx_dw.comm_sales_tkt where rpt_cntry_cd='CN' group by eff_comm_rate;`
2018-08-15 16:09:42,634 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 16:09:42,639 INFO sqlalchemy.engine.base.Engine ROLLBACK
2018-08-15 16:09:42,642 INFO sqlalchemy.engine.base.Engine select eff_comm_rate, count(*), count(*)/799586 as percentage from cx_dw.comm_sales_tkt where rpt_cntry_cd='CN' group by eff_comm_rate;
2018-08-15 16:09:42,644 INFO sqlalchemy.engine.base.Engine {}


In [540]:
def get_comm_tourcodes(conn, country, market_comm):
    country = "'"+country+"'"
    stmt="""select count(*) as count, tour_cd, eff_comm_rate from cx_dw.comm_sales_tkt
    where rpt_cntry_cd="""+country+" and eff_comm_rate <> "+str(market_comm)+"""
    and (eff_comm_rate<>0.0000)
    group by tour_cd, eff_comm_rate;
    """
    stmt = stmt.replace("\n", "")
    print(stmt)
    comm_tourcodes = pd.read_sql(stmt, con=conn)
    return comm_tourcodes

In [541]:
def get_target_comm_tourcodes(conn, country, target_comm):
    country = "'"+country+"'"
    stmt="""select count(*) as count, tour_cd, eff_comm_rate from cx_dw.comm_sales_tkt
    where rpt_cntry_cd="""+country+" and (eff_comm_rate between "+str(target_comm-0.002)+" and "+str(target_comm+0.002)+""") 
    and (eff_comm_rate<>0.0000)
    group by tour_cd, eff_comm_rate;
    """
    stmt = stmt.replace("\n", "")
    print(stmt)
    comm_tourcodes = pd.read_sql(stmt, con=conn)
    return comm_tourcodes

In [408]:
def get_a_tourcode_list(comm_tourcodes):
    # get a list of the tourcodes and count the number of tickets without tourcodes
    comm_tourcode_list = list(comm_tourcodes['tour_cd'].unique())
    try:
        comm_tourcode_list.remove('')
    except:
        print("no need to remove blank")
    return comm_tourcode_list

def get_tourcode_list(comm_list):
    tour_cd_list = []
    for comm in comm_list:
        tour_cd_list += get_a_tourcode_list(comm)
    return list(set(tour_cd_list))      
# tour_cd_list = get_tourcode_list([comm_tourcodes_10P, comm_tourcodes_5P, comm_tourcodes_4P])

In [418]:
def find_tourcodes(country, rate, target_rate=False):
    if target_rate==True:
        comm_list=[]
        for r in rate:
            comm_list.append(get_target_comm_tourcodes(conn, country, r))
        tour_cd_list = get_tourcode_list(comm_list)
    else:
        comm_tourcodes = get_comm_tourcodes(conn, country, rate)
        tour_cd_list = list(set(list(comm_tourcodes['tour_cd'])))
        if '' in tour_cd_list:
            tour_cd_list.remove('')
    return tour_cd_list
#tour_cd_list = find_tourcodes([0.1, 0.05, 0.04])

In [422]:
def get_match_df(db, faresheet, tourcode_list):
    match_docs = [doc for doc in db[faresheet].find({"tourcodes": {"$in": tourcode_list}}, 
                                               {"filename":1, "country":1, "path":1, "tourcodes":1, 
                                                "predictions":1, "classifications":1})]
    match_df = pd.DataFrame(match_docs)
    match_df['gold'] = ''
    match_df['pred'] = ''
    # labelling
    for index, row in match_df.iterrows():
        try:
            match_df.loc[index, 'gold'] = row['classifications']['Commission']
        except:
            print("Can't get commission gold: ", row['filename'])
        match_df.loc[index, 'pred'] = row['predictions']['Commission']
    #match_df.drop(['classifications', 'predictions'], axis=1, inplace=True)
    mask1 = match_df['gold']!='yes'
    mask2 = match_df['gold']!='no'
    match_df = match_df.loc[mask1&mask2]
    return match_df

In [2]:
#match_df = get_match_df(db, faresheet, tour_cd_list)

In [431]:
import random 
def sample_save(match_df, sample=100, country=None):
    if country:
        match_df = match_df.loc[match_df['country']==country]
    match_df.reset_index(inplace=True, drop=True)
    check = match_df.loc[random.sample(range(len(match_df)), sample)]
    tcols=['country', 'filename', 'gold', 'pred', 'path', 'tourcodes']
    check[tcols].to_csv('check list/'+str(country)+"_tourcode_sample_check.csv")
    return check[tcols]

def save(match_df, country=None):
    if country:
        match_df = match_df.loc[match_df['country']==country]
    if len(match_df)>100:
        return sample_save(match_df, 100, country)
    else:
        match_df[['country', 'filename', 'gold', 'pred', 'path', 'tourcodes']].to_csv('check list/'+str(country)+"_tourcode_sample_check.csv")
        return match_df[['country', 'filename', 'gold', 'pred', 'path', 'tourcodes']]

### Production 

In [603]:
cnty = 'BE'

In [604]:
comm_per = get_comm_per(conn, cnty)

2018-08-15 17:32:23,194 INFO sqlalchemy.engine.base.Engine select count(*) from cx_dw.comm_sales_tkt where rpt_cntry_cd=%(country)s 
2018-08-15 17:32:23,195 INFO sqlalchemy.engine.base.Engine {'country': 'BE'}
Total number of tickets:  4366
2018-08-15 17:32:27,542 INFO sqlalchemy.engine.base.Engine DESCRIBE `select comm_rate, eff_comm_rate, count(*), count(*)/4366 as percentage from cx_dw.comm_sales_tkt where rpt_cntry_cd='BE' group by comm_rate, eff_comm_rate;`
2018-08-15 17:32:27,542 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 17:32:27,554 INFO sqlalchemy.engine.base.Engine ROLLBACK
2018-08-15 17:32:27,557 INFO sqlalchemy.engine.base.Engine select comm_rate, eff_comm_rate, count(*), count(*)/4366 as percentage from cx_dw.comm_sales_tkt where rpt_cntry_cd='BE' group by comm_rate, eff_comm_rate;
2018-08-15 17:32:27,558 INFO sqlalchemy.engine.base.Engine {}


In [605]:
comm_per.tail(10)

Unnamed: 0,comm_rate,eff_comm_rate,count(*),percentage
1,0.01,0.01,283,0.0648
0,0.0,0.0,4083,0.9352


In [606]:
tour_cd_list = find_tourcodes(cnty, 0.0)
tour_cd_list 

select count(*) as count, tour_cd, eff_comm_rate from cx_dw.comm_sales_tkt    where rpt_cntry_cd='BE' and eff_comm_rate <> 0.0    and (eff_comm_rate<>0.0000)    group by tour_cd, eff_comm_rate;    
2018-08-15 17:32:33,104 INFO sqlalchemy.engine.base.Engine DESCRIBE `select count(*) as count, tour_cd, eff_comm_rate from cx_dw.comm_sales_tkt    where rpt_cntry_cd='BE' and eff_comm_rate <> 0.0    and (eff_comm_rate<>0.0000)    group by tour_cd, eff_comm_rate;    `
2018-08-15 17:32:33,113 INFO sqlalchemy.engine.base.Engine {}
2018-08-15 17:32:33,119 INFO sqlalchemy.engine.base.Engine ROLLBACK
2018-08-15 17:32:33,122 INFO sqlalchemy.engine.base.Engine select count(*) as count, tour_cd, eff_comm_rate from cx_dw.comm_sales_tkt    where rpt_cntry_cd='BE' and eff_comm_rate <> 0.0    and (eff_comm_rate<>0.0000)    group by tour_cd, eff_comm_rate;    
2018-08-15 17:32:33,123 INFO sqlalchemy.engine.base.Engine {}


['ITC3FF600', 'FR73FF600', 'BRU700AFF603']

In [607]:
match_df = get_match_df(db, faresheet, tour_cd_list)

Can't get commission gold:  BRU700AFF600 BCODE BEAIBVIz EFF 01JUN17-31MAY18_secret.html
Can't get commission gold:  BRU700AFF603 BCODE BEALLXXz EFF 01OCT17-30SEP18_secret.html
Can't get commission gold:  BRU700AFF600 BCODE BEBABYLz EFF 01JUN17-31DEC17_secret.html
Can't get commission gold:  BRU700AFF600 BCODE BEVELDEz EFF 01MAR17-28FEB18_secret.html
Can't get commission gold:  AMS700_&_BRU700_WEF_01OCT16-30SEP17_DEP_01OCT16-30SEP18.html
Can't get commission gold:  FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE_PENDING_FSCII___AMSMAS.html
Can't get commission gold:  FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE_PENDING_FSCII___AMSMAS.html
Can't get commission gold:  FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE___EXP30NOV17.html
Can't get commission gold:  FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE___EXP30NOV17.html
Can't get commission gold:  FIT_PAB_15SEP2016-31DEC2017_AMS700_FOR_(DE)FRA.html
Can't get commission gold:  FIT_PAB_MASTERFILE_PENDING_FSCII___Belux_Corpo__WEF_15MAR17-31OCT17_DEP_15MAR17-31OCT18

In [608]:
match_df

Unnamed: 0,_id,classifications,country,filename,path,predictions,tourcodes,gold,pred
0,5b2b57d1eb1c8206a8cd9998,"{'faretype': 'undefined', 'commission': 'undef...",BE,BRU700AFF600 BCODE BEAIBVIz EFF 01JUN17-31MAY1...,Z:\2017\BE\BRU700AFF600 BCODE BEAIBVIz EFF 01J...,{'Commission': 'no'},"[BRU700AFF600, BRU700AFF603]",,no
1,5b2b57d2eb1c8206a8cd99a3,"{'faretype': 'undefined', 'commission': 'undef...",BE,BRU700AFF603 BCODE BEALLXXz EFF 01OCT17-30SEP1...,Z:\2017\BE\BRU700AFF603 BCODE BEALLXXz EFF 01O...,{'Commission': 'no'},[BRU700AFF603],,no
2,5b2b57d1eb1c8206a8cd9999,"{'faretype': 'undefined', 'commission': 'undef...",BE,BRU700AFF600 BCODE BEBABYLz EFF 01JUN17-31DEC1...,Z:\2017\BE\BRU700AFF600 BCODE BEBABYLz EFF 01J...,{'Commission': 'no'},"[BRU700AFF600, BRU700AFF603]",,no
3,5b2b57d2eb1c8206a8cd999f,"{'faretype': 'undefined', 'commission': 'undef...",BE,BRU700AFF600 BCODE BEVELDEz EFF 01MAR17-28FEB1...,Z:\2017\BE\BRU700AFF600 BCODE BEVELDEz EFF 01M...,{'Commission': 'no'},"[BRU700AFF600, BRU700AFF603]",,no
5,5b31bd2ceb1c822d2c45f74c,"{'faretype': 'undefined', 'commission': 'undef...",BNL,AMS700_&_BRU700_WEF_01OCT16-30SEP17_DEP_01OCT1...,Z:\2017\BNL\AMS700 & BRU700 WEF 01OCT16-30SEP1...,{'Commission': 'no'},"[AMS700AFF603, BRU700AFF603, AMS700AFF602, BRU...",,no
6,5b31bd2ceb1c822d2c45f779,"{'faretype': 'undefined', 'commission': 'undef...",BNL,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE_PENDIN...,Z:\2017\BNL\FIT PAB 01SEP 2017-31OCT2018-MASTE...,{'Commission': 'no'},"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL...",,no
7,5b31bd2ceb1c822d2c45f721,"{'faretype': 'undefined', 'commission': 'undef...",BE,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE_PENDIN...,Z:\2017\BE\FIT PAB 01SEP 2017-31OCT2018-MASTER...,{'Commission': 'no'},"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL...",,no
8,5b31bd2ceb1c822d2c45f77a,"{'faretype': 'undefined', 'commission': 'undef...",BNL,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE___EXP3...,Z:\2017\BNL\FIT PAB 01SEP 2017-31OCT2018-MASTE...,{'Commission': 'no'},"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL...",,no
9,5b31bd2ceb1c822d2c45f722,"{'faretype': 'undefined', 'commission': 'undef...",BE,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE___EXP3...,Z:\2017\BE\FIT PAB 01SEP 2017-31OCT2018-MASTER...,{'Commission': 'no'},"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL...",,no
10,5b31bd2ceb1c822d2c45f77b,"{'faretype': 'undefined', 'commission': 'undef...",BNL,FIT_PAB_15SEP2016-31DEC2017_AMS700_FOR_(DE)FRA...,Z:\2017\BNL\FIT PAB 15SEP2016-31DEC2017 AMS700...,{'Commission': 'no'},"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL...",,no


In [609]:
save(match_df)

Unnamed: 0,country,filename,gold,pred,path,tourcodes
0,BE,BRU700AFF600 BCODE BEAIBVIz EFF 01JUN17-31MAY1...,,no,Z:\2017\BE\BRU700AFF600 BCODE BEAIBVIz EFF 01J...,"[BRU700AFF600, BRU700AFF603]"
1,BE,BRU700AFF603 BCODE BEALLXXz EFF 01OCT17-30SEP1...,,no,Z:\2017\BE\BRU700AFF603 BCODE BEALLXXz EFF 01O...,[BRU700AFF603]
2,BE,BRU700AFF600 BCODE BEBABYLz EFF 01JUN17-31DEC1...,,no,Z:\2017\BE\BRU700AFF600 BCODE BEBABYLz EFF 01J...,"[BRU700AFF600, BRU700AFF603]"
3,BE,BRU700AFF600 BCODE BEVELDEz EFF 01MAR17-28FEB1...,,no,Z:\2017\BE\BRU700AFF600 BCODE BEVELDEz EFF 01M...,"[BRU700AFF600, BRU700AFF603]"
5,BNL,AMS700_&_BRU700_WEF_01OCT16-30SEP17_DEP_01OCT1...,,no,Z:\2017\BNL\AMS700 & BRU700 WEF 01OCT16-30SEP1...,"[AMS700AFF603, BRU700AFF603, AMS700AFF602, BRU..."
6,BNL,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE_PENDIN...,,no,Z:\2017\BNL\FIT PAB 01SEP 2017-31OCT2018-MASTE...,"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL..."
7,BE,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE_PENDIN...,,no,Z:\2017\BE\FIT PAB 01SEP 2017-31OCT2018-MASTER...,"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL..."
8,BNL,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE___EXP3...,,no,Z:\2017\BNL\FIT PAB 01SEP 2017-31OCT2018-MASTE...,"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL..."
9,BE,FIT_PAB_01SEP_2017-31OCT2018-MASTERFILE___EXP3...,,no,Z:\2017\BE\FIT PAB 01SEP 2017-31OCT2018-MASTER...,"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL..."
10,BNL,FIT_PAB_15SEP2016-31DEC2017_AMS700_FOR_(DE)FRA...,,no,Z:\2017\BNL\FIT PAB 15SEP2016-31DEC2017 AMS700...,"[BNL200AFF510, BNL210AFF510, BNL100AFF590, BNL..."


### Other markets2 (no tickets for market): SEAMAN, BKI, GUM, ME

In [None]:
docs = [doc for doc in db[faresheet].find({"country": "SEAMAN", ""})]