In [98]:
import pandas as pd
import re
import numpy as np
import unicodedata
from IPython.display import display, HTML

pd.set_option("display.max_columns", 100)

In [99]:
def convert_to_lower_case(data):
    if type(data) is dict:
        for k, v in data.items():
            if type(v) is str:
                data[k] = v.lower()
            elif type(v) is list:
                data[k] = [x.lower() for x in v]
            elif type(v) is dict:
                data[k] = convert_to_lower_case(v)
    return data

In [128]:
files_to_read = [
    {
        'output_file' : '2016-17_pearl', \
        'year': '2016-2017', \
        'xls_file' : 'appendix_18_reconciliation_sheets_by_company_pearl.xlsx', \
        'name_cols': 'A:B', \
        'data_cols' : 'A:Z', \
        'data_skip_rows' : 4, \
        'stop_at' : '1000'
    },
    {
        'output_file' : '2016-17_gems_jade', \
        'year': '2016-2017', \
        'xls_file' : 'appendix_18_reconciliation_sheets_by_company_gems_and_jade.xlsx', \
        'name_cols': 'A:B', \
        'data_cols' : 'A:Z', \
        'data_skip_rows' : 4, \
        'stop_at' : '1000'
    },
    {
        'output_file' : '2016-17_gems_jade_final', \
        'year': '2016-2017', \
        'xls_file' : 'appendix_18_reconciliation_sheets_by_company_gems_and_jade_final.xlsx', \
        'name_cols': 'A:B', \
        'data_cols' : 'A:Z', \
        'data_skip_rows' : 4, \
        'stop_at' : '1000', \
        'copy_currency_from_col_6': True
    }
#     {
#         'output_file' : '2016-17_oil_gas', \
#         'year': '2016-2017', \
#         'xls_file' : 'appendix_18_reconciliation_sheets_by_company_oil_and_gas.xlsx', \
#         'name_cols': 'A:B', \
#         'data_cols' : 'A:Z', \
#         'data_skip_rows' : 4, \
#         'stop_at' : '1000'
#     },
#     {
#         'output_file' : '2016-17_other_minerals', \
#         'year': '2016-2017', \
#         'xls_file' : 'appendix_18_reconciliation_sheets_by_company_minerals.xlsx', \
#         'name_cols': 'A:B', \
#         'data_cols' : 'A:Z', \
#         'data_skip_rows' : 4, \
#         'stop_at' : '1000'
#     },
#     {
#         'output_file' : '2016-17_oil_gas_transport', \
#         'year': '2016-2017', \
#         'xls_file' : 'appendix_18_reconciliation_sheets_by_company_oil_and_gas_transportation.xlsx', \
#         'name_cols': 'A:B', \
#         'data_cols' : 'A:Z', \
#         'data_skip_rows' : 4, \
#         'stop_at' : '1000'
#     }
]


'''
files_to_read = [
    {
        'output_file' : '2015-16_other_minerals', \
        'year': '2015-2016', \
        'xls_file' : 'Annex 18 - Reconciliation sheets Other minerals 15-16.xlsx', \
        'name_cols': 'B:E', \
        'data_cols' : 'A:M', \
        'data_skip_rows' : 5, \
        'stop_at' : '28'
    }
]


    {
        'output_file' : '2014-15_oil_gas_transport', \
        'year': '2014-2015', \
        'xls_file' : 'Annex 18 - Reconciliation sheets Oil & Gas transp 15-16.xlsx', \
        'name_cols': 'B:E', \
        'data_cols' : 'A:M', \
        'data_skip_rows' : 5, \
        'stop_at' : '1000'
    },

'''

#name_cols = 'B:D'
#data_cols = 'A:M'
#data_skip_rows = 4

"\nfiles_to_read = [\n    {\n        'output_file' : '2015-16_other_minerals',         'year': '2015-2016',         'xls_file' : 'Annex 18 - Reconciliation sheets Other minerals 15-16.xlsx',         'name_cols': 'B:E',         'data_cols' : 'A:M',         'data_skip_rows' : 5,         'stop_at' : '28'\n    }\n]\n\n\n    {\n        'output_file' : '2014-15_oil_gas_transport',         'year': '2014-2015',         'xls_file' : 'Annex 18 - Reconciliation sheets Oil & Gas transp 15-16.xlsx',         'name_cols': 'B:E',         'data_cols' : 'A:M',         'data_skip_rows' : 5,         'stop_at' : '1000'\n    },\n\n"

In [129]:
df_all = pd.DataFrame({'company_name' : [],'name_of_revenue_stream' : [], \
                       'paid_to' : [], 'payment_category' : [], 'units' : [], \
                       'per_company_original' : [], \
                       'per_company_adjust' : [], \
                       'per_company_final' : [], \
                       'per_government_original' : [], \
                       'per_government_adjust' : [], \
                       'per_government_final' : [], \
                       'final_difference' : [], \
                       'comment' : []
                      })

key_terms = { 'payment_category' : ['Payments in kind', 'Payments in cash', 'B- Unilateral company disclosures'], \
              'units' : ['In Barils', 'In Mscf', 'Gold in T.oz', 'Tin in MT', 'In (Please mention the commodity)', \
                       'Antimony Ore', 'NA', 'Copper', 'Copper in MT', 'Ferro Nickel']
            }
key_terms = convert_to_lower_case(key_terms)

In [130]:
def remove_duplicate_name_segements(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

In [131]:
def join_column_titles(text):
    to_return = ""
    for t in text:
        if not isinstance(t, float):
            t = t.strip()
            to_return = '.'.join([to_return,(''.join(i for i in t if ord(i)<128))])
    #print("FINAL: " + to_return)
    
    unique_elements = remove_duplicate_name_segements(to_return.split("."))
    to_return = '.'.join(unique_elements)
    
    # remove first '.' from the title
    return to_return[1:]

In [132]:
def rename_duplicate_column_titles(columns):
    unique_titles = []
    title_counts = {}
    
    for c in columns:
        if c in unique_titles:
            title_counts[c] += 1
            unique_titles.append(c + "." + str(title_counts[c]))
        else:
            title_counts[c] = 0
            unique_titles.append(c)
            
    return unique_titles

In [133]:
def total_in_mmk(row,variable,euro_xrate=1378.36,usd_xrate=1257):
    # Exchange rate taken from MEITI 2016-17 Report
    # USD$ 1 = MMK 1,257.00 
    # EURO 1 = MMK 1,378.36
    
    mmk = euro = usd = 0
    
    if variable + '.mmk' in row.index:
        mmk = row[variable + '.mmk']
    if variable + '.euro' in row.index:
        euro = row[variable + '.euro']
    if variable + '.usd' in row.index:
        usd = row[variable + '.usd']
        
    return mmk + euro*euro_xrate + usd*usd_xrate

In [134]:
def add_sheet_to_main_df(main_df,current_df, company_name,company_number,key_terms):

    
    current_payment_category = ""
    current_units = "MMK"
    current_paid_to = ""

            
    for index, row in current_df.iterrows():
        
        index_col = 'n'
        description_col = 'description of payment'
        
        # skip row if '°' is included in it because that means 
        #  there's an extra row of French titles in the table
        unicode_row = row.to_string().encode("utf-8")        
        if u'\xb0' in unicode_row.decode('windows-1252'):
            continue
        
        # if the index column in empty, that means it's not a data-row
        if str(row[index_col])  == 'nan':
            index_col = description_col
        
        #print(str(row[description_col]).lower() + " | " + str(row[index_col]).lower())
        
        if str(row[index_col]).lower() in key_terms['payment_category']:
            current_payment_category = str(row[index_col]).lower()
        elif str(row[index_col]).lower() in key_terms['units']:
            current_units = str(row[index_col]).lower()
            current_paid_to = ""
        elif not str(row[index_col]).replace('.','',1).isdigit():
            current_paid_to = str(row[index_col])
            current_units = "MMK"
        
        
        #per company.payment as disclosed by company.mmk	per company.company adjust.mmk	per company.final.mmk	per government.revenue as disclosed by government.mmk	per government.government adjust.mmk	per government.final.mmk	variance pre-recocniliation.mmk	variance post-reconciliation.mmk	remarks	per company.payment as disclosed by company.euro	per company.company adjust.euro	per company.final.euro	per government.revenue as disclosed by government.euro	per government.government adjust.euro
           
        if str(row['n']).replace('.','',1).isdigit():
            if 'remarks.1' in row.index:
                remarks = str(row['remarks'])+'; '+str(row['remarks.1'])
            else:
                remarks = str(row['remarks'])
            
            to_append = pd.DataFrame({'company_name' : [company_name], \
                  'company_registration_number' : [company_number], \
                  'name_of_revenue_stream' : [row['description of payment']] , \
                  'paid_to' : [current_paid_to], \
                  'payment_category' : [current_payment_category], \
                  'units' : [current_units], \
                  'per_company_original' : [total_in_mmk(row,'per company.payment as disclosed by company')], \
                  'per_company_adjust' : [total_in_mmk(row,'per company.company adjust')], \
                  'per_company_final' : [total_in_mmk(row,'per company.final')], \
                  'per_government_original' : [total_in_mmk(row,'per government.revenue as disclosed by government')], \
                  'per_government_adjust' : [total_in_mmk(row,'per government.government adjust')], \
                  'per_government_final' : [total_in_mmk(row,'per government.final')], \
                  'final_difference' : [total_in_mmk(row,'variance post-reconciliation')], \
                  'comment' : [remarks] \
                })
            
            
            to_append['comment'].fillna('', inplace=True)
            to_append.fillna(0, inplace=True)
            
            main_df = pd.concat([main_df, to_append])
        
    return main_df
    
    

In [135]:
def read_files(files_to_read, df_all): 
    for f in files_to_read:

        output_file = f['output_file']
        year = f['year']
        xls_file = f['xls_file']
        name_cols = f['name_cols']
        data_cols = f['data_cols']
        data_skip_rows = f['data_skip_rows']   
        stop_at = f['stop_at']   
        if 'copy_currency_from_col_6' in f:
            copy_currency_from_col_6 = f['copy_currency_from_col_6']
        else:
            copy_currency_from_col_6 = False
        print(copy_currency_from_col_6)

        xl = pd.ExcelFile(year+'/'+xls_file)
#        '2014-2015/Annex 18 - Reconciliation sheets Gems & Jade 14-15.xlsx')

        print(year+'/'+xls_file)
        sheets_list = xl.sheet_names
    
        print(sheets_list)

        df_all = pd.DataFrame()
        
        for s in sheets_list:
            
            if not (re.search("[A-Z]+\s*\(\d+\)",s) or re.search("[A-Z]+\s*\d+",s)):
                continue
            else:
                match = re.search(r'(\d+)', s)
                print('current sheet number: ' + match.group(1))
                if int(match.group(1)) > int(stop_at):
                    continue

            print(s)
            sheet_name = s

            #print(name_cols)
            name_df = xl.parse(sheet_name, usecols = name_cols,header=None)
            #display(name_df.head())

            company_name = str(name_df.loc[0,1]).strip()
            company_number = str(name_df.loc[1,1]).strip()
            print(company_name, company_number)

            df = xl.parse(sheet_name, skiprows=data_skip_rows, parse_cols = data_cols, header=None)        
            
            #display(df.head())
            for i in df.iloc[0:1]:
                if df.iloc[0:1][i][0] == 'Company':
                    df.iloc[0:1][i][0] = 'per company'
                if df.iloc[0:1][i][0] == 'Government Agency':
                    df.iloc[0:1][i][0] = 'per government'
                    
            
            # fill out cells with merged column headers
            df.dropna(how='all',axis=1,inplace=True)
            if copy_currency_from_col_6:
                df.iloc[2] = df.iloc[6]
            df.iloc[0:2] = df.iloc[0:2].fillna(method='ffill', axis=0)
            df.iloc[0:2] = df.iloc[0:2].fillna(method='ffill', axis=1)
            #display(df.head())
            df.columns = df.iloc[0:3].apply(join_column_titles, axis=0)
            df = df.iloc[3:]
            df = df.reset_index(drop=True)
            df.columns = rename_duplicate_column_titles(df.columns)
            df.columns = [x.lower() for x in df.columns]
            df.rename(columns={'final difference.final': 'final difference', \
                               'comment.final': 'comment'}, inplace=True)
            
            #display(df.head())
            if 's/n' in df.columns:
                df.rename(columns={'s/n': 'n'}, inplace=True) 
            if 'description of payment.description' in df.columns:
                df.rename(columns={'description of payment.description': \
                                   'description of payment'}, inplace=True)
                
            for col in df.columns:
                new_col_name = col.replace("governement", "government")
                new_col_name = new_col_name.replace("ajust", "adjust")
                df.rename(columns={col: new_col_name}, inplace=True)
            
            
            
            if 'per company.payment as disclosed by company.mmk' in df.columns:
                df.rename(columns={'per company.initial': 'per company.original'}, inplace=True)
            if 'per government.initial' in df.columns:
                df.rename(columns={'per government.initial': 'per government.original'}, inplace=True)
                
                
            if 'company.initial' in df.columns:
                df.rename(columns={'company.initial': 'per company.original'}, inplace=True)
            if 'government agency.initial' in df.columns:
                df.rename(columns={'government agency.initial': 'per government.original'}, inplace=True)


            #display(df.head())
            # clean name for the company to save individual CSV file if needed
            name = map(lambda x: ''.join(e for e in x if e.isalnum()) , company_name.split(' '))
            name = ' '.join(w for w in name)
            name = re.sub( '\s+', ' ', name.strip())
            filename = re.sub( '\s+', '_', name.strip())+'.csv'
            #         df.to_csv(filename)

            #print(filename)

            df_all = add_sheet_to_main_df(df_all,df, company_name,company_number,key_terms)
    
        df_all = df_all.reset_index(drop=True)
        df_all.to_csv(year+'/'+output_file+'.csv', encoding='utf-8')

        display(df_all.head())


In [136]:
read_files(files_to_read,df_all)

False
2016-2017/appendix_18_reconciliation_sheets_by_company_pearl.xlsx
['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11']
current sheet number: 1
C1
Annawar Pearl Company Limited 109604496
current sheet number: 2
C2
Aquagold Myanmar Co., Ltd 110185464
current sheet number: 3
C3
Belpearl Myanmar Co., Ltd 105482817
current sheet number: 4
C4
Myanmar Andman Co., Ltd 104651240
current sheet number: 5
C5
Myanmar Atlantic Co., Ltd 109939404
current sheet number: 6
C6
Myanmar Tasaki Co., Ltd 22FC/2001-2002
current sheet number: 7
C7
Niino Pearl Culturing Co., Ltd 1638/1999-2000
current sheet number: 8
C8
Orient Pearl Co., Ltd (Jalan) 1384/1998-1999
current sheet number: 9
C9
Orient Pearl Co., Ltd (Zinyaw) 1384/1998-1999
current sheet number: 10
C10
Pyae Phyo Tun Co., Ltd Pyi Phyo Tun International Company Limited
current sheet number: 11
C11
Pyae Sone Htet Myint Co., Ltd 100108143


Unnamed: 0,company_name,company_registration_number,name_of_revenue_stream,paid_to,payment_category,units,per_company_original,per_company_adjust,per_company_final,per_government_original,per_government_adjust,per_government_final,final_difference,comment
0,Annawar Pearl Company Limited,109604496,Production Split (In Kind),"State Owned Enterprises (ME1, ME2)",,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,Annawar Pearl Company Limited,109604496,Custom Duties,Customs Department (CD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,Annawar Pearl Company Limited,109604496,Commercial Tax on Imported Capital Equipment,Customs Department (CD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,Annawar Pearl Company Limited,109604496,Commercial Tax on Imports on Raw Materials and...,Customs Department (CD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,Annawar Pearl Company Limited,109604496,Corporate Income Tax,Inertal Revenue Department (IRD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


False
2016-2017/appendix_18_reconciliation_sheets_by_company_gems_and_jade.xlsx
['C(1)', 'C(2)', 'C(3)', 'C(4)', 'C(5)', 'C(6)', 'C(7)', 'C(8)', 'C(9)', 'C(10)', 'C(11)', 'C(12)', 'C(13)', 'C(14)', 'C(15)', 'C(16)', 'C(17)', 'C(18)', 'C(19)', 'C(20)', 'C(21)', 'C(22)', 'C(23)', 'C(24)', 'C(25)', 'C(26)', 'C(27)', 'C(28)', 'C(29)', 'C(30)', 'C(31)', 'C(32)', 'C(33)', 'C(34)', 'C(35)', 'C(36)', 'C(37)', 'C(38)', 'C(39)', 'C(40)', 'C(41)', 'C(42)', 'C(43)', 'C(44)', 'C(45)', 'C(46)', 'C(47)', 'C(48)', 'C(49)', 'C(50)', 'C(51)', 'C(52)', 'C(53)', 'C(54)', 'C(55)', 'C(56)', 'C(57)', 'C(58)', 'C(59)', 'C(60)', 'C(61)', 'C(62)', 'C(63)', 'C(64)', 'C(65)', 'C(66)', 'C(67)', 'C(68)', 'C(69)', 'C(70)', 'C(71)', 'C(72)', 'C(73)', 'C(74)', 'C(75)', 'C(76)', 'C(77)', 'C(78)', 'C(79)', 'C(80)', 'C(81)', 'C(82)', 'C(83)', 'C(84)']
current sheet number: 1
C(1)
(1.1.1) Gems & Jewellery Co.,Ltd 121174545
current sheet number: 2
C(2)
Agga Yadanar Min Yarzar nan
current sheet number: 3
C(3)
Aung Aung Nain

Unnamed: 0,company_name,company_registration_number,name_of_revenue_stream,paid_to,payment_category,units,per_company_original,per_company_adjust,per_company_final,per_government_original,per_government_adjust,per_government_final,final_difference,comment
0,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Production Split (In Kind),"State Owned Enterprises (ME1, ME2)",,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nan; nan
1,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Custom Duties,Customs Department (CD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nan; nan
2,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Commercial Tax on Imported Capital Equipment,Customs Department (CD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nan; nan
3,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Commercial Tax on Imports on Raw Materials and...,Customs Department (CD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nan; nan
4,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Corporate Income Tax,Inertal Revenue Department (IRD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nan; nan


True
2016-2017/appendix_18_reconciliation_sheets_by_company_gems_and_jade_final.xlsx
['C(1)', 'C(2)', 'C(3)', 'C(4)', 'C(5)', 'C(6)', 'C(7)', 'C(8)', 'C(9)', 'C(10)', 'C(11)', 'C(12)', 'C(13)', 'C(14)', 'C(15)', 'C(16)', 'C(17)', 'C(18)', 'C(19)', 'C(20)', 'C(21)', 'C(22)', 'C(23)', 'C(24)', 'C(25)', 'C(26)', 'C(27)', 'C(28)', 'C(29)', 'C(30)', 'C(31)', 'C(32)', 'C(33)', 'C(34)', 'C(35)', 'C(36)', 'C(37)', 'C(38)', 'C(39)', 'C(40)', 'C(41)', 'C(42)', 'C(43)', 'C(44)', 'C(45)', 'C(46)', 'C(47)', 'C(48)', 'C(49)', 'C(50)', 'C(51)', 'C(52)', 'C(53)', 'C(54)', 'C(55)', 'C(56)', 'C(57)', 'C(58)', 'C(59)', 'C(60)', 'C(61)', 'C(62)', 'C(63)', 'C(64)', 'C(65)', 'C(66)', 'C(67)', 'C(68)', 'C(69)', 'C(70)', 'C(71)', 'C(72)', 'C(73)', 'C(74)', 'C(75)', 'C(76)', 'C(77)', 'C(78)', 'C(79)', 'C(80)', 'C(81)', 'C(82)', 'C(83)', 'C(84)']
current sheet number: 1
C(1)
(1.1.1) Gems & Jewellery Co.,Ltd 121174545
current sheet number: 2
C(2)
Agga Yadanar Min Yarzar nan
current sheet number: 3
C(3)
Aung Aung

Unnamed: 0,company_name,company_registration_number,name_of_revenue_stream,paid_to,payment_category,units,per_company_original,per_company_adjust,per_company_final,per_government_original,per_government_adjust,per_government_final,final_difference,comment
0,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Production Split (In Kind),"State Owned Enterprises (ME1, ME2)",,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nan; nan
1,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Custom Duties,Customs Department (CD),,MMK,2082333.73,-2082333.73,0.0,37149860.73,-37149860.73,0.0,0.0,nan; nan
2,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Commercial Tax on Imported Capital Equipment,Customs Department (CD),,MMK,0.0,0.0,0.0,72007752.7,-72007752.7,0.0,0.0,nan; nan
3,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Commercial Tax on Imports on Raw Materials and...,Customs Department (CD),,MMK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nan; nan
4,"(1.1.1) Gems & Jewellery Co.,Ltd",121174545,Corporate Income Tax,Internal Revenue Department (IRD),,MMK,41883973.0,-41883973.0,0.0,0.0,0.0,0.0,0.0,nan; nan
