# Notes

* Run QC code before compiling
* TO DO: Use pygsheets

In [1]:
import pandas as pd
import numpy as np
import pygsheets
import time
import config
import os

In [2]:
# ---------------------------------------------------------------------------
# VARIABLES -- ALWAYS CHECK THESE BEFORE RUNNING
# ---------------------------------------------------------------------------

# File name of the downloaded GGPT workbook that will be compiled.
# TODO: change to pygsheets to prevent error of using an old copy
dlFile = 'Global Gas Plant Tracker (GGPT) - dl 2022-02-18.xlsx'

# Folder that holds the downloaded GGPT copy
tmpDir = "/Users/jennymartos/Documents/GEM/GGPT/GGPT copies/"

## User Defined ##
# Every tab to compile (one regional tab per entry)
sheets_included = [
    'Russia',
    'Australia and New Zealand',
    'Africa (sub-Saharan)',
    'China',
    'European Union',
    'Europe',
    'Latin America',
    'North America',
    'Middle East & North Africa',
    'East Asia',
    'SE Asia',
    'South Asia',
    'Turkey',
    'Central Asia',
    'Western Asia',
]

# The tab where codes and licensing will be written
coverSheet = "About"

# Filled later with one DataFrame per compiled tab
all_sheets = list()

# Parent directory path (where outputs are stored locally on your computer)
parent_dir = "/Users/jennymartos/Documents/GEM/GGPT/GGPT-Compile"

## Automatic ##
# Today's date, formatted like 2022_02_18
dateToday = time.strftime('%Y_%m_%d')
# Name of the new data output directory
directory = f"{dateToday}_compiled"

# Full path of the output directory
dataLoc = os.path.join(parent_dir, directory)

## Storing Outputs ##
# If the dated folder already exists, append an increasing number to its name
# (..._compiled1, ..._compiled2, ...) until an unused path is found.
i = 1
while os.path.isdir(dataLoc):
    dataLoc = os.path.join(parent_dir, directory + str(i))
    i += 1

# Create the directory
os.mkdir(dataLoc)
# Report the folder that was actually created; the name is taken from dataLoc
# so that any numeric suffix chosen above is part of the quoted name.
print("Directory '%s' created" % os.path.basename(dataLoc))

Directory '2022_02_18_compiled' created1


In [3]:
# Open the downloaded workbook once; the tab-reading cells parse their sheets from this handle.
# os.path.join keeps the path valid whether or not tmpDir ends with a separator.
ggpt_xl = pd.ExcelFile(os.path.join(tmpDir, dlFile))

In [4]:
# Read every tab listed in sheets_included into its own DataFrame.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell cannot add the same tabs a second time.
# 'Unit name' and 'WEPP location ID' are read as text instead of an inferred type.
all_sheets = [
    pd.read_excel(
        ggpt_xl,
        sheet_name=sheet,
        dtype={'Unit name': str, 'WEPP location ID': str},
    )
    for sheet in sheets_included
]

In [5]:
# Stack all tabs into one table with a single concat over the list, renumber
# the rows, then discard rows in which every column is empty.
gas_to_process = (
    pd.concat(all_sheets, sort=False)
    .reset_index(drop=True)
    .dropna(how='all')
)

In [6]:
# Remove every column whose name contains 'Unnamed: ' (the label pandas gives
# to spreadsheet columns that have no header).
gas_to_process = gas_to_process.drop(
    columns=[name for name in gas_to_process.columns if 'Unnamed: ' in name]
)

In [7]:
def read_and_clean_ggpt_ownership(ggpt_xl):
    """
    Load the 'owner-parent' tab and return it without blank-owner rows.

    ggpt_xl is handed straight to pd.read_excel (the notebook passes the open
    pd.ExcelFile). Rows whose 'Owner' cell is missing are removed, as are all
    columns whose name contains 'Unnamed: '. The number of remaining rows is
    printed and the cleaned DataFrame is returned.
    """
    ownership = pd.read_excel(ggpt_xl, sheet_name='owner-parent')

    # exclude empty rows
    has_owner = ownership['Owner'].notna()
    ownership = ownership[has_owner]

    # drop header-less spreadsheet columns in one call
    placeholder_cols = [name for name in ownership.columns if 'Unnamed: ' in name]
    ownership = ownership.drop(columns=placeholder_cols)

    print(f"Number of rows after filtering Ownership sheet: {len(ownership)}")
    return ownership

In [8]:
# Load the 'owner-parent' tab from the already-open workbook, keeping only rows that name an Owner.
ggpt_parents = read_and_clean_ggpt_ownership(ggpt_xl)

Number of rows after filtering Ownership sheet: 3188


In [9]:
# clean up, convert dtypes and values

def _pct_text_to_float(series):
    """Return *series* converted to floats after removing any '%' characters.

    Values are stringified first, so missing cells become the text 'nan' and
    convert back to NaN; empty strings are mapped to NaN explicitly.
    NOTE(review): only the '%' sign is stripped, nothing is divided by 100, so
    a text cell such as '50%' becomes 50.0 -- presumably the sheets store
    fractions (0.5); confirm against the source workbook.
    """
    as_text = series.astype(str).replace('', np.nan)
    return as_text.str.replace('%', '', regex=False).astype(float)


# Treat empty strings in the ownership table as missing values.
for col in ggpt_parents.columns:
    ggpt_parents[col] = ggpt_parents[col].replace('', np.nan)

# Owner shares on the unit sheets: a missing 'Owner N %' column is an error here.
for num in range(1, 5 + 1):
    owner_pct_col = f'Owner {num} %'
    gas_to_process[owner_pct_col] = _pct_text_to_float(gas_to_process[owner_pct_col])

# Parent shares are converted best-effort: a failure (for example a
# 'Parent N %' column that does not exist) is reported and skipped.
# The column is only overwritten when the whole conversion succeeded.
for num in range(1, 10 + 1):
    parent_pct_col = f'Parent {num} %'
    try:
        ggpt_parents[parent_pct_col] = _pct_text_to_float(ggpt_parents[parent_pct_col])
    except Exception as err:  # keep going, but let KeyboardInterrupt/SystemExit through
        print(f"Exception in trying to convert column {parent_pct_col}")
        print(f"{type(err).__name__}: {err}")
        print("All columns in df:")
        print(ggpt_parents.columns.tolist())
        print("=======")

In [10]:
# Index the ownership table by owner name so parents can be looked up with
# ggpt_parents.at[owner, 'Parent N'].
# NOTE(review): .at yields a Series rather than a scalar when an owner appears
# on more than one row -- presumably owners are unique in this tab; confirm.
# Re-running this cell raises KeyError because 'Owner' is no longer a column.
ggpt_parents = ggpt_parents.set_index('Owner')

### Condensing Owner and Parent strings

In [11]:
def create_owner_and_parent_strings(gas_to_process, ggpt_parents):
    """
    Works for GOGET & GGPT (per the original author's note).

    For every row of gas_to_process, walk the 'Owner 1'..'Owner 5' columns and
    their 'Owner N %' shares, and write three condensed values back to the row:
      * 'Owner'        -- "<owner> [<share>]" entries joined by "; "
      * 'Parent'       -- "<parent> [<share>]" entries joined by "; "
      * 'Parent [ref]' -- the reference string returned by the parent lookup
                          for the last owner that produced one
    The DataFrame is modified in place and is also returned.
    """
    owner_slots = range(1, 5 + 1)

    for row in gas_to_process.index:
        # accumulators for this row
        owners_str = ''
        parents_str = ''
        parents_ref_str = ''

        for slot in owner_slots:
            owner_name = gas_to_process.at[row, f'Owner {slot}']
            owner_share = gas_to_process.at[row, f'Owner {slot} %']

            # extend the owner list with this slot (no-op for an empty slot)
            owners_str = create_owners_string_for_row_and_owner_num(
                gas_to_process, row, owner_name, owners_str, owner_share)

            # extend the parent list with the parents of this owner
            parents_str, parents_ref_str = create_parent_string_for_one_owner(
                owner_name, owner_share,
                ggpt_parents, parents_str, parents_ref_str)

        # store the results, trimming the trailing "; " separator
        gas_to_process.at[row, 'Owner'] = owners_str.strip('; ')
        gas_to_process.at[row, 'Parent'] = parents_str.strip('; ')
        gas_to_process.at[row, "Parent [ref]"] = parents_ref_str

    return gas_to_process

In [12]:
def create_owners_string_for_row_and_owner_num(
    gas_to_process, row, owner_num, owners_str, owner_num_fract):
    """
    Return owners_str with one "<owner> [<share>]; " entry appended.

    owner_num is the owner's name and owner_num_fract its share as a fraction.
    An empty owner (NaN, None or '') leaves owners_str unchanged. The name
    'other' is written in lower case whatever its original capitalisation.
    gas_to_process and row are accepted but not used.
    """
    # nothing to add for an empty owner slot
    if pd.isna(owner_num) or owner_num == '':
        return owners_str

    share_label = convert_owner_fract_to_pct(owner_num_fract)
    is_generic_owner = owner_num.lower() == 'other'
    owner_label = owner_num.lower() if is_generic_owner else owner_num

    # fill in owner & owner %
    return owners_str + f"{owner_label} [{share_label}]; "

In [13]:
def create_parent_string_for_one_owner(
    owner_num, owner_num_fract, ggpt_parents, parents_str, parents_ref_str):
    """
    For each owner, look up parents in sheet ggpt_parents.

    owner_num is the owner's name and owner_num_fract the owner's share of the
    unit as a fraction. ggpt_parents is the ownership table indexed by owner
    name, with 'Parent N' / 'Parent N %' columns (N = 1..10) and an
    'Owner-Parent [ref]' column.

    For every non-empty parent of the owner, "<parent> [<share>]; " is appended
    to parents_str, where the share is owner share x parent share, and
    parents_ref_str is replaced by the owner's 'Owner-Parent [ref]' value.
    An owner named 'other' that is not in the table is carried over as
    "other [<share>]; ". An empty owner changes nothing; any other owner that
    is missing from the table is reported with a printed error.

    Returns the list [parents_str, parents_ref_str].
    """
    if owner_num in ggpt_parents.index:
        for p_num in range(1, 10+1):
            parent_col = f'Parent {p_num}'
            # the table may hold fewer than ten parent columns
            if parent_col not in ggpt_parents.columns:
                continue

            parent_num = ggpt_parents.at[owner_num, parent_col]
            try:
                if pd.isna(parent_num):
                    continue

                # get share of owner that parent owns
                parent_num_fract = ggpt_parents.at[owner_num, f'Parent {p_num} %']

                # calculate fractional ownership of the O&G unit for this parent
                parent_num_own_unit_fract = owner_num_fract * parent_num_fract
                parent_num_own_unit_pct = convert_owner_fract_to_pct(parent_num_own_unit_fract)

                # add to collection (parents_str)
                parents_str += f"{parent_num} [{parent_num_own_unit_pct}]; "
                parents_ref_str = str(ggpt_parents.at[owner_num, 'Owner-Parent [ref]'])

            except Exception as err:
                # best-effort: report the problem row and move on to the next
                # parent, without swallowing KeyboardInterrupt/SystemExit
                print(f"Problem with parent_num: {parent_num}")
                print(f"ggpt_parents.at[owner_num, f'Parent {p_num}']: {parent_num}")
                print(f"{type(err).__name__}: {err}")
                print()

    elif pd.isna(owner_num) or owner_num == '':
        # empty owner slot: nothing to add
        pass

    elif str(owner_num).lower() == 'other':
        # 'other' has no entry in the ownership table; carry it over to the
        # parent list with its own share, formatted like every other entry
        parents_str += f"other [{convert_owner_fract_to_pct(owner_num_fract)}]; "

    else:
        print(f"Error! Owner isn't in ggpt_parents: {owner_num}")

    parent_array = [parents_str, parents_ref_str]
    return parent_array




In [14]:
def convert_owner_fract_to_pct(share_fract):
    """
    Format a fractional share as a percentage string.

    * missing value (NaN/None)  -> 'unknown %'
    * any real number           -> percentage with at most one decimal and no
                                   trailing '.0', e.g. 0.5 -> '50%',
                                   0.125 -> '12.5%'
    * anything else             -> prints an error and returns the
                                   placeholder '____'

    Python floats/ints and every numpy float/integer type are accepted, not
    only np.float64; booleans are treated as invalid input.
    """
    if pd.isna(share_fract):
        return 'unknown %'

    is_number = (
        isinstance(share_fract, (int, float, np.integer, np.floating))
        and not isinstance(share_fract, bool)
    )
    if is_number:
        share_pct_str = f"{share_fract * 100:.1f}%"
        # whole percentages are shown without the decimal part
        return share_pct_str.replace('.0%', '%')

    print("Error!" + f" Owner fract was neither nan nor float; share_fract: {share_fract} & its type: {type(share_fract)}")
    return '____'  # placeholder

In [15]:
# Build the condensed 'Owner', 'Parent' and 'Parent [ref]' columns for every unit row.
gas_to_process = create_owner_and_parent_strings(gas_to_process, ggpt_parents)

## Columns to keep

In [16]:
# Columns of the compiled spreadsheet, in output order.
# Working columns such as "Researcher" and "Notes" are deliberately left out.
final_cols = [
    'Wiki URL',
    'Country',
    'Plant name',
    'Plant name (local script)',
    'Unit name',
    'Fuel',
    'Capacity elec. (MW)',
    'Status',
    'Technology',
    'CHP',
    'Start year',
    'Retired year',
    'Planned retire',
    'Owner',
    'Parent',
    'Latitude',
    'Longitude',
    'Location accuracy',
    'Region',
    'City',
    'Local area (taluk, county)',
    'Major area (prefecture, district)',
    'Subnational unit (province, state)',
    'Other IDs (location)',
    'Other IDs (unit)',
    'Other plant names',
    'Captive [heat, power, both]',
    'Captive industry type',
    'Captive non-industry use [heat, power, both, none]',
    'GEM location ID',
    'GEM unit ID',
]

In [17]:
# Keep only the columns listed in final_cols, in that order.
# Selecting with a list raises KeyError if any listed column is missing from the data.
gas_to_process = gas_to_process[final_cols]

In [18]:
# add wiki URLS
# Rows whose 'Wiki URL' is empty or missing get a URL derived from the plant
# name: spaces become underscores and '/' is escaped as %2F.
url_missing = gas_to_process['Wiki URL'].isna() | (gas_to_process['Wiki URL'] == '')
generated_urls = gas_to_process.loc[url_missing, 'Plant name'].map(
    lambda name: 'https://www.gem.wiki/' + str(name).replace(' ', '_').replace('/', '%2F')
)
gas_to_process.loc[url_missing, 'Wiki URL'] = generated_urls

In [19]:
# Order the rows for the published sheet: by region, then country, plant name and unit name.
gas_to_process = gas_to_process.sort_values(by=['Region', 'Country', 'Plant name', 'Unit name'])

In [20]:
# Re-apply the final_cols column order. Unlike the [] selection, reindex adds a
# NaN-filled column for any name that is missing instead of raising.
# NOTE(review): the frame was already restricted to final_cols before sorting,
# so this looks like a no-op -- confirm before removing.
gas_to_process = gas_to_process.reindex(columns = final_cols)

## Exporting

In [21]:
# export variables
# time.strftime formats the current local time when no time tuple is given.
save_timestamp = time.strftime('%Y-%m-%d')   # goes into the output file name
monthYear = time.strftime('%B %Y')           # e.g. "February 2022", used in the title and citation
monthDayYear = time.strftime('%B %d, %Y')
year = time.strftime('%Y')
print(monthYear)

February 2022


In [22]:
# write multiple tabs (pip install xlsxwriter)
# https://xlsxwriter.readthedocs.io/example_pandas_multiple.html

def _autofit_columns(worksheet, frame):
    """Set each column of *worksheet* as wide as the longest cell or header in *frame*."""
    for col_idx, column in enumerate(frame.columns):
        cell_lengths = frame.iloc[:, col_idx].astype(str).map(len)
        # an empty frame has no cells to measure; fall back to the header width
        longest_cell = cell_lengths.max() if len(cell_lengths) else 0
        column_width = int(max(longest_cell, len(str(column))))
        worksheet.set_column(col_idx, col_idx, column_width)


source_workbook = os.path.join(tmpDir, dlFile)
output_workbook = os.path.join(
    dataLoc, 'Global Gas Plant Tracker (GGPT) completed '+save_timestamp+'.xlsx')

# copyright etc
cp = "This data is licensed by Global Energy Monitor under a Creative Commons Attribution Non-Commercial Share Alike 4.0 International license (CC BY-NC-SA 4.0)"

contact = 'Contact: Jenny Martos, Project Manager, Global Energy Monitor - jenny.martos@globalenergymonitor.org'

ref = 'Citation: "Global Gas Plant Tracker," Global Energy Monitor, ' + monthYear

title = pd.DataFrame({'Global Gas Plant Tracker':['Global Gas Plant Tracker - ' + monthYear, cp, contact, ref]})

# All source tabs are read and prepared before the output file is opened, so a
# failed read cannot leave a partially written workbook behind.

# Keep the metadata tab in front of worksheet: title/licence rows first, then
# the rows of the source cover sheet.
about_tab = pd.read_excel(source_workbook, sheet_name=coverSheet, header=0, na_filter=False)
about_tab = pd.concat([title, about_tab]).reset_index(drop = True)

# Abbreviations tab
abbreviations = pd.read_excel(source_workbook, sheet_name='Abbreviations', header=0, na_filter=False)
abbreviations = abbreviations[['Column','Abbreviation','Full Name (English)']]
abbreviations = abbreviations.rename({'Full Name (English)': 'Definition'}, axis=1)

# Column key tab, limited to the columns that are actually published
column_key = pd.read_excel(source_workbook, sheet_name='Column key', header=0, na_filter=False)
column_key = column_key.loc[column_key['Column Name'].isin(final_cols)]
column_key = column_key[['Column Name', 'Definition']]

# Create a Pandas Excel writer using XlsxWriter as the engine. The context
# manager saves and closes the file on exit (ExcelWriter.save() is deprecated
# and absent from current pandas releases).
with pd.ExcelWriter(output_workbook, engine='xlsxwriter') as writer:
    about_tab.to_excel(writer, sheet_name='About', index=False, header=False)

    # Add abbreviations
    abbreviations.to_excel(writer, sheet_name = 'Abbreviations', index = False)
    _autofit_columns(writer.sheets['Abbreviations'], abbreviations)

    # Add column key tab
    column_key.to_excel(writer, sheet_name = 'Column key', index = False)
    _autofit_columns(writer.sheets['Column key'], column_key)

    # Add tab of GGPT data
    gas_to_process.to_excel(writer, sheet_name='Gas Units', index=False)
    _autofit_columns(writer.sheets['Gas Units'], gas_to_process)

    # Add parent tab of GGPT data
    # parent_metadata = pd.read_excel(os.path.join(tmpDir, dlFile), sheet_name='parent metadata', header=0, na_filter=False)
    # parent_metadata.to_excel(writer, sheet_name = 'Parent metadata', index = False)