# CSRD DATASET GENERATOR #
_By: Michiel Tange_,
_Last updated: 07/11/2024_


The purpose of this script is to facilitate finding which companies in the areas of the Sparke & Keane offices can be contacted about CSRD.

It does this by first finding the location coordinates of all the company headquarters using Google Maps' Geocoding API. The company data is extracted from Factset. It then calculates the distances between the companies and the different Sparke & Keane offices. It adds the original Factset data, as well as the calculated data to an Excel output file. Afterwards, it sets up a criteria sheet in the Excel file where the criteria for consideration are defined. Then, each Sparke & Keane office gets added as a separate sheet to the output Excel file with the appropriate Excel formulas filled in for filtering the data according to the specified criteria. This ensures a dynamic file at the end that end-users can play around with.

## Set-up ##

### Imports ###

In [23]:
import string
import openpyxl
import googlemaps
import pandas as pd
from os import listdir
from haversine import haversine

### Objects ###
The following objects will be used throughout the script:
- Sparke & Keane office
- Output file
- Criterion

In [45]:
class office:
    """ a Sparke & Keane office: a name, an address and (once known) its coordinates """

    def __init__(self, name, address, location=None):
        """
        create an office

        Properties
        ----------
        name : str
            display name of the office
        address : str
            postal address of the office
        location : dict
            coordinates of the office keyed by 'lat' (latitude) and 'lng' (longitude); defaults to None
        """

        self.name, self.address, self.location = name, address, location

    def calc_dist(self, other_location):
        """
        haversine distance between this office and another location, in whole kilometers

        args
        ----
        other_location : dict
            coordinates of the other location keyed by 'lat' (latitude) and 'lng' (longitude)

        returns
        -------
        dist : int
            distance in kilometers between the office and the other location
        """

        # haversine expects (latitude, longitude) pairs
        here = (self.location['lat'], self.location['lng'])
        there = (other_location['lat'], other_location['lng'])

        # int() drops the fractional part, so anything closer than 1 km is reported as 0
        return int(haversine(here, there))


class out_file:
    """ describes the Excel workbook that the script produces """

    def __init__(self, name, full_data, criteria):
        """
        create an out_file

        Properties
        ----------
        name : str
            file name of the output workbook, extension included
        full_data : str
            title of the sheet that receives the full (unfiltered) data
        criteria : str
            title of the sheet that receives the filtering criteria
        """

        self.name, self.full_data, self.criteria = name, full_data, criteria


class criterion:
    """ a single threshold used to filter the full data """

    def __init__(self, name, type, value, ref_col=None):
        """
        create a criterion

        Properties
        ----------
        name : str
            label of the criterion as it appears in the output file
        type : str
            category of the criterion (CSRD or S&K)
        value : int
            threshold that has to be reached for the criterion to be met
        ref_col : str
            name of the column in the original Factset data the criterion applies to; defaults to None
        """

        self.name, self.type = name, type
        self.value, self.ref_col = value, ref_col

### API key ###
Reading in the Google Cloud developer API key.

In [25]:
# read the Google Cloud API key from disk
with open("APIkey.txt", 'r') as file:
    # strip surrounding whitespace: a trailing newline in the file would otherwise become part of the key
    API_key = file.read().strip()

### Client ###

In [26]:
# Google Maps client, authenticated with the API key read above; used for the geocoding requests
gmaps = googlemaps.Client(key=API_key)

### Data files ###

In [27]:
# file names (without folder) of the Factset exports
# - sorted, because listdir returns entries in arbitrary order and the order decides the row order of the combined data
# - hidden files and Excel lock files ("~$...", present while a workbook is open) are skipped, as they cannot be loaded
data_files = sorted(f for f in listdir("Factset data") if not f.startswith(('.', '~$')))

### Global parameters ###

Make a dictionary for the upper case alphabet to use for easier referencing of Excel cells.

In [51]:
# the uppercase letters 'A'..'Z' in order
alph = list(string.ascii_uppercase)
# zero-based position -> letter (0 -> 'A', ..., 25 -> 'Z')
alph_dict = dict(enumerate(alph))

Define custom functions to be used throughout the script.

In [52]:
def get_coords(location):
    """
    look up the coordinates of a location with the Google Maps geocoder

    args
    ----
    location : str
        the location (e.g. an address or 'City, Country') to geocode

    returns
    -------
    coordinates : dict
        longitude and latitude of the first geocoding result, under the keys 'lng' and 'lat'
    """
    results = gmaps.geocode(location)
    # only the first result is used; an empty result list raises IndexError here
    first_hit = results[0]
    return first_hit['geometry']['location']

def to_xl_col(col, dataset, dict=None):
    """
    find the Excel column reference of a target column in a dataset as it appears in the output sheet.
    Excel column 'A' is reserved for the DataFrame index, so the first column of the dataset (index 0)
    becomes column 'B', the second (index 1) becomes column 'C'. Etc.
    Columns beyond 'Z' continue as 'AA', 'AB', ... like in Excel.

    args
    ----
    col : str
        the name of the target column for which to find the Excel reference
    dataset : pd.DataFrame
        the dataset in which the target column can be found
    dict : dict, optional
        custom mapping from zero-based Excel column position (0 for 'A') to a column reference.
        When omitted, the reference is computed and is not limited to 26 columns.

    returns
    ------
    xl_col : str
        the Excel column reference

    raises
    ------
    ValueError
        if col is not a column of dataset
    """

    # zero-based Excel position: +1 skips column A, which holds the index
    position = list(dataset.columns).index(col) + 1

    if dict is not None:
        return dict[position]

    # bijective base-26 conversion (1 -> 'A', 26 -> 'Z', 27 -> 'AA', ...)
    xl_col = ''
    number = position + 1
    while number > 0:
        number, remainder = divmod(number - 1, 26)
        xl_col = string.ascii_uppercase[remainder] + xl_col

    return xl_col

Set the addresses of the different offices of Sparke & Keane (can be added to / adjusted).

In [29]:
maastricht = office(name="SK Maastricht",
                    address="Bassin 108, 6211 AK Maastricht, Netherlands")

eindhoven = office(name="SK Eindhoven",
                   address="Achtseweg Zuid 221, 5651 GW Eindhoven, Netherlands")

nijmegen = office(name="SK Nijmegen",
                  address="Oranjesingel 51, 6511 NP Nijmegen, Netherlands")

offices = [maastricht,
           eindhoven,
           nijmegen]

# add the locations to each office
# (the loop variable is not called `office`: that would rebind the class name,
#  after which `office(...)` above can no longer be called when this cell is re-run)
for sk_office in offices:
    sk_office.location = get_coords(sk_office.address)

Set the parameters of the output (Excel) file.

In [46]:
# output workbook: file name, sheet for the raw data, sheet for the criteria
output = out_file(
    name="CSRD dataset.xlsx",
    full_data='Ruwe data',
    criteria='Criteria',
)

Set the parameters for the criteria.

In [31]:
# CSRD criteria: each one is compared against a column of the Factset data
crit_employees = criterion(
    name='Minimaal aantal werknemers (#)',
    type='CSRD',
    value=250,
    ref_col='Number of Employees',
)

crit_revenue = criterion(
    name='Minimale omzet (MM$)',
    type='CSRD',
    value=50,
    ref_col='Revenue (MM, USD)',
)

crit_assets = criterion(
    name='Minimale activa (MM$)',
    type='CSRD',
    value=25,
    ref_col='Total Assets (MM) (USD) (USD)',
)

# S&K criterion: has no Factset reference column
crit_dist = criterion(
    name='Maximale afstand tot kantoor (km)',
    type='S&K',
    value=50,
)

# the order of this list is the row order on the criteria sheet
criteria = [crit_employees, crit_revenue, crit_assets, crit_dist]

## Pre-processing ##
Cleaning up some weird stuff Factset adds to the Excel file. <span style="color:green">**Run this only once.**</span>

In [265]:
from os.path import join

# remove the first three rows of every Factset export (presumably the non-data rows Factset puts above the header)
# NOTE: not idempotent - running this a second time deletes three more rows
for data_file in data_files:
    # data_files holds bare file names, so the folder has to be prepended
    filename = join("Factset data", data_file)
    wb = openpyxl.load_workbook(filename)
    sheet = wb['Companies_Results']
    sheet.delete_rows(1, 3)  # 3 rows, starting at row 1
    wb.save(filename)

### Loading data ###

In [57]:
from os.path import join

# read every Factset export; join() builds a path that works on every operating system
frames = [pd.read_excel(join("Factset data", data_file)) for data_file in data_files]

# combine all exports in one go (renumbering the rows); fall back to an empty DataFrame when there are no files
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

### Finding all unique cities ###

In [58]:
# rows that have a city ('-' marks a missing city)
has_city = data['City'] != '-'

# unique city-country pairs (the same city name can occur in several countries)
city_countries = data.loc[has_city, ['City', 'Country']].drop_duplicates()

# 'City, Country' strings, used as geocoding queries and as lookup keys
city_countries['City_Country'] = city_countries['City'] + ', ' + city_countries['Country']

# numpy array for faster iterating
cities = city_countries['City_Country'].to_numpy()

### Initializing empty dictionary ###
The dictionary is going to hold the cities as the keys, and their coordinates as the values. The coordinates are also a dictionary with keys: 'lng' & 'lat' for longitude and latitude respectively, and the actual coordinates as the values.

In [59]:
# 'City, Country' -> coordinates dictionary (keys 'lng' and 'lat'); filled by the geocoding loop
city_coords = {}

## Distances ##

### Getting the location coordinates ###
Getting the coordinates (longitude and latitude) for each of the unique cities using Google's Geocoding API.

In [60]:
for city in cities:
    try:
        coords = get_coords(city)
    except IndexError as e:
        # error handling for cities (city-country pairs) Google couldn't find:
        # report the city and store nothing for it (previously the coordinates of the preceding city were stored)
        # NOTE(review): such a city is absent from city_coords, so later lookups by city have to handle a missing key
        print(f'{e} for city: {city}')
    else:
        city_coords[city] = coords


### Calculate the distances between cities and offices ###
Calculate the distances between a city location (coordinates) and the Sparke & Keane offices (coordinates). The distances are calculated using the haversine formula for calculating distances between points on a sphere:
$$
a = \sin^{2}(\frac{\Delta \phi}{2}) + \cos \phi_1 \cdot \cos \phi_2 \cdot \sin^{2}(\frac{\Delta \lambda}{2})
$$
$$
c = 2 \cdot \operatorname{atan2}(\sqrt{a}, \sqrt{1-a})
$$
$$
d = \textrm{R} \cdot c
$$
Where $\phi$ is latitude, $\lambda$ is longitude, and $\textrm{R}$ is the radius of Earth (approximately 6,371km). Here, $a$ is the haversine of the central angle between both points (the angle between them as seen from the Earth's centre), $c$ is that central angle itself in radians, and $d$ is the resulting distance between both points along the Earth's surface.

In [61]:
# distances per city: {city: {office name: distance in km}}
# (the inner loop variable is not called `office`, so the class name is not rebound)
dist_to_offices = {
    city: {sk_office.name: sk_office.calc_dist(coords) for sk_office in offices}
    for city, coords in city_coords.items()
}

### Adding calculated data ###
Add the distance to each Sparke & Keane office from each company to the dataset.

In [62]:
add_data = {}
for entity_id, city, country in zip(data['Company Entity Id'], data['City'], data['Country']):
    if city == '-':
        continue  # no city known for this company

    # combine the city and country data to facilitate searching the dist_to_offices dictionary
    distances = dist_to_offices.get(city + ', ' + country)
    if distances is None:
        continue  # no distances available for this city: leave the distance columns empty

    # the f-string denotes the column headers of the added data
    add_data[entity_id] = {f'afstand tot {office_name}': dist for office_name, dist in distances.items()}

add_data_df = pd.DataFrame.from_dict(add_data, orient='index') # convert the dictionary to a DataFrame
data = data.join(add_data_df, on='Company Entity Id') # companies without distances get empty values in the new columns

### Saving the data back to Excel ###

In [63]:
# write the combined dataset to the raw-data sheet of the output file
# the DataFrame index is written as the first column, under the header "index"
data.to_excel(output.name, sheet_name=output.full_data, index_label="index")

## Per office output ##
Create per-office sheets in the Excel file where all companies that meet CSRD (and Sparke & Keane) criteria are saved.

### Criteria sheet ###
Set up a sheet in the Excel file where the filter criteria (CSRD and Sparke & Keane) are specified.

In [64]:
wb = openpyxl.load_workbook(output.name)
wb.create_sheet(output.criteria)
criteria_sheet = wb[output.criteria]

# one criterion per row: name in column A, threshold in column B
# (the loop variable is not called `criterion`, so the class name is not rebound)
for row, crit in enumerate(criteria, start=1):
    criteria_sheet[f'A{row}'] = crit.name
    criteria_sheet[f'B{row}'] = crit.value

### Office sheets ###
Set up the office sheets. Each sheet shows which companies meet the criteria for that specific office. For a company to be included it has to meet at least 2 out of the 3 CSRD criteria, as well as the Sparke & Keane criteria of distance to an office. The logic gate for meeting at least 2 out of the 3 CSRD criteria looks as follows: 
$$a\text{ } \&\&\text{ } (b || c)\text{ } ||\text{ } (b \&\& c)$$
This will be built into the Excel file as a FILTER function, so future users can make dynamic changes if they wish.

<span style="color:green">**NOTE: This is not future-proof against a changing number of CSRD criteria or a changing qualification rule (i.e., if the "2 out of 3 criteria" rule changes, the formula will not change along with it).**</span>

Set up the logic gates. One logic gate for each CSRD criterion, and one logic gate per Sparke & Keane office (as each office gets its own sheet).

In [65]:
# shared pieces of the cell ranges: the data occupies rows 2 .. len(data) + 1 (row 1 is the header)
last_row = len(data) + 1
raw_sheet = output.full_data
crit_sheet = output.criteria

# Excel columns of the three CSRD reference columns
employees_col = to_xl_col(crit_employees.ref_col, data)
revenue_col = to_xl_col(crit_revenue.ref_col, data)
assets_col = to_xl_col(crit_assets.ref_col, data)

# B1..B4 are the threshold cells on the criteria sheet
a = f"('{raw_sheet}'!{employees_col}2:{employees_col}{last_row}>='{crit_sheet}'!B1)" # logic gate for the employee csrd criteria
b = f"('{raw_sheet}'!{revenue_col}2:{revenue_col}{last_row}>='{crit_sheet}'!B2)" # logic gate for the revenue csrd criteria
c = f"('{raw_sheet}'!{assets_col}2:{assets_col}{last_row}>='{crit_sheet}'!B3)" # logic gate for the assets csrd criteria

office_logic_gate = {}
for sk_office in offices:
    dist_col = to_xl_col(f'afstand tot {sk_office.name}', data)
    dist_range = f"'{raw_sheet}'!{dist_col}2:{dist_col}{last_row}"
    # logic gate for the distance S&K criteria
    # an empty cell would pass the '<' test, so empty cells are excluded explicitly with <>""
    # (a '>0' test would also exclude companies whose distance is 0 km, i.e. closer than 1 km to the office)
    office_logic_gate[sk_office.name] = f"(({dist_range}<'{crit_sheet}'!B4)*({dist_range}<>\"\"))"

Fill in the office sheets with the formulas.

In [66]:
from openpyxl.utils import get_column_letter

# last used column / row of the raw-data sheet
# column: +1 because column A holds the index; get_column_letter also handles columns beyond 'Z' ('AA', 'AB', ...)
last_col = get_column_letter(len(data.columns) + 1)
# row: +1 because row 1 holds the header
last_row = len(data) + 1

for sk_office in offices:
    wb.create_sheet(sk_office.name)
    office_sheet = wb[sk_office.name]
    # header row: reference to the header of the raw data
    office_sheet['A1'] = f"'{output.full_data}'!$A$1:${last_col}$1"
    # data rows: at least 2 out of 3 CSRD criteria (a*(b+c) + b*c) and the office's distance criterion
    office_sheet['A2'] = f"FILTER('{output.full_data}'!$A$2:${last_col}${last_row};(((({a}*({b}+{c}))+({b}*{c})))*{office_logic_gate[sk_office.name]}))"

<span style="color:green">**NOTE: because of a support issue regarding Dynamic Arrays and openpyxl the above formulas cannot yet be entered as official formulas. Instead they are entered as text. Adding "=" before them in Excel enables the formulas (with Dynamic Arrays).**</span>


### Save the output file ###

In [67]:
# write the workbook, including the sheets added above, back to the output file
wb.save(output.name)