# FOCUSED Project: OSPO adoption

As part of the [FOCUSED Collaboration project](https://github.com/JumpsuitWizard/FOCUSED-Collaboration), this notebook examines the adoption of Open Source Program Offices (OSPOs) across the companies in the [Standard and Poor's 500 index](https://en.wikipedia.org/wiki/S%26P_500).

## Authors

- **PI**: Duane O'Brien
- **Researcher**: julia ferraioli
- **Analyst**: Reshama Shaikh

## Research question

## Methodology

## Data sources

The following data sources are used in the analysis:

- [S&P 500](https://github.com/datasets/s-and-p-500-companies/blob/master/data/constituents.csv)
- [OSCI Index](https://opensourceindex.io/)
- [OSPO Landscape](https://landscape.todogroup.org/)
- [OSPO++ Membership](https://ospoplusplus.org/about/members/)
- [OSPO Alliance Membership](https://ospo-alliance.org/membership/)


## Data fetching

Fetch the latest version of each data source unless a copy has already been cached for the current calendar month. Cached files are stored under `data_raw/YYYY/MM/`.


In [None]:
from datetime import date
import pandas as pd
import pprint
import os
import re
import requests

pp = pprint.PrettyPrinter(indent=2)


def previous_month(day):
    """Return the ``(year, month)`` of the calendar month before ``day``.

    Rolls back to December of the previous year when ``day`` falls in
    January, so the month is always in the range 1-12.
    """
    if day.month == 1:
        return day.year - 1, 12
    return day.year, day.month - 1


# Set the raw output directory in YYYY/MM format
today = date.today()
data_dir = "data_raw/%s/%s/" % (today.year, ('%02d' % (today.month)))

# The OSCI ranking is requested for the month before the current one.
# Computing year and month together avoids asking for ".../<year>/00.json"
# when the notebook is run in January.
osci_year, osci_month = previous_month(today)

# Dictionary of sources.
# Each entry holds a display name, the download link, the file format (used
# as the cached file's extension and to pick a parser), and a "data" slot
# that is filled in once the file has been loaded.
sources = {"SP500": {"name": "S&P 500",
                     "link": "https://raw.githubusercontent.com/datasets/s-and-p-500-companies/main/data/constituents.csv",
                     "format": "csv",
                     "data": None
                     },
           "OSCI": {"name": "OSCI",
                    # Base URL qualified with year/month.json (ex /monthly/2023/07.json)
                    "link": "https://ststaticprodosciwebz2vmu.blob.core.windows.net/data/osci-ranking/monthly/%s/%s.json" % (osci_year, ('%02d' % (osci_month))),
                    "format": "json",
                    "data": None
                    },
           "TODO": {"name": "TODO Group Landscape",
                    "link": "https://landscape.todogroup.org/api/export?category=todo-group-member,ospo-adopter&project=&license=&organization=&headquarters=&company-type=&industries=&sort=name&grouping=no&bestpractices=&enduser=&parent=&language=&specification=&format=main",
                    "format": "csv",
                    "data": None
                    },
           "OSPOPlusPlus": {"name": "OSPO++",
                            "link": "https://raw.githubusercontent.com/ospoplusplus/ospoplusplus/main/content/about/members/_index.md",
                            "format": "md",
                            "data": None
                            },
           "OSPOAlliance": {"name": "OSPO Alliance",
                            "link": "https://gitlab.eclipse.org/eclipse/plato/www/-/raw/main/layouts/shortcodes/section-members.html",
                            "format": "html",
                            "data": None
                            }
           }

# Fetch and save the versioned data if it is out of date.
# A source counts as up to date when its file already exists in this month's
# directory (data_dir); otherwise it is downloaded and written there.
for key, value in sources.items():
    # Check most recent data file
    filepath = os.path.join(data_dir, ("%s.%s" % (key, value['format'])))
    if os.path.isfile(filepath):
        print(value['name'], " is up-to-date")
        continue

    # Request the data
    try:
        # The timeout keeps an unresponsive host from hanging the notebook
        req = requests.get(value['link'], timeout=30)
        req.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers HTTP error statuses as well as connection failures and
        # timeouts; the source is skipped and its file is simply not created
        print("received %s; skipping." % (e))
        continue

    os.makedirs(data_dir, exist_ok=True)
    # NOTE(review): the file is written with the platform's default text
    # encoding, as before — confirm this matches what the readers expect.
    with open(filepath, "w") as f:
        f.write(req.text)


## Data setup


### Create utility functions for parsing specific data sources


In [None]:
# Process OSPO++ members
def parse_opp(filepath):
    """Extract OSPO++ member names from the cached members markdown file.

    Every ``company name="..."`` occurrence in the file contributes one row.

    Parameters
    ----------
    filepath : str
        Path to the markdown file to read.

    Returns
    -------
    pandas.DataFrame
        A single column, ``'OSPO++ Member'``, holding the quoted names in the
        order they appear in the file.
    """
    member_pattern = re.compile(r"(?<=company name\=)\"(.*?)\"")
    with open(filepath, 'rt') as handle:
        contents = handle.read()
    members = member_pattern.findall(contents)
    return pd.DataFrame({'OSPO++ Member': members})

# Process OSPO Alliance members
def parse_alliance(filepath):
    """Extract OSPO Alliance member names and websites from the cached HTML.

    Names are taken from ``alt="..."`` attributes (a trailing `` logo`` is
    left out) and websites from ``href="..."`` attributes. The two lists are
    paired by position.

    Parameters
    ----------
    filepath : str
        Path to the HTML file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``'OSPO Alliance Member'`` and ``'Website'``.
    """
    name_pattern = re.compile(r"(?<=alt\=)\"(.*?)(?= logo|\")")
    site_pattern = re.compile(r"(?<=href\=)\"(.*?)(?=\")")
    with open(filepath, 'r') as handle:
        markup = handle.read()
    # NOTE(review): assumes the file has exactly one href per alt attribute,
    # in matching order — confirm against the fetched HTML.
    columns = {
        'OSPO Alliance Member': name_pattern.findall(markup),
        'Website': site_pattern.findall(markup),
    }
    return pd.DataFrame(columns)


### Load the data into Pandas


In [None]:
# Parse and process fetched data files.
# For every source whose cached file exists, load it into value['data'] using
# the handler that matches its declared format. Sources without a cached file
# are skipped and keep their existing 'data' value.
for key, value in sources.items():
    # Named file_format so the builtin format() is not shadowed in the
    # notebook's global namespace
    file_format = value['format']
    filepath = os.path.join(data_dir, ("%s.%s" % (key, file_format)))
    if not os.path.isfile(filepath):
        continue
    if file_format == 'csv':
        value['data'] = pd.read_csv(filepath)
    elif file_format == 'json':
        # The records sit under the top-level 'data' key; flatten them into
        # one row per record
        value['data'] = pd.read_json(filepath)
        value['data'] = pd.json_normalize(value['data']['data'])
    elif file_format in ('yml', 'yaml'):
        print("file %s has no currently implemented handler" % (key))
    elif file_format == 'md':
        if key == 'OSPOPlusPlus':
            value['data'] = parse_opp(filepath)
        else:
            print("file %s has no currently implemented handler" % (key))
    elif file_format == 'html':
        if key == 'OSPOAlliance':
            value['data'] = parse_alliance(filepath)
        else:
            print("file %s has no currently implemented handler" % (key))
    else:
        print("file format %s has no currently implemented handler" % (file_format))
            

### Preview the data


In [None]:
# Show up to five random rows from each source.
for key, value in sources.items():
    print(value['name'], "\n------")
    if value['data'] is None:
        # The source was never loaded (e.g. its download was skipped)
        print("(no data loaded)")
    else:
        # sample() raises when asked for more rows than the frame holds, so
        # cap the sample size at the number of rows available
        pp.pprint(value['data'].sample(min(5, len(value['data']))))
    print("\n")


### Clean up the data


In [None]:
# --- S&P 500 ---
# Align the column names with the other sources, flag every row as an index
# member, label the country, and keep only the comparison columns in a
# readable order.
order = ['company', 'sector', 'country', 'in S&P 500']
sources['SP500']['data'] = (
    sources['SP500']['data']
    .rename(columns={'Security': 'company', 'GICS Sector': 'sector'})
    .assign(**{'in S&P 500': True, 'country': "United States"})
    .reindex(order, axis=1)
)

# --- OSCI ---
# Keep only the columns we need, rename them so they cannot clash with the
# S&P 500 columns, and add a membership flag.
keep_cols = ['company', 'position', 'industry']
order = ['company', 'OSCI sector', 'OSCI position', 'in OSCI']
sources['OSCI']['data'] = (
    sources['OSCI']['data']
    .filter(keep_cols)
    .rename(columns={'position': 'OSCI position', 'industry': 'OSCI sector'})
    .assign(**{'in OSCI': True})
    .reindex(order, axis=1)
)

# --- OSPO landscape (TODO Group) ---
# Rename and trim the columns, add a membership flag, then keep only the rows
# whose 'TODO status' mentions "adopter" (case-insensitive).
order = ['company', 'TODO country', 'market cap',
         'TODO status', 'in TODO landscape']
sources['TODO']['data'] = (
    sources['TODO']['data']
    .rename(columns={
        'Name': 'TODO status',
        'Organization': 'company',
        'Market Cap': 'market cap',
        'Crunchbase Country': 'TODO country'
    })
    .filter(['TODO status', 'company', 'market cap', 'TODO country'])
    .assign(**{'in TODO landscape': True})
    .loc[lambda todo: todo['TODO status'].str.contains("adopter", case=False)]
    .reindex(order, axis=1)
)

# --- OSPO++ ---
# Use the shared 'company' key and add a membership flag.
sources['OSPOPlusPlus']['data'] = (
    sources['OSPOPlusPlus']['data']
    .rename(columns={'OSPO++ Member': 'company'})
    .assign(**{'in OSPO++ landscape': True})
)

# --- OSPO Alliance ---
# Use the shared 'company' key, lower-case the website column, and add a
# membership flag.
sources['OSPOAlliance']['data'] = (
    sources['OSPOAlliance']['data']
    .rename(columns={'OSPO Alliance Member': 'company', 'Website': 'website'})
    .assign(**{'in OSPO Alliance': True})
)


### Preview the data


In [None]:
# Report the row count of each cleaned source and show up to five random rows.
for key, value in sources.items():
    if value['data'] is None:
        # The source was never loaded, so there is nothing to count or sample
        print(value['name'], " has no data loaded" "\n------")
        print("\n")
        continue
    print(value['name'], " has ", len(value['data']), " items" "\n------")
    # sample() raises when asked for more rows than the frame holds, so cap
    # the sample size at the number of rows available
    pp.pprint(value['data'].sample(min(5, len(value['data']))))
    print("\n")


### Merge the data sources


In [None]:
# Final column layout for the merged table
col_order = ['company', 'sector', 'country', 'website', 'market cap', 'in S&P 500', 'in TODO landscape', 'in OSCI',
             'in OSPO++ landscape', 'in OSPO Alliance', 'TODO status', 'OSCI position']

# Outer-join every source on the company name, starting from the S&P 500 and
# adding OSCI, the TODO Group landscape, OSPO++ and the OSPO Alliance in turn.
# Where the S&P 500 gives no sector or country, fall back to the OSCI sector
# and the TODO country; the fallback columns are then dropped.
all_data = (
    sources['SP500']['data']
    .merge(sources['OSCI']['data'], on='company', how='outer')
    .merge(sources['TODO']['data'], on='company', how='outer')
    .merge(sources['OSPOPlusPlus']['data'], on='company', how='outer')
    .merge(sources['OSPOAlliance']['data'], on='company', how='outer')
    .assign(
        sector=lambda merged: merged['sector'].where(
            merged['sector'].notna(), merged['OSCI sector']),
        country=lambda merged: merged['country'].where(
            merged['country'].notna(), merged['TODO country']),
    )
    .drop(columns=['OSCI sector', 'TODO country'])
    .reindex(columns=col_order)
)

# Show the first ten companies alphabetically
all_data.sort_values(by='company').head(10)


### Do some manual cleanup for known issues


In [None]:
# Google is split across two Alphabet stock entries and Google, so we'll merge
# them into a single row.
# Select every row whose company name contains "Alphabet" or "Google"
# (case-insensitive).
googles = all_data.loc[all_data['company'].str.contains(
    'Alphabet|Google', case=False, regex=True)]
# Collapse the matched rows per country value, taking for each column the last
# entry in the group (pandas' GroupBy.last skips nulls by default).
# NOTE(review): this presumably yields one row named 'Google'; that holds only
# if the matched rows share a single non-null country and the 'Google' row
# comes last among them — confirm against the data.
google = googles.groupby('country', as_index=False).last()

# Update the data set and drop the extraneous entries.
# With 'company' as the index, update() overwrites matching rows of all_data
# in place with the non-null values from the collapsed row.
all_data.set_index('company', inplace=True)
all_data.update(google.set_index('company'))
all_data.reset_index(inplace=True)
# NOTE(review): this lookup is neither assigned nor the cell's last
# expression, so it has no visible effect.
all_data[all_data['company'] == 'Google']
# Drop the first two matched rows by their index labels.
# NOTE(review): assumes those two rows are the Alphabet entries and that the
# labels captured in googles still point at the same rows after the
# set_index/reset_index round trip — TODO confirm.
all_data.drop(googles.iloc[:2].index, inplace=True)


### Look at a larger sampling of the data


In [None]:
# Draw 25 random rows to eyeball the merged result
all_data.sample(n=25)


### Normalize the merged data


In [None]:
# Normalize the sectors across data sets.
# The result is assigned back to the column instead of calling
# mask(..., inplace=True) on all_data['sector']: an in-place edit through a
# column selection acts on an intermediate object under pandas Copy-on-Write
# and would leave all_data unchanged.
# NOTE(review): the trailing space in 'Banking, Insurance & Financial Services '
# is kept exactly as it was — confirm it matches the OSCI sector name.
all_data['sector'] = all_data['sector'].replace({
    'Information Technology': 'Technology',
    'Health Care': 'Healthcare & Pharma',
    'Financials': 'Banking, Insurance & Financial Services ',
})

# Fill null values with default ones where needed: membership flags become
# False, and descriptive fields get an explicit placeholder.
all_data = all_data.fillna(value={
    'in S&P 500': False,
    'in TODO landscape': False,
    'in OSCI': False,
    'OSCI position': 'na',
    'TODO status': 'na',
    'in OSPO++ landscape': False,
    'in OSPO Alliance': False,
    'market cap': 'unknown',
    'sector': 'unknown',
    'website': 'unknown',
    'country': 'unknown'
})


### Get a sneak peek at the data


In [None]:
# Pull 20 random rows and list them alphabetically by company
data_sample = all_data.sample(n=20)
data_sample.sort_values(by='company')


## Save processed data to file


In [None]:
# Write the merged table to data_derived/YYYY/MM/merged_data.csv, using the
# same year/month layout as the raw data directory.
output_dir = "data_derived/%s/%s/" % (today.year, ('%02d' % (today.month)))
output_file = os.path.join(output_dir, "merged_data.csv")

# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without a separate existence check beforehand
os.makedirs(output_dir, exist_ok=True)

# index=False leaves the row index out of the CSV
all_data.to_csv(output_file, index=False)


## Proceed to create visualizations

Head over to the [Visualizations notebook](Visualizations.ipynb) to generate some charts about the data.
