# FOCUSED Project: OSPO adoption

As part of the [FOCUSED Collaboration project](https://github.com/JumpsuitWizard/FOCUSED-Collaboration), this notebook examines Open Source Program Office (OSPO) adoption across the [Standard and Poor's 500 index](https://en.wikipedia.org/wiki/S%26P_500).

## Authors

- **PI**: Duane O'Brien
- **Researcher**: julia ferraioli
- **Analyst**: Reshama Shaikh

## Research question

## Methodology

## Data sources

The following data sources are used in the analysis:

- [S&P 500](https://github.com/datasets/s-and-p-500-companies/blob/master/data/constituents.csv)
- [OSCI Index](https://opensourceindex.io/)
- [OSPO Landscape](https://landscape.todogroup.org/)
- [OSPO++ Membership](https://ospoplusplus.org/about/members/)
- [OSPO Alliance Membership](https://ospo-alliance.org/membership/)


## Visualization setup

_Make sure you have run through the [Ingestion notebook](Ingestion.ipynb) first!_


In [None]:
from datetime import date
import numpy as np
import os
import pandas as pd
import plotly.express as px

# Load the merged dataset created by the Ingestion notebook.
# The directory is derived from today's year and zero-padded month, so the
# file is only found when it was written under the current month's folder.
today = date.today()
data_dir = "data_derived/%s/%02d/" % (today.year, today.month)
filename = 'merged_data.csv'
data_path = os.path.join(data_dir, filename)

try:
    data = pd.read_csv(data_path)
except FileNotFoundError as err:
    # Stop here with an actionable message: every later cell needs `data`,
    # and continuing would only surface an unrelated NameError.
    raise FileNotFoundError(
        "%s not found; have you run the Ingestion notebook?" % data_path
    ) from err

# Per-country and per-sector row totals, each as a two-column frame
# (<group column>, 'total') with a fresh 0..n-1 index.
country_counts = (
    data.groupby('country')['country']
    .count()
    .reset_index(name='total')
)
sector_counts = (
    data.groupby('sector')['sector']
    .count()
    .reset_index(name='total')
)

# Aggregate table: number of rows flagged for each dataset on its own,
# followed by selected combinations of datasets.

# Boolean flag columns, counted individually; the column name doubles as
# the category label.
dataset_flags = [
    'in S&P 500',
    'in TODO landscape',
    'in OSCI',
    'in OSPO++ landscape',
    'in OSPO Alliance',
]

# (category label, DataFrame.query expression) for each combination.
combination_queries = [
    ('in S&P and TODO landscape',
     '`in S&P 500` & `in TODO landscape`'),
    ('in S&P and OSCI',
     '`in S&P 500` & `in OSCI`'),
    ('in S&P and OSPO++ landscape',
     '`in S&P 500` & `in OSPO++ landscape`'),
    ('in S&P and OSPO Alliance',
     '`in S&P 500` & `in OSPO Alliance`'),
    ('in TODO landscape and OSCI',
     '`in TODO landscape` & `in OSCI`'),
    ('in TODO landscape and OSPO++ landscape',
     '`in TODO landscape` & `in OSPO++ landscape`'),
    ('in TODO landscape and OSPO Alliance',
     '`in TODO landscape` & `in OSPO Alliance`'),
    ('in OSCI and OSPO++ landscape',
     '`in OSCI` & `in OSPO++ landscape`'),
    ('in OSCI and OSPO Alliance',
     '`in OSCI` & `in OSPO Alliance`'),
    ('in OSPO++ and OSPO Alliance',
     '`in OSPO++ landscape` & `in OSPO Alliance`'),
    ('in TODO landscape, OSPO++ landscape, OR OSPO Alliance',
     '`in TODO landscape` | `in OSPO Alliance` | `in OSPO++ landscape`'),
    ('in all five',
     '`in S&P 500` & `in TODO landscape` & `in OSCI` & `in OSPO++ landscape` & `in OSPO Alliance`'),
]

aggregates = pd.DataFrame({
    'categories':
        dataset_flags + [label for label, _ in combination_queries],
    'count': (
        [len(data[data[flag]]) for flag in dataset_flags]
        + [len(data.query(expr)) for _, expr in combination_queries]
    ),
})


## Look at the crossover between the various datasets


In [None]:
# Overview bar charts, shown in order: companies per dataset category (and
# combination of categories), companies per country, companies per sector.
overview_charts = (
    (aggregates, 'categories', 'count',
     "Number of companies in each category",
     {'categories': 'dataset presence', 'count': '# of companies'}),
    (country_counts, 'country', 'total',
     "Breakdown of companies by country",
     {'total': '# of companies'}),
    (sector_counts, 'sector', 'total',
     "Breakdown of companies by sector",
     {'total': '# of companies'}),
)

for frame, x_col, y_col, chart_title, axis_labels in overview_charts:
    fig = px.bar(frame, x=x_col, y=y_col,
                 title=chart_title, labels=axis_labels)
    fig.show()


## Examine data across various vectors

### OSPOs by country


In [None]:
# Country x TODO landscape: per country, sum the 'in TODO landscape' flag and
# derive the complement from the per-country totals.
country_x_todo = data.groupby(
    'country', as_index=False)[['in TODO landscape']].sum()
country_x_todo['not in TODO landscape'] = country_counts['total'].sub(
    country_x_todo['in TODO landscape'])

fig = px.bar(
    country_x_todo,
    x='country',
    y=['in TODO landscape', 'not in TODO landscape'],
    title="country broken down by presence in TODO landscape",
)
fig.show()

# Country x OSPO Alliance: per country, sum the 'in OSPO Alliance' flag and
# derive the complement from the per-country totals.
country_x_alliance = data.groupby(
    'country', as_index=False)[['in OSPO Alliance']].sum()
country_x_alliance['not in OSPO Alliance'] = country_counts['total'].sub(
    country_x_alliance['in OSPO Alliance'])

fig = px.bar(
    country_x_alliance,
    x='country',
    y=['in OSPO Alliance', 'not in OSPO Alliance'],
    title="country broken down by presence in OSPO Alliance",
)
fig.show()

# Country x OSPO++: per country, sum the 'in OSPO++ landscape' flag and
# derive the complement from the per-country totals.
country_x_plusplus = data.groupby(
    'country', as_index=False)[['in OSPO++ landscape']].sum()
country_x_plusplus['not in OSPO++ landscape'] = country_counts['total'].sub(
    country_x_plusplus['in OSPO++ landscape'])

fig = px.bar(
    country_x_plusplus,
    x='country',
    y=['in OSPO++ landscape', 'not in OSPO++ landscape'],
    title="country broken down by presence in OSPO++ landscape",
)
fig.show()

# Country x OSPOs (aggregate): a row counts as "has an OSPO" when it is
# flagged in the TODO landscape, the OSPO Alliance, or the OSPO++ landscape.
ospos = data.query(
    '`in TODO landscape` | `in OSPO Alliance` | `in OSPO++ landscape`')

# Countries with no OSPO rows are absent from this grouped count, so it must
# be looked up by country name rather than subtracted positionally from
# country_counts (which lists every country).
ospo_by_country = ospos.groupby('country')['company'].count()

country_x_ospo = pd.DataFrame({
    'country': country_counts['country'],
    # Countries missing from the lookup have zero OSPO rows.
    'has an OSPO': (country_counts['country']
                    .map(ospo_by_country)
                    .fillna(0)
                    .astype(int)),
})
# Both operands share country_counts' index, so rows line up by country.
country_x_ospo['does not have an OSPO'] = country_counts['total'] - \
    country_x_ospo['has an OSPO']

px.bar(country_x_ospo, x='country',
       y=['has an OSPO',
          'does not have an OSPO'
          ],
       title="country broken down by prevalence of OSPOs"
       ).show()


### OSPOs by sector


In [None]:
# Sector x TODO landscape: per sector, sum the 'in TODO landscape' flag and
# derive the complement from the per-sector totals.
sector_x_todo = data.groupby(
    'sector', as_index=False)[['in TODO landscape']].sum()
sector_x_todo['not in TODO landscape'] = sector_counts['total'].sub(
    sector_x_todo['in TODO landscape'])

fig = px.bar(
    sector_x_todo,
    x='sector',
    y=['in TODO landscape', 'not in TODO landscape'],
    title="sector broken down by presence in TODO landscape",
)
fig.show()

# Sector x OSPO Alliance: per sector, sum the 'in OSPO Alliance' flag and
# derive the complement from the per-sector totals.
sector_x_alliance = data.groupby(
    'sector', as_index=False)[['in OSPO Alliance']].sum()
sector_x_alliance['not in OSPO Alliance'] = sector_counts['total'].sub(
    sector_x_alliance['in OSPO Alliance'])

fig = px.bar(
    sector_x_alliance,
    x='sector',
    y=['in OSPO Alliance', 'not in OSPO Alliance'],
    title="sector broken down by presence in OSPO Alliance",
)
fig.show()

# Sector x OSPO++ landscape: per sector, sum the 'in OSPO++ landscape' flag
# and derive the complement from the per-sector totals.
sector_x_plusplus = data.groupby(
    'sector', as_index=False)[['in OSPO++ landscape']].sum()
sector_x_plusplus['not in OSPO++ landscape'] = sector_counts['total'].sub(
    sector_x_plusplus['in OSPO++ landscape'])

fig = px.bar(
    sector_x_plusplus,
    x='sector',
    y=['in OSPO++ landscape', 'not in OSPO++ landscape'],
    title="sector broken down by presence in OSPO++ landscape",
)
fig.show()

# Sector x OSPOs (aggregate): a row counts as "has an OSPO" when it is
# flagged in the TODO landscape, the OSPO Alliance, or the OSPO++ landscape.
ospos = data.query(
    '`in TODO landscape` | `in OSPO Alliance` | `in OSPO++ landscape`')

# Sectors with no OSPO rows are absent from this grouped count, so it must be
# looked up by sector name rather than subtracted positionally from
# sector_counts (which lists every sector).
ospo_by_sector = ospos.groupby('sector')['company'].count()

sector_x_ospo = pd.DataFrame({
    'sector': sector_counts['sector'],
    # Sectors missing from the lookup have zero OSPO rows.
    'has an OSPO': (sector_counts['sector']
                    .map(ospo_by_sector)
                    .fillna(0)
                    .astype(int)),
})
# Subtract the per-sector OSPO counts (not the per-country ones); both
# operands share sector_counts' index, so rows line up by sector.
sector_x_ospo["does not have an OSPO"] = sector_counts['total'] - \
    sector_x_ospo['has an OSPO']

px.bar(sector_x_ospo, x='sector',
       y=['has an OSPO',
          'does not have an OSPO'
          ],
       title="sector broken down by prevalence of OSPOs"
       ).show()


### OSPOs represented in OSCI

In [None]:
# OSCI x TODO landscape: contingency table of the two boolean flags
# (rows: 'in OSCI', columns: 'in TODO landscape').
tf_matrix_todo = (
    data.groupby(['in OSCI', 'in TODO landscape'])
    .size()
    .unstack(fill_value=0)
    # Force a full False/True grid: if either flag only takes one value the
    # table would otherwise be smaller than the hard-coded axis labels below.
    .reindex(index=[False, True], columns=[False, True], fill_value=0)
)

# Rows are flipped so that 'True' is the first y entry, matching the y labels.
px.imshow(np.flip(tf_matrix_todo.to_numpy(), 0),
          labels=dict(x="in TODO landscape", y="in OSCI",
                      color="# of companies"),
          x=['False', 'True'],
          y=['True', 'False'],
          title="OSCI cross-referenced with TODO landscape"
          ).show()

# OSCI x OSPO Alliance: contingency table of the two boolean flags
# (rows: 'in OSCI', columns: 'in OSPO Alliance').
tf_matrix_alliance = (
    data.groupby(['in OSCI', 'in OSPO Alliance'])
    .size()
    .unstack(fill_value=0)
    # Force a full False/True grid: if either flag only takes one value the
    # table would otherwise be smaller than the hard-coded axis labels below.
    .reindex(index=[False, True], columns=[False, True], fill_value=0)
)

# Rows are flipped so that 'True' is the first y entry, matching the y labels.
px.imshow(np.flip(tf_matrix_alliance.to_numpy(), 0),
          labels=dict(x="in OSPO Alliance", y="in OSCI",
                      color="# of companies"),
          x=['False', 'True'],
          y=['True', 'False'],
          title="OSCI cross-referenced with OSPO Alliance"
          ).show()

# OSCI x OSPO++ landscape: contingency table of the two boolean flags
# (rows: 'in OSCI', columns: 'in OSPO++ landscape').
tf_matrix_plusplus = (
    data.groupby(['in OSCI', 'in OSPO++ landscape'])
    .size()
    .unstack(fill_value=0)
    # Force a full False/True grid: if either flag only takes one value the
    # table would otherwise be smaller than the hard-coded axis labels below.
    .reindex(index=[False, True], columns=[False, True], fill_value=0)
)

# Rows are flipped so that 'True' is the first y entry, matching the y labels.
px.imshow(np.flip(tf_matrix_plusplus.to_numpy(), 0),
          labels=dict(x="in OSPO++ landscape", y="in OSCI",
                      color="# of companies"),
          x=['False', 'True'],
          y=['True', 'False'],
          title="OSCI cross-referenced with OSPO++ landscape"
          ).show()

# OSCI x OSPOs (aggregate): a row "has OSPO" when it is flagged in the TODO
# landscape, the OSPO Alliance, or the OSPO++ landscape.
# Computed column-wise; a row-wise apply() is slower and returns a DataFrame
# rather than a Series when `data` is empty.
has_ospo = (data['in TODO landscape']
            | data['in OSPO Alliance']
            | data['in OSPO++ landscape'])

# Keeps the OSCI position alongside the flags as 'OSCI rank'.
ospos = pd.DataFrame({'in OSCI': data['in OSCI'],
                      'has OSPO': has_ospo,
                      'OSCI rank': data['OSCI position']})

tf_matrix_ospos = (
    ospos.groupby(['in OSCI', 'has OSPO'])
    .size()
    .unstack(fill_value=0)
    # Force a full False/True grid: if either flag only takes one value the
    # table would otherwise be smaller than the hard-coded axis labels below.
    .reindex(index=[False, True], columns=[False, True], fill_value=0)
)

# Rows are flipped so that 'True' is the first y entry, matching the y labels.
px.imshow(np.flip(tf_matrix_ospos.to_numpy(), 0),
          labels=dict(x="has OSPO", y="in OSCI",
                      color="# of companies"),
          x=['False', 'True'],
          y=['True', 'False'],
          title="OSCI cross-referenced with companies that have OSPOs"
          ).show()


In [None]:
# Compare the mean OSCI rank of rows with an OSPO against rows without one,
# restricted to rows flagged 'in OSCI'.
in_osci = ospos.query('`in OSCI`')
osci_x_ospos = pd.DataFrame({
    'has OSPO': in_osci['has OSPO'],
    'OSCI rank': in_osci['OSCI rank'].astype(float),
})

means = (
    osci_x_ospos.groupby('has OSPO')['OSCI rank']
    .mean()
    .reset_index()
)

fig = px.bar(
    means,
    x='has OSPO',
    y='OSCI rank',
    color='OSCI rank',
    title='Average OSCI rank for companies with and without an OSPO',
)
fig.show()
