# 2020 Stack Overflow Survey - Focus: Diversity

The following notebook examines the 2020 Stack Overflow survey with a focus on diversity, following the CRISP-DM process.

Questions:
1. What is the demographic makeup of today's developer community? What profile is typical of a person writing code these days?
2. How inclusive is the community? Do underrepresented groups feel equally welcome?
3. Are there differences regarding compensation? Is there a gender pay gap?

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Apply seaborn's default theme with a white grid background to all plots
sns.set(style="whitegrid")

#Load the survey responses and preview the first rows
df = pd.read_csv('./survey_results_public.csv')
df.head()

### Examining original survey questions

In [None]:
#Disable truncation of cell contents and row counts so each survey question is readable in full
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

#Load the survey schema and display it
df_schema = pd.read_csv('./survey_results_schema.csv')
df_schema

In [None]:
#Dimensions of the dataset as (rows, columns)
df.shape

In [None]:
#Names of all columns in the dataset
df.columns

### Missing values

In [None]:
#Show every row of the result instead of a truncated preview
pd.options.display.max_rows = None

#Share of missing values per column, highest first
df.isna().mean().sort_values(ascending=False)

#### Finding: 
There are missing values in the columns relevant for examining diversity groups:
Sexuality (0.317541), Age (0.294985), Ethnicity (0.287197), Trans(gender) (0.234498) and Gender (0.215696). Because this notebook only deals with descriptive statistics, no manipulation of these missing values is undertaken at this point.

## QUESTION 1: Demographic setup

### Gender

In [None]:
#Collapse every answer containing ';' (several options selected) into one category
df['Gender'] = df['Gender'].replace(r'(.*;.*)', 'Multiple answers', regex=True)

#Count respondents per category and draw the shares as a pie chart
gender_counts = df['Gender'].value_counts()
gender_counts.plot.pie(
    figsize=(10, 10),
    explode=(0, 0.1, 0.1, 0.5),  #one offset per slice, in value_counts order
    autopct="%.1f%%");

### Trans(gender)

In [None]:
#Distribution of the answers in the Trans column, expressed in percent
trans_counts = df['Trans'].value_counts()
trans_counts.div(trans_counts.sum()).mul(100)

### Sexuality

In [None]:
#Collapse every answer containing ';' (several options selected) into one category
df['Sexuality'] = df['Sexuality'].replace(r'(.*;.*)', 'Multiple answers', regex=True)

#Count respondents per category and draw the shares as a pie chart
sexuality_counts = df['Sexuality'].value_counts()
sexuality_counts.plot.pie(
    figsize=(10, 10),
    explode=(0, 0.1, 0.1, 0.1, 0.3),  #one offset per slice, in value_counts order
    autopct="%.1f%%");

### Ethnicity

In [None]:
#Collapse every answer containing ';' (several options selected) into one category.
#No dropna() is needed here: missing values pass through replace() unchanged, and
#assigning back to the column would re-align a shortened Series to the full index anyway.
df['Ethnicity'] = df['Ethnicity'].replace(to_replace=r'(.*;.*)', value='Multiple answers', regex=True)

#Shorten the long answer starting with "Indigenous" to a single word so the pie label stays readable
df['Ethnicity'] = df['Ethnicity'].replace(to_replace=r'(Indigenous.*)', value='Indigenous', regex=True)

#Count respondents per category and draw the shares as a pie chart
ethnicity_counts = df['Ethnicity'].value_counts()

ethnicity_counts.plot.pie(
    autopct="%.1f%%", 
    explode=(0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5),  #one offset per slice, in value_counts order
    figsize=(10, 10));

### Age by Gender Group

In [None]:
#Five-year age buckets from 15 to 60
age_bins = list(range(15, 61, 5))

#Ages of respondents who answered "Man" / "Woman", without missing values
age_men = df.loc[df['Gender'] == "Man", 'Age'].dropna()
age_women = df.loc[df['Gender'] == "Woman", 'Age'].dropna()

#Compare both normalized age distributions in one labelled histogram
gender_label = ['Men', 'Women']
plt.hist([age_men, age_women], bins=age_bins, label=gender_label, density=True)
plt.legend();

In [None]:
#Median age of each gender group, rounded to whole years
median_age_men = round(age_men.median())
median_age_women = round(age_women.median())

#Report both medians in one sentence
print(f'The median age is: {median_age_women}-years old (women) and {median_age_men}-years old (men).')

## Average developer

In [None]:
#"Stereotypical" developer profile: most frequent value (mode) for the categorical columns ...
avg_gender = df['Gender'].mode()[0]
avg_sexuality = df['Sexuality'].mode()[0]
avg_ethnicity = df['Ethnicity'].mode()[0]
avg_trans = df['Trans'].mode()[0]

#... and the rounded median for the numerical Age column
avg_age = round(df['Age'].median())

print(f'The average developer is a {avg_age}-year old, {avg_sexuality}, {avg_trans}-trans, {avg_ethnicity} {avg_gender}.')

## QUESTION 2: Inclusiveness of Stack Overflow community

In [None]:
#Prefix each answer with a number so that the categories sort in a fixed order later
socomm_numbered = {
    'Yes, definitely': '1 - Yes, definitely',
    'Yes, somewhat': '2 - Yes, somewhat',
    'Neutral': '3 - Neutral',
    'No, not really': '4 - No, not really',
    'No, not at all': '5 - No, not at all',
    'Not sure': '6 - Not sure',
}
df['SOComm'] = df['SOComm'].replace(socomm_numbered)

#Restrict to respondents who have a Stack Overflow account
so_members = df[df['SOAccount'] == "Yes"]

#Number of account holders per (gender, answer) pair and per gender
members_only = so_members.groupby(['Gender', 'SOComm'])['Respondent'].count()
members_gender = so_members.groupby(['Gender'])['Respondent'].count()

#Share of each answer within its gender group (answer count divided by the gender's account holders)
feeling_member = members_only.div(members_gender, level="Gender")
feeling_member

In [None]:
#Horizontal stacked bars, one per gender group, with the legend placed at the centre right
feeling_member.unstack().plot.barh(stacked=True).legend(loc="center right");

## QUESTION 3: Salary

### Salaries across countries

In [None]:
#Compensation buckets of 20,000 each, from 0 up to 240,000
comp_bins = list(range(0, 260000, 20000))

#Converted compensation per country, without missing values
comp_usa = df.loc[df['Country'] == "United States", 'ConvertedComp'].dropna()
comp_uk = df.loc[df['Country'] == "United Kingdom", 'ConvertedComp'].dropna()
comp_india = df.loc[df['Country'] == "India", 'ConvertedComp'].dropna()

#Compare the three normalized distributions in one labelled histogram
country_label = ['USA', 'UK', 'India']
plt.hist([comp_usa, comp_uk, comp_india], bins=comp_bins, label=country_label, density=True);
plt.legend();

In [None]:
#Median compensation per country, rounded to whole numbers
median_comp_india = round(comp_india.median())
median_comp_uk = round(comp_uk.median())
median_comp_usa = round(comp_usa.median())

#Report the three medians in one sentence
print(f'The median compensation in USD is: ${median_comp_usa} (USA), ${median_comp_uk} (UK), ${median_comp_india} (India)')

### Salary by gender in United States

In [None]:
#Filter dataset for United States
usa = df[df['Country'].eq("United States")]

#Create gender groups. The masks are built from the filtered frame itself so that mask and
#data share the same index, instead of relying on pandas re-aligning a mask from the full df.
comp_usa_men = usa.loc[usa['Gender'].eq("Man"), 'ConvertedComp'].dropna()
comp_usa_women = usa.loc[usa['Gender'].eq("Woman"), 'ConvertedComp'].dropna()

#Plot as labelled histogram and show legend (comp_bins and gender_label come from earlier cells)
plt.hist([comp_usa_men, comp_usa_women], bins=comp_bins, density=True, label=gender_label)
plt.legend();

In [None]:
#Median US compensation per gender group, rounded to whole numbers
median_comp_usa_men = round(comp_usa_men.median())
median_comp_usa_women = round(comp_usa_women.median())

#Report both medians in one sentence
print(f'The median compensation in USD is: ${median_comp_usa_women} for women and ${median_comp_usa_men} for men.')

In [None]:
#Map the two textual answers to numeric strings, then convert the whole column to numbers;
#anything that still cannot be parsed becomes NaN (errors='coerce')
years_text_to_number = {'Less than 1 year': '0', 'More than 50 years': '51'}
df['YearsCodePro'] = pd.to_numeric(
    df['YearsCodePro'].replace(years_text_to_number),
    errors='coerce')

#Histogram of the numeric column
df['YearsCodePro'].plot.hist();

In [None]:
#Summary statistics of the YearsCodePro column
df['YearsCodePro'].describe()

In [None]:
#YearsCodePro values of respondents who answered "Man" / "Woman", without missing values
code_men = df.loc[df['Gender'] == "Man", 'YearsCodePro'].dropna()
code_women = df.loc[df['Gender'] == "Woman", 'YearsCodePro'].dropna()

#Compare both normalized distributions in one labelled histogram (gender_label comes from an earlier cell)
plt.hist([code_men, code_women], label=gender_label, density=True)
plt.legend();

In [None]:
#Median YearsCodePro per gender group, rounded to whole years
median_code_men = round(code_men.median())
median_code_women = round(code_women.median())

#Report both medians in one sentence
print(f'The median professional coding experience is: {median_code_women} years for women and {median_code_men} years for men.')

In [None]:
#Create experience bins and show counts to evaluate if group sizes are large enough
years_bins = [0, 2, 5, 10, 15, 51]

#Restrict to United States respondents, as the variable name and this section state.
#The rows are re-filtered from df (rather than reusing an earlier subset) so that the
#numeric YearsCodePro column is used.
usa_respondents = df[df['Country'].eq("United States")]

#include_lowest=True keeps respondents with 0 years ("Less than 1 year") in the first bin;
#with the default they fall outside (0, 2] and are silently dropped.
usa_experience_bins = pd.cut(usa_respondents['YearsCodePro'], years_bins, include_lowest=True)

#observed=False keeps empty gender/bin combinations in the table (shown as 0)
usa_years_gender = usa_respondents.groupby(['Gender', usa_experience_bins], observed=False)
usa_years_gender.size().unstack()

In [None]:
#Cut coding experience into bins; include_lowest=True keeps respondents with 0 years
#("Less than 1 year") in the first bin instead of dropping them
df['ProCodingExperience'] = pd.cut(df['YearsCodePro'], years_bins, include_lowest=True)

#Restrict to United States respondents (compensation differs strongly between countries)
#and remove gender groups with too low numbers
is_usa = df['Country'].eq("United States")
is_small_group = df['Gender'].isin(["Multiple answers", "Non-binary, genderqueer, or gender non-conforming"])
usa_men_women = df[is_usa & ~is_small_group]

#Create seaborn boxplot and hide outliers to improve readability
sns.boxplot(
    x="ProCodingExperience", 
    y="ConvertedComp", 
    hue="Gender", 
    data=usa_men_women, 
    showfliers=False);