# Diversity of Categories

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Loading data
The required inputs are the dataframe created in basic_exploration and the race codes file. The race codes file maps each race code to its corresponding description/title.

In [2]:
# Input file locations, given relative to this notebook's directory:
# the cleaned dataframe export and the race code -> title lookup table.
race_occ_path = r'../../data/race_codes.txt'
data_path = r'../../data/cps_clean.csv'

In [3]:
# The cleaned export is semicolon-delimited.
data_df = pd.read_csv(filepath_or_buffer=data_path, delimiter=';')

In [4]:
# The lookup file separates the race code from its title with a run of 16 spaces.
# pandas interprets any separator longer than one character as a regular
# expression, which only the Python parser engine supports. Requesting that
# engine explicitly avoids the ParserWarning emitted when pandas silently falls
# back from the default C engine; the parsed result is unchanged.
race_codes_df = pd.read_csv(
    race_occ_path,
    sep='                ',
    header=None,
    engine='python',
)
# The file has no header row, so name the two columns here.
race_codes_df.columns = ['race_code', 'race_title']

  race_codes_df = pd.read_csv(race_occ_path, sep='                ', header=None)


## How diverse are the occupation categories in the year 2023?
Create a dataframe that only contains needed information. The occupation category instead of the exact occupation is used, because there are 423 different occupations available. Occupation categories are based on OCC2010. This column provides a more standardized representation of the occupation and occupation categories. Originally it was introduced to ensure comparability among several years.

In [5]:
# Restrict the analysis to rows of survey year 2023.
is_2023 = data_df['YEAR'] == 2023
data_df = data_df.loc[is_2023]

In [6]:
# Number of distinct OCC2010 occupation codes among the remaining rows.
len(data_df['OCC2010'].unique())

423

In [33]:
# Working frame for the diversity analysis: one row per row of data_df, holding
# the race code, the OCC2010 occupation code, the occupation category and the
# ASECWT weight under shorter column names.
race_occ_df = pd.DataFrame(
    {
        'race_code': data_df['RACE'],
        'occ_code': data_df['OCC2010'],
        'occ_category': data_df['Category'],
        'ASECWT': data_df['ASECWT'],
    }
)

In [34]:
# Attach the human-readable race title to every row by joining on the race code
# (inner join, so rows whose code is missing from the lookup table are dropped).
race_occ_df = race_occ_df.merge(race_codes_df, how='inner', on='race_code')

In [35]:
# Store the two label columns as pandas categoricals.
race_occ_df = race_occ_df.astype(
    {'occ_category': 'category', 'race_title': 'category'}
)

### Diversity of entire dataset
As expected, the following shows that some races are overrepresented in the entire dataset. This is why it is important to incorporate the weight into the analysis to properly represent the actual distribution of races in the U.S.

In [36]:
# Unweighted number of rows per race title, most frequent first, as a two-column
# frame (race_title, total_counts).
rows_per_race = race_occ_df['race_title'].value_counts()
rows_per_race = rows_per_race.rename_axis('race_title')
race_counts = rows_per_race.reset_index(name='total_counts')
race_counts

Unnamed: 0,race_title,total_counts
0,White,50859
1,Black,7316
2,Asian only,4950
3,American Indian/Aleut/Eskimo,823
4,White-American Indian,436
5,Hawaiian/Pacific Islander only,397
6,White-Black,364
7,White-Asian,316
8,White-Hawaiian/Pacific Islander,64
9,Asian-Hawaiian/Pacific Islander,64


In [37]:
# Inspect the merged working frame; the notebook renders a truncated preview
# of it (leading and trailing rows).
race_occ_df

Unnamed: 0,race_code,occ_code,occ_category,ASECWT,race_title
0,100,9350,TRANSPORTATION AND MATERIAL MOVING,1441.89,White
1,100,9350,TRANSPORTATION AND MATERIAL MOVING,2360.80,White
2,100,9350,TRANSPORTATION AND MATERIAL MOVING,5545.15,White
3,100,9350,TRANSPORTATION AND MATERIAL MOVING,969.37,White
4,100,9350,TRANSPORTATION AND MATERIAL MOVING,4065.48,White
...,...,...,...,...,...
65819,820,4230,BUILDING AND GROUNDS CLEANING AND MAINTENANCE,1970.09,"Two or three races, unspecified"
65820,820,9000,TRANSPORTATION AND MATERIAL MOVING,552.30,"Two or three races, unspecified"
65821,817,7000,"INSTALLATION, MAINTENANCE, AND REPAIR",1632.34,White-American Indian-Hawaiian/Pacific Islander
65822,817,7630,"INSTALLATION, MAINTENANCE, AND REPAIR",659.69,White-American Indian-Hawaiian/Pacific Islander


### Plot
For each occupation category, plot the number of people per race working in that field. It can be observed that several categories are more diverse than others. 
* Art, Design, Entertainment, Sports and Media is very diverse. But maybe this is because this category combines many subcategories.
* Extraction + military are least diverse. They both contain only a few jobs (4 and 5).
* Food preparation and Serving + Healthcare are the most diverse categories.

Absolute numbers

In [None]:
# Number of rows for every (occupation category, race) combination in long format.
# Both grouping columns are categoricals; observed=False is passed explicitly so
# the result keeps a zero-count row for combinations that never occur (the
# historical default) without triggering pandas' FutureWarning about that default.
grouped_df = (
    race_occ_df
    .groupby(['occ_category', 'race_title'], observed=False)
    .size()
    .reset_index(name='count')
)

# The colour map does not depend on the category, so look it up once.
cmap = plt.cm.tab10

# One bar chart per occupation category: absolute number of rows per race.
for occupation_category, group_df in grouped_df.groupby('occ_category', observed=True):
    # Counts per race for this category, keeping only races that are actually
    # present, ordered from most to least frequent.
    counts = group_df.set_index('race_title')['count']
    counts = counts[counts != 0].sort_values(ascending=False)
    if counts.empty:
        # A category without any rows has nothing to plot.
        continue

    titles = list(counts.index)
    colors = cmap(np.arange(len(titles)) % cmap.N)

    fig, ax = plt.subplots()
    bar_container = ax.bar(titles, counts.to_numpy(), color=colors)
    ax.set(ylabel='count', title=f'{occupation_category}')
    ax.bar_label(bar_container, fmt='{:,.0f}')
    # Fix the tick positions together with their labels; calling
    # set_xticklabels() on its own is discouraged and emits a warning.
    ax.set_xticks(range(len(titles)), labels=titles, rotation=45, ha='right')

    # Render this figure now and release it, so the loop does not keep one open
    # figure per category alive until the end of the cell.
    plt.show()
    plt.close(fig)

Percentage

In [None]:
# Number of rows for every (occupation category, race) combination in long format.
# Both grouping columns are categoricals; observed=False is passed explicitly so
# the result keeps a zero-count row for combinations that never occur (the
# historical default) without triggering pandas' FutureWarning about that default.
grouped_df = (
    race_occ_df
    .groupby(['occ_category', 'race_title'], observed=False)
    .size()
    .reset_index(name='count')
)

# Total number of rows per race, keyed by the whitespace-stripped race title.
# Built once so each bar is a direct lookup instead of a scan of race_counts.
race_totals = pd.Series(
    race_counts['total_counts'].to_numpy(),
    index=race_counts['race_title'].str.strip(),
)

# The colour map does not depend on the category, so look it up once.
cmap = plt.cm.tab10

# One bar chart per occupation category: for every race, the share (in percent)
# of all rows of that race that fall into this category.
for occupation_category, group_df in grouped_df.groupby('occ_category', observed=True):
    # Counts per race for this category, keeping only races that are actually
    # present, ordered from most to least frequent.
    counts = group_df.set_index('race_title')['count']
    counts = counts[counts != 0].sort_values(ascending=False)
    if counts.empty:
        # A category without any rows has nothing to plot.
        continue

    titles = list(counts.index.str.strip())

    # Element-wise percentage; .loc raises KeyError if a title has no total,
    # so a mismatch between the two tables cannot pass silently.
    relative_array = 100 / race_totals.loc[titles].to_numpy() * counts.to_numpy()

    colors = cmap(np.arange(len(titles)) % cmap.N)

    fig, ax = plt.subplots()
    bar_container = ax.bar(titles, relative_array, color=colors)
    ax.set(ylabel='Percentage of race', title=f'Percentage of people belonging to specific race working in {occupation_category}')
    ax.bar_label(bar_container, fmt=lambda x: f'{x:.2f}%')
    # Fix the tick positions together with their labels; calling
    # set_xticklabels() on its own is discouraged and emits a warning.
    ax.set_xticks(range(len(titles)), labels=titles, rotation=45, ha='right')

    # Render this figure now and release it, so the loop does not keep one open
    # figure per category alive until the end of the cell.
    plt.show()
    plt.close(fig)

### Other

In [None]:
# Rows whose race title is exactly ' Black-American Indian-Asian'
# (the literal includes a leading space).
has_title = race_occ_df['race_title'] == ' Black-American Indian-Asian'
race_occ_df[has_title]

In [None]:
# Rows carrying race code 300.
has_code_300 = race_occ_df['race_code'] == 300
race_occ_df[has_code_300]

Unnamed: 0,race_code,occ_code,occ_category,race_title
63958,300,5610,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
63959,300,5610,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
63960,300,5610,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
63961,300,5610,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
63962,300,5610,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
...,...,...,...,...
64776,300,5165,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
64777,300,5165,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
64778,300,5165,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
64779,300,5165,OFFICE AND ADMINISTRATIVE SUPPORT,American Indian/Aleut/Eskimo
