In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the dataset as a dataframe
original_df = pd.read_csv('2017 Pew Research Center STEM survey.csv')
original_df.head()

Unnamed: 0.1,Unnamed: 0,CaseID,weight,WORK_1,WORK_2,WORK_3,WORK_4,EMPLOYED,FULLPART,SELFEMPLOYED,...,SCICOUR2_t,MATHCOUR2_t,PPT017_t,PPT18OV_t,PPHHSIZE_t,EDUC4CAT,RACE_col,RECONA_col,RECONB_col,RECONC_col
0,0,3.0,0.0707,1.0,2.0,2.0,2.0,1.0,1.0,2.0,...,1.0,1.0,0.0,1.0,1.0,3.0,1.0,1.0,,
1,1,4.0,0.3938,1.0,2.0,2.0,2.0,1.0,1.0,2.0,...,12.0,8.0,2.0,2.0,4.0,4.0,1.0,,,2.0
2,2,5.0,1.7321,2.0,1.0,2.0,2.0,1.0,2.0,2.0,...,98.0,98.0,0.0,2.0,2.0,3.0,3.0,,,
3,3,6.0,0.1478,1.0,2.0,2.0,1.0,1.0,1.0,1.0,...,15.0,6.0,0.0,2.0,2.0,4.0,1.0,,,
4,4,7.0,1.5842,2.0,2.0,2.0,1.0,1.0,2.0,1.0,...,3.0,3.0,0.0,2.0,2.0,4.0,3.0,,,


In [3]:
original_df.shape

(4914, 221)

In [4]:
# Show the dataframe columns
result = str(list(original_df.columns))
display(HTML(result))


In [5]:
# Rename columns
original_df.rename(columns={'IDEO': 'IDEOLOGY'}, inplace=True)
original_df.rename(columns={'PPGENDER': 'GENDER'}, inplace=True)
original_df.rename(columns={'PPETHM': 'ETHNICITY'}, inplace=True)
original_df.rename(columns={'RACE_col': 'RACE'}, inplace=True)
original_df.rename(columns={'FAMSTEM2_1': 'STEM_FAMILY_BACKGROUND'}, inplace=True)
original_df.rename(columns={'WORKTYPE_FINAL': 'STEM_PROFESSION'}, inplace=True)
original_df.rename(columns={'HH_INCOME_col': 'INCOME'}, inplace=True)
original_df.rename(columns={'PPREG4': 'REGION'}, inplace=True)
original_df.rename(columns={'ppagecat': 'AGE'}, inplace=True)
original_df.rename(columns={'ppagect4': 'AGE_REDUCED'}, inplace=True)
original_df.rename(columns={'EDUC4CAT': 'EDUCATION'}, inplace=True)
original_df.rename(columns={'RECONC_col': 'SCIENCE_AREA'}, inplace=True)

In [6]:
# Merge columns and replacing undefined values
original_df['PARTY'] = np.where(original_df['PARTY'].isin([3.,4.,]),
                                original_df['PARTYLN'],
                                original_df['PARTY'])

In [7]:
# Keep a numerical ideology column
original_df['CONSERVATIVE-LIBERAL'] = original_df['IDEOLOGY']

In [8]:
# Replace values
original_df['IDEOLOGY'] = original_df['IDEOLOGY'].replace({1.: 'Very conservative', 2.: 'Conservative',
                                                           3.: 'Moderate', 4.: 'Liberal', 5.: 'Very liberal'})
original_df['GENDER'] = original_df['GENDER'].replace({1.: 'male', 2.: 'female'})
original_df['ETHNICITY'] = original_df['ETHNICITY'].replace({1.: 'White', 2.: 'Black', 3.: 'Asian', 4.: 'Hispanic', 5.: '2+ races'})
original_df['RACE'] = original_df['RACE'].replace({1.: 'White', 2.: 'Black', 3.: 'Asian', 4.: 'Other'})
original_df['STEM_FAMILY_BACKGROUND'] = original_df['STEM_FAMILY_BACKGROUND'].fillna(0.).replace({0.: 'no', 1.: 'yes'})
original_df['STEM_PROFESSION'] = original_df['STEM_PROFESSION'].replace({1.: 'yes', 2.: 'no', 3: 'no'}) #3.: 'unemployed
original_df['PARTY'] = original_df['PARTY'].replace({1.: 'Republican', 2.: 'Democrat'})
original_df['INCOME'] = original_df['INCOME'].replace({1.: '<30,000', 2.: '30,000-49,999',
                                                       3.: '50,000-74,999', 4.: '75,000-99,999',
                                                       5.: '>=100,000'})
original_df['REGION'] = original_df['REGION'].replace({1.: 'Northeast', 2.: 'Midwest', 3.: 'South', 4.: 'West'})
original_df['AGE'] = original_df['AGE'].replace({1.: '18-24', 2.: '25-34', 3.: '35-44', 4.: '45-54', 5.: '55-64', 6.: '65-74', 7.: '75+'})
original_df['AGE_REDUCED'] = original_df['AGE_REDUCED'].replace({1.: '18-29', 2.: '30-44', 3.: '45-59', 4.: '60+'})
original_df['EDUCATION'] = original_df['EDUCATION'].replace({1.: 'High school graduate or less', 2.: 'Some college, including Associate degree',
                                                             3.: 'Bachelors degree', 4.: 'Masters, Professional or Doctorate degree'})
original_df['SCIENCE_AREA'] = original_df['SCIENCE_AREA'].replace({1.: 'Life sciences', 2.: 'Physical sciences', 3.: 'Social sciences/Other'})

In [9]:
# Create new ethnicity and race-related columns
original_df = original_df.dropna(subset=['ETHNICITY', 'RACE'])
original_df['ETHNICITY_GROUPED'] = np.where(original_df['ETHNICITY'] != 'White', 'Non-White', original_df['ETHNICITY'])
original_df['ETHNICITY_GROUPED_BLACK-HISPANICS'] = np.where(original_df['ETHNICITY'].isin(['White', 'Asian']), 'Black/Hispanic', 'Asian/White')
original_df['RACE_GROUPED'] = np.where(original_df['RACE'] != 'White', 'Non-White', original_df['RACE'])

In [10]:
original_df['ETHNICITY_GROUPED_BLACK-HISPANICS'].value_counts()

ETHNICITY_GROUPED_BLACK-HISPANICS
Black/Hispanic    3899
Asian/White       1015
Name: count, dtype: int64

In [11]:
# Filter ethnicity with value '2+ races'
original_df = original_df[original_df['ETHNICITY'] != '2+ races']

In [12]:
# Filter undefined ideology or political party
original_df_with_defined_political_affiliation = original_df[(original_df['IDEOLOGY'] != 9.0) & (original_df['PARTY'] != 9.0)]

### Dataframe question 1
Com es relaciona el nivell educatiu i els ingressos amb el gènere, l'etnicitat i l'afiliació política? Amb l'objectiu de determinar si el biaix representatiu patit a les àrees STEM és present també entre els enquestats que no pertanyen a aquestes professions i amb un alt poder d'adquisició.

In [13]:
# Generate a new dataframe with the appropriate questions from the original one
question1_df = original_df_with_defined_political_affiliation[['REGION',
                                                               'EDUCATION', 'INCOME',
                                                               'STEM_PROFESSION',
                                                               'GENDER',
                                                               'ETHNICITY', 'ETHNICITY_GROUPED', 'RACE', 'RACE_GROUPED',
                                                               'IDEOLOGY', 'PARTY']]

In [14]:
question1_df.head()

Unnamed: 0,REGION,EDUCATION,INCOME,STEM_PROFESSION,GENDER,ETHNICITY,ETHNICITY_GROUPED,RACE,RACE_GROUPED,IDEOLOGY,PARTY
0,West,Bachelors degree,"75,000-99,999",yes,female,White,White,White,White,Very liberal,Democrat
1,West,"Masters, Professional or Doctorate degree",">=100,000",yes,female,White,White,White,White,Liberal,Democrat
2,West,Bachelors degree,">=100,000",no,female,Asian,Non-White,Asian,Non-White,Liberal,Democrat
3,South,"Masters, Professional or Doctorate degree",">=100,000",yes,female,White,White,White,White,Liberal,Democrat
4,West,"Masters, Professional or Doctorate degree",">=100,000",no,male,Asian,Non-White,Asian,Non-White,Liberal,Democrat


In [15]:
question1_df.shape

(4491, 11)

In [16]:
# Export to CSV file
question1_df.to_csv('question_1.csv', index=False)

### Dataframe question 2
Com es distribueixen les diferents titulacions STEM per gènere i grup ètnic? Per tal d'identificar quins grups de titulacions gaudeixen de més diversitat, així com les àrees de coneixement amb menor representativitat dels col·lectius minoritzats.

In [17]:
# Generate a new dataframe with the appropriate questions from the original one
question2_df = original_df[['OCCUPATION_col',
                            'REGION',
                            'STEM_PROFESSION', 'EDUCATION', 'INCOME',
                            'GENDER',
                            'ETHNICITY', 'ETHNICITY_GROUPED', 'RACE', 'RACE_GROUPED']]

In [18]:
# Rename columns
question2_df.rename(columns={'OCCUPATION_col': 'OCCUPATION'}, inplace=True)

In [19]:
# Replace values
'''
STEM occupations included the following job categories: computer and mathematical; architecture and engineering;
life and physical sciences; medical doctors; other health care practitioners; health care technologists and technicians;
and teachers (both K-12 and postscondary) who specialize in a STEM subject.
'''
question2_df['OCCUPATION'] = question2_df['OCCUPATION'].replace({1.: 'Life Management', 2.: 'Business and Financial Operations',
                                                                 3.: 'Computer and Mathematical', 4.: 'Architecture and Engineering',
                                                                 5.: 'Life, Physical, and Social Sciences', 6.: 'Community and Social Services',
                                                                 7.: 'Lawyer or Judge',
                                                                 8.: 'Teacher, except college and university', 9.: 'Teacher, college and university',
                                                                 10.: 'Other Professional',
                                                                 11.: 'Medical Doctor', 12.: 'Other Health Care Practitioner',
                                                                 13.: 'Health Technologist or Technician', 14.: 'Health Care Support',
                                                                 15.: 'Protective Service', 16.: 'Food Preparation and Serving',
                                                                 17.: 'Building and Grounds Cleaning and Maintenance', 18.: 'Personal Care and Service',
                                                                 19.: 'Sales Representative', 20.: 'Retail Sales', 21.: 'Other Sales',
                                                                 22.: 'Office and Administrative Support', 23.: 'Farming, Forestry, and Fishing',
                                                                 24.: 'Construction and Extraction', 25.: 'Installation, Maintenance, and Repair', 26.: 'Precision Production',
                                                                 27.: 'Transportation and Material Moving', 28.: 'Armed Forces',
                                                                 29.: 'Other'})

In [20]:
# Filter only STEM professionals
question2_df = question2_df[question2_df['STEM_PROFESSION'] == 'yes']

In [21]:
# Group all teaching-related occupations
question2_df['OCCUPATION'] = question2_df['OCCUPATION'].apply(lambda x: 'Teacher' if x in ['Teacher, college and university', 'Teacher, except college and university'] else x)
question2_df['OCCUPATION'].value_counts()

OCCUPATION
Other Health Care Practitioner         720
Computer and Mathematical              675
Architecture and Engineering           324
Health Technologist or Technician      243
Life, Physical, and Social Sciences    112
Teacher                                101
Medical Doctor                          98
Name: count, dtype: int64

In [22]:
question2_df.head()

Unnamed: 0,OCCUPATION,REGION,STEM_PROFESSION,EDUCATION,INCOME,GENDER,ETHNICITY,ETHNICITY_GROUPED,RACE,RACE_GROUPED
0,Computer and Mathematical,West,yes,Bachelors degree,"75,000-99,999",female,White,White,White,White
1,"Life, Physical, and Social Sciences",West,yes,"Masters, Professional or Doctorate degree",">=100,000",female,White,White,White,White
3,Teacher,South,yes,"Masters, Professional or Doctorate degree",">=100,000",female,White,White,White,White
6,Architecture and Engineering,Midwest,yes,"Some college, including Associate degree",">=100,000",male,White,White,White,White
7,Computer and Mathematical,West,yes,Bachelors degree,"75,000-99,999",male,White,White,White,White


In [23]:
question2_df.shape

(2273, 10)

In [24]:
# Export to CSV file
question2_df.to_csv('question_2.csv', index=False)

### Dataframe question 3
Quins són els motius que apropen i allunyen les persones a desenvolupar una carrera professional en STEM? Per a esbrinar quins són els factors que empenyen el jovent a decantar-se per una carrera STEM o allunyada en el sector. En aquest punt caldrà fer una distinció addicional per gènere i grup ètnic, per tal d'analitzar si gènere i etnicitat tenen alguna influència en aquest aspecte.

In [25]:
# Generate a new dataframe with the appropriate questions from the original one
question3_df = original_df[['SCH7', 'SCH8a', 'SCH8b', 'SCH9a', 'SCH9b',
                            'SCH10B_1', 'SCH10B_2', 'SCH10B_3', 'SCH10B_4', 'SCH10B_5',
                            'REGION',
                            'STEM_FAMILY_BACKGROUND', 'STEM_PROFESSION',
                            'GENDER',
                            'ETHNICITY', 'ETHNICITY_GROUPED', 'RACE', 'RACE_GROUPED']]

In [26]:
question3_df['SCH8a'].value_counts()

SCH8a
1.0    3791
2.0     945
9.0      19
Name: count, dtype: int64

In [27]:
# Rename columns
question3_df.rename(columns={'SCH7': 'reasons_to_dislike'}, inplace=True)
question3_df.rename(columns={'SCH8a': 'disliked_sience_classes'}, inplace=True)
question3_df.rename(columns={'SCH8b': 'disliked_math_classes'}, inplace=True)
question3_df.rename(columns={'SCH9a': 'science_dislike_reason'}, inplace=True)
question3_df.rename(columns={'SCH9b': 'maths_dislike_reason'}, inplace=True)
question3_df.rename(columns={'SCH10B_1': 'sciences_hard'}, inplace=True)
question3_df.rename(columns={'SCH10B_2': 'sciences_useless'}, inplace=True)
question3_df.rename(columns={'SCH10B_3': 'sciences_not_belonged'}, inplace=True)
question3_df.rename(columns={'SCH10B_4': 'sciences_disliked_labs_and_practice'}, inplace=True)
question3_df.rename(columns={'SCH10B_5': 'sciences_not_support_outside_school'}, inplace=True)

In [28]:
# Replace values
question3_df[['disliked_sience_classes',
              'disliked_math_classes']] = question3_df[['disliked_sience_classes',
                                                        'disliked_math_classes']].replace({1.: 'no', 2.: 'yes'})
question3_df['reasons_to_dislike'] = question3_df['reasons_to_dislike'].replace({1.: 'hard', 2.: 'boring',
                                                                                 3.: 'not useful', 4.: 'other'})
question3_df[['science_dislike_reason',
              'maths_dislike_reason']] = question3_df[['science_dislike_reason',
                                         'maths_dislike_reason']].replace({1.: 'teaching',
                                                                           2.: 'subject'})
question3_df[['sciences_hard',
              'sciences_useless',
              'sciences_not_belonged',
              'sciences_disliked_labs_and_practice',
              'sciences_not_support_outside_school']] = question3_df[['sciences_hard',
                                                                      'sciences_useless',
                                                                      'sciences_not_belonged',
                                                                      'sciences_disliked_labs_and_practice',
                                                                      'sciences_not_support_outside_school']].replace({0.: 'no',
                                                                                                                       1.: 'yes'})

In [29]:
# Create new column
question3_df['disliked_maths_and_sciences'] = np.where((question3_df['disliked_sience_classes'] == 'yes') &
                                                       (question3_df['disliked_math_classes'] == 'yes'), 'yes', 'no')

question3_df.head()

Unnamed: 0,reasons_to_dislike,disliked_sience_classes,disliked_math_classes,science_dislike_reason,maths_dislike_reason,sciences_hard,sciences_useless,sciences_not_belonged,sciences_disliked_labs_and_practice,sciences_not_support_outside_school,REGION,STEM_FAMILY_BACKGROUND,STEM_PROFESSION,GENDER,ETHNICITY,ETHNICITY_GROUPED,RACE,RACE_GROUPED,disliked_maths_and_sciences
0,hard,yes,yes,subject,subject,yes,yes,yes,yes,yes,West,no,yes,female,White,White,White,White,yes
1,boring,no,no,subject,subject,,,,,,West,yes,yes,female,White,White,White,White,no
2,hard,no,no,subject,subject,,,,,,West,yes,no,female,Asian,Non-White,Asian,Non-White,no
3,hard,no,no,subject,subject,,,,,,South,no,yes,female,White,White,White,White,no
4,hard,no,no,subject,subject,,,,,,West,yes,no,male,Asian,Non-White,Asian,Non-White,no


In [30]:
# Filter those who liked science science classes
question3_df = question3_df[question3_df['disliked_sience_classes'] == 'yes']

In [31]:
question3_df.head()

Unnamed: 0,reasons_to_dislike,disliked_sience_classes,disliked_math_classes,science_dislike_reason,maths_dislike_reason,sciences_hard,sciences_useless,sciences_not_belonged,sciences_disliked_labs_and_practice,sciences_not_support_outside_school,REGION,STEM_FAMILY_BACKGROUND,STEM_PROFESSION,GENDER,ETHNICITY,ETHNICITY_GROUPED,RACE,RACE_GROUPED,disliked_maths_and_sciences
0,hard,yes,yes,subject,subject,yes,yes,yes,yes,yes,West,no,yes,female,White,White,White,White,yes
8,not useful,yes,yes,teaching,subject,yes,no,no,no,no,Northeast,no,no,male,Black,Non-White,Black,Non-White,yes
23,boring,yes,yes,subject,teaching,yes,no,no,yes,no,West,no,no,male,Asian,Non-White,Asian,Non-White,yes
30,hard,yes,no,teaching,subject,no,no,no,no,no,South,yes,yes,female,White,White,White,White,no
34,hard,yes,yes,teaching,subject,no,yes,yes,yes,yes,South,no,yes,female,White,White,White,White,yes


In [32]:
question3_df.shape

(945, 19)

In [33]:
# Export to CSV file
question3_df.to_csv('question_3.csv', index=False)

In [34]:
# Set a new df with the total amount of responses for every reason to dislike Sciences
question3_reasons_dislike_sciences_hard =  question3_df['sciences_hard'][question3_df['sciences_hard'] == 'yes'].value_counts()
question3_reasons_dislike_sciences_not_belonged =  question3_df['sciences_not_belonged'][question3_df['sciences_not_belonged'] == 'yes'].value_counts()
question3_reasons_dislike_sciences_useless =  question3_df['sciences_useless'][question3_df['sciences_useless'] == 'yes'].value_counts()
question3_reasons_dislike_labs_and_practice =  question3_df['sciences_disliked_labs_and_practice'][question3_df['sciences_disliked_labs_and_practice'] == 'yes'].value_counts()
question3_reasons_dislike_sciences_not_support =  question3_df['sciences_not_support_outside_school'][question3_df['sciences_not_support_outside_school'] == 'yes'].value_counts()


question3_sciences_dislike_reasons = pd.concat([question3_reasons_dislike_sciences_hard,
                                                question3_reasons_dislike_sciences_not_belonged,
                                                question3_reasons_dislike_sciences_useless,
                                                question3_reasons_dislike_labs_and_practice,
                                                question3_reasons_dislike_sciences_not_support],
                                                axis=1,
                                                keys=['I found Sciences hard',
                                                      'I felt I did not belong in Sciences',
                                                      'I thought Sciences were useless',
                                                      'I disliked Sciences labs and hands-on learning', 
                                                      'I had no support for Sciences outside school']).transpose().reset_index()

question3_sciences_dislike_reasons = question3_sciences_dislike_reasons.rename(columns={'index': 'reason', 'yes': 'total'})
question3_sciences_dislike_reasons.head()

Unnamed: 0,reason,total
0,I found Sciences hard,439
1,I felt I did not belong in Sciences,196
2,I thought Sciences were useless,321
3,I disliked Sciences labs and hands-on learning,174
4,I had no support for Sciences outside school,187


In [35]:
# Export to CSV file
question3_sciences_dislike_reasons.to_csv('question_3_dislike_science_reasons.csv', index=False)

In [36]:
# Create the new DataFrame from the selected columns
question3_sciences_dislike_reasons_by_gender_and_ethnicity = question3_df.loc[:, ['GENDER', 'ETHNICITY',
                                                                                  'sciences_hard', 'sciences_not_belonged',
                                                                                  'sciences_useless', 'sciences_disliked_labs_and_practice',
                                                                                  'sciences_not_support_outside_school']]


# Select the desired columns to group by 'GENDER' and 'ETHNICITY'
selected_columns = ['sciences_hard', 'sciences_not_belonged', 'sciences_useless',
                    'sciences_disliked_labs_and_practice', 'sciences_not_support_outside_school']

In [37]:
# Select the desired columns and group by 'GENDER'
question3_sciences_dislike_reasons_by_gender = question3_sciences_dislike_reasons_by_gender_and_ethnicity.groupby(
                            'GENDER')[selected_columns].apply(lambda x: (x == 'yes').sum()).reset_index()
# Reshape the df
question3_sciences_dislike_reasons_by_gender_transposed = question3_sciences_dislike_reasons_by_gender.transpose()
question3_sciences_dislike_reasons_by_gender_transposed.columns = question3_sciences_dislike_reasons_by_gender_transposed.iloc[0]
question3_sciences_dislike_reasons_by_gender_transposed = question3_sciences_dislike_reasons_by_gender_transposed[1:]
question3_sciences_dislike_reasons_by_gender_transposed = question3_sciences_dislike_reasons_by_gender_transposed.reset_index().rename_axis('', axis="columns")
question3_sciences_dislike_reasons_by_gender_transposed = question3_sciences_dislike_reasons_by_gender_transposed.rename(
                                                                columns={question3_sciences_dislike_reasons_by_gender_transposed.columns[0]: 'Reason to dislike Sciences'})
question3_sciences_dislike_reasons_by_gender_reshaped = pd.melt(question3_sciences_dislike_reasons_by_gender_transposed,
                                                                                id_vars='Reason to dislike Sciences',
                                                                                var_name = 'Gender',
                                                                                value_name = 'Count')
question3_sciences_dislike_reasons_by_gender_reshaped.head(10)

Unnamed: 0,Reason to dislike Sciences,Gender,Count
0,sciences_hard,female,285
1,sciences_not_belonged,female,119
2,sciences_useless,female,202
3,sciences_disliked_labs_and_practice,female,114
4,sciences_not_support_outside_school,female,117
5,sciences_hard,male,154
6,sciences_not_belonged,male,77
7,sciences_useless,male,119
8,sciences_disliked_labs_and_practice,male,60
9,sciences_not_support_outside_school,male,70


In [38]:
# Export to CSV file
question3_sciences_dislike_reasons_by_gender_reshaped.to_csv('question_3_sciences_dislike_reasons_by_gender_reshaped.csv', index=False)

In [39]:
# Export to CSV file
question3_sciences_dislike_reasons_by_gender.to_csv('question_3_dislike_science_reasons_by_gender.csv', index=False)

In [40]:
# Select the desired columns and group by 'ETHNICITY'
question3_sciences_dislike_reasons_by_ethnicity = question3_sciences_dislike_reasons_by_gender_and_ethnicity.groupby(
                            'ETHNICITY')[selected_columns].apply(lambda x: (x == 'yes').sum()).reset_index()
# Reshape the df
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity.transpose()
question3_sciences_dislike_reasons_by_ethnicity_transposed.columns = question3_sciences_dislike_reasons_by_ethnicity_transposed.iloc[0]
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity_transposed[1:]
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity_transposed.reset_index().rename_axis('', axis="columns")
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity_transposed.rename(
                                                                columns={question3_sciences_dislike_reasons_by_ethnicity_transposed.columns[0]: 'Reason to dislike Sciences'})
question3_sciences_dislike_reasons_by_ethnicity_reshaped = pd.melt(question3_sciences_dislike_reasons_by_ethnicity_transposed,
                                                                                id_vars='Reason to dislike Sciences',
                                                                                var_name = 'Ethnicity',
                                                                                value_name = 'Count')
question3_sciences_dislike_reasons_by_ethnicity_reshaped.head(10)

Unnamed: 0,Reason to dislike Sciences,Ethnicity,Count
0,sciences_hard,Asian,13
1,sciences_not_belonged,Asian,5
2,sciences_useless,Asian,12
3,sciences_disliked_labs_and_practice,Asian,4
4,sciences_not_support_outside_school,Asian,8
5,sciences_hard,Black,41
6,sciences_not_belonged,Black,12
7,sciences_useless,Black,20
8,sciences_disliked_labs_and_practice,Black,16
9,sciences_not_support_outside_school,Black,17


In [41]:
# Export to CSV file
question3_sciences_dislike_reasons_by_ethnicity_reshaped.to_csv('question_3_sciences_dislike_reasons_by_ethnicity_reshaped.csv', index=False)

In [42]:
question3_sciences_dislike_reasons_by_ethnicity = question3_sciences_dislike_reasons_by_gender_and_ethnicity.groupby(
                            'ETHNICITY')[selected_columns].apply(lambda x: (x == 'yes').sum()).reset_index()
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity.transpose()
question3_sciences_dislike_reasons_by_ethnicity_transposed.columns = question3_sciences_dislike_reasons_by_ethnicity_transposed.iloc[0]
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity_transposed[1:]
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity_transposed.reset_index().rename_axis('', axis="columns")
question3_sciences_dislike_reasons_by_ethnicity_transposed = question3_sciences_dislike_reasons_by_ethnicity_transposed.rename(
                                                                columns={question3_sciences_dislike_reasons_by_ethnicity_transposed.columns[0]: 'Reason to dislike Sciences'})
question3_sciences_dislike_reasons_by_ethnicity_reshaped = pd.melt(question3_sciences_dislike_reasons_by_ethnicity_transposed,
                                                                                id_vars='Reason to dislike Sciences',
                                                                                var_name = 'Ethnicity',
                                                                                value_name = 'Count')
question3_sciences_dislike_reasons_by_ethnicity_reshaped.head(10)

Unnamed: 0,Reason to dislike Sciences,Ethnicity,Count
0,sciences_hard,Asian,13
1,sciences_not_belonged,Asian,5
2,sciences_useless,Asian,12
3,sciences_disliked_labs_and_practice,Asian,4
4,sciences_not_support_outside_school,Asian,8
5,sciences_hard,Black,41
6,sciences_not_belonged,Black,12
7,sciences_useless,Black,20
8,sciences_disliked_labs_and_practice,Black,16
9,sciences_not_support_outside_school,Black,17


### Dataframe question 4
Quina és la relació entre el gènere, el grup ètnic i les condicions laborals dels professionals STEM? Amb l'objectiu d'esbrinar si, entre aquells que es decanten per aquestes professions, el seu gènere i etnicitat tenen algun impacte, tant en el seu salari o la valoració rebuda al lloc de treball, com en el grau de satisfacció laboral dels individus.

In [43]:
# Generate a new dataframe with the appropriate questions from the original one
question4_df = original_df[['PPCM0166',
                            'TALENT', 'PROVE', 'RESPECTA', 'RESPECTB', 'AHEADf', 
                            'ETHN1',
                            'REGION',
                            'INCOME',  
                            'STEM_PROFESSION',
                            'GENDER',
                            'ETHNICITY', 'ETHNICITY_GROUPED', 'RACE', 'RACE_GROUPED']]

In [44]:
# Filter non-defined values
question4_df = question4_df[~(question4_df == 9.0).any(axis=1)]
question4_df = question4_df.dropna()

In [45]:
# Rename columns
question4_df.rename(columns={'PPCM0166': 'workplace_size'}, inplace=True)
question4_df.rename(columns={'TALENT': 'importance_of_talent'}, inplace=True)
question4_df.rename(columns={'PROVE': 'need_to_prove_oneself_to_be_respected'}, inplace=True)
question4_df.rename(columns={'RESPECTA': 'contributions_valued_by_supervisor'}, inplace=True)
question4_df.rename(columns={'RESPECTB': 'contributions_valued_by_co-workers'}, inplace=True)
question4_df.rename(columns={'AHEADf': 'working_harder_gets_oneself_ahead_at_workplace'}, inplace=True)
question4_df.rename(columns={'ETHN1': 'majority_ethnicity_at_workplace'}, inplace=True)

In [46]:
# Create new column with inverted meaning
question4_df['respected_without_needing_to_prove_oneself'] = question4_df['need_to_prove_oneself_to_be_respected']

In [47]:
# Replace values
question4_df['workplace_size'] = question4_df['workplace_size'].replace({1.: '<10', 2.: '10-24', 3.: '25-49', 4.: '50-99',
                                                                         5.: '100-499', 6.: '500-999', 7.: '>= 1000'})
question4_df['majority_ethnicity_at_workplace'] = question4_df['majority_ethnicity_at_workplace'].replace({1.: 'same as mine',
                                                                                                           2.: 'different than mine',
                                                                                                           3.: 'mixed ethnicity'})
question4_df['importance_of_talent'] = question4_df['importance_of_talent'].replace({1.: 4., 2.: 3.,
                                                                                     3.: 2., 4.:1.})
question4_df[['contributions_valued_by_supervisor',
              'contributions_valued_by_co-workers']] =  question4_df[['contributions_valued_by_supervisor',
                                                                      'contributions_valued_by_co-workers']].replace({1.: 4.,
                                                                                                                      2.: 3.,
                                                                                                                      3.: 2.,
                                                                                                                      4.: 1.})
question4_df['working_harder_gets_oneself_ahead_at_workplace'] = question4_df['working_harder_gets_oneself_ahead_at_workplace'].replace({1.: 3.,
                                                                                                                                         2.: 1.,
                                                                                                                                         3.: 2.})

In [48]:
# Filter those with workplace size under 10 or undefined
question4_df = question4_df[(question4_df['workplace_size'] != 8.) & (question4_df['workplace_size'] != '<10')]

In [49]:
question_4_numeric_values = question4_df.copy()
question_4_numeric_values.head()

Unnamed: 0,workplace_size,importance_of_talent,need_to_prove_oneself_to_be_respected,contributions_valued_by_supervisor,contributions_valued_by_co-workers,working_harder_gets_oneself_ahead_at_workplace,majority_ethnicity_at_workplace,REGION,INCOME,STEM_PROFESSION,GENDER,ETHNICITY,ETHNICITY_GROUPED,RACE,RACE_GROUPED,respected_without_needing_to_prove_oneself
0,>= 1000,4.0,4.0,3.0,3.0,3.0,mixed ethnicity,West,"75,000-99,999",yes,female,White,White,White,White,4.0
1,>= 1000,3.0,3.0,3.0,3.0,3.0,mixed ethnicity,West,">=100,000",yes,female,White,White,White,White,3.0
3,>= 1000,4.0,3.0,4.0,4.0,3.0,mixed ethnicity,South,">=100,000",yes,female,White,White,White,White,3.0
6,>= 1000,4.0,4.0,3.0,4.0,3.0,mixed ethnicity,Midwest,">=100,000",yes,male,White,White,White,White,4.0
7,>= 1000,2.0,3.0,3.0,3.0,3.0,same as mine,West,"75,000-99,999",yes,male,White,White,White,White,3.0


In [50]:
# Export to CSV file
question_4_numeric_values.to_csv('question_4_numeric_values.csv', index=False)

In [51]:

question4_df['importance_of_talent'] = question4_df['importance_of_talent'].replace({4.: 'very important', 3.: 'somewhat important',
                                                                                     2.: 'not too important', 1.: 'not important'})
question4_df['respected_without_needing_to_prove_oneself'] = question4_df[
             'respected_without_needing_to_prove_oneself'].replace({1.: 'never', 2.: 'not too often',
                                                                    3.: 'some of the time', 4.: 'all the time'})
question4_df[['contributions_valued_by_supervisor',
              'contributions_valued_by_co-workers']] =  question4_df[['contributions_valued_by_supervisor',
                                                                      'contributions_valued_by_co-workers']].replace({1.: 'not at all',
                                                                                                                      2.: 'not too much',
                                                                                                                      3.: 'some',
                                                                                                                      4.: 'a lot'})
question4_df['working_harder_gets_oneself_ahead_at_workplace'] = question4_df['working_harder_gets_oneself_ahead_at_workplace'].replace({1.: 'hurts',
                                                                                                                                         2.: 'no difference',
                                                                                                                                         3.: 'helps'})



In [52]:
question4_df.head()

Unnamed: 0,workplace_size,importance_of_talent,need_to_prove_oneself_to_be_respected,contributions_valued_by_supervisor,contributions_valued_by_co-workers,working_harder_gets_oneself_ahead_at_workplace,majority_ethnicity_at_workplace,REGION,INCOME,STEM_PROFESSION,GENDER,ETHNICITY,ETHNICITY_GROUPED,RACE,RACE_GROUPED,respected_without_needing_to_prove_oneself
0,>= 1000,very important,4.0,some,some,helps,mixed ethnicity,West,"75,000-99,999",yes,female,White,White,White,White,all the time
1,>= 1000,somewhat important,3.0,some,some,helps,mixed ethnicity,West,">=100,000",yes,female,White,White,White,White,some of the time
3,>= 1000,very important,3.0,a lot,a lot,helps,mixed ethnicity,South,">=100,000",yes,female,White,White,White,White,some of the time
6,>= 1000,very important,4.0,some,a lot,helps,mixed ethnicity,Midwest,">=100,000",yes,male,White,White,White,White,all the time
7,>= 1000,not too important,3.0,some,some,helps,same as mine,West,"75,000-99,999",yes,male,White,White,White,White,some of the time


In [53]:
question4_df.shape

(3627, 16)

In [54]:
# Export to CSV file
question4_df.to_csv('question_4.csv', index=False)

In [55]:
# Get measure of contributions_valued_by_supervisor and contributions_valued_by_co-workers
supervisor_counts = question_4_numeric_values.groupby('ETHNICITY')['contributions_valued_by_supervisor'].value_counts()
coworkers_counts = question_4_numeric_values.groupby('ETHNICITY')['contributions_valued_by_co-workers'].value_counts()
question4_df_contribution_measures = pd.concat([supervisor_counts, coworkers_counts], axis=1, keys=['contributions_valued_by_supervisor',
                                                                                       'contributions_valued_by_co-workers'])
question4_df_contribution_measures = question4_df_contribution_measures.reset_index()
question4_df_contribution_measures = question4_df_contribution_measures.rename(columns={'level_1': 'score'})
question4_df_contribution_measures.head(10)


Unnamed: 0,ETHNICITY,score,contributions_valued_by_supervisor,contributions_valued_by_co-workers
0,Asian,4.0,89,79
1,Asian,3.0,86,99
2,Asian,2.0,19,18
3,Asian,1.0,7,5
4,Black,4.0,114,100
5,Black,3.0,107,121
6,Black,2.0,24,27
7,Black,1.0,17,14
8,Hispanic,4.0,171,146
9,Hispanic,3.0,143,162


In [56]:
# Turn into %
total_supervisor_counts = question4_df_contribution_measures.groupby(['ETHNICITY', 'score'])['contributions_valued_by_supervisor'].sum()
total_coworkers_counts = question4_df_contribution_measures.groupby(['ETHNICITY', 'score'])['contributions_valued_by_co-workers'].sum()
total_counts = question4_df_contribution_measures.groupby(['ETHNICITY'])['contributions_valued_by_co-workers'].sum()
supervisor_percentage = np.round(total_supervisor_counts / total_counts, 3)
coworkers_percentage = np.round(total_coworkers_counts / total_counts, 3)
question4_df_contribution_measures_percents = pd.DataFrame({
    'ETHNICITY': supervisor_percentage.index.get_level_values('ETHNICITY'),
    'score': supervisor_percentage.index.get_level_values('score'),
    'contributions_valued_by_supervisor_percent': supervisor_percentage.values,
    'contributions_valued_by_co-workers_percent': coworkers_percentage.values
})

question4_df_contribution_measures = pd.merge(question4_df_contribution_measures,
                                              question4_df_contribution_measures_percents,
                                              on=['ETHNICITY', 'score'])

question4_df_contribution_measures.head(10)


Unnamed: 0,ETHNICITY,score,contributions_valued_by_supervisor,contributions_valued_by_co-workers,contributions_valued_by_supervisor_percent,contributions_valued_by_co-workers_percent
0,Asian,4.0,89,79,0.443,0.393
1,Asian,3.0,86,99,0.428,0.493
2,Asian,2.0,19,18,0.095,0.09
3,Asian,1.0,7,5,0.035,0.025
4,Black,4.0,114,100,0.435,0.382
5,Black,3.0,107,121,0.408,0.462
6,Black,2.0,24,27,0.092,0.103
7,Black,1.0,17,14,0.065,0.053
8,Hispanic,4.0,171,146,0.441,0.376
9,Hispanic,3.0,143,162,0.369,0.418


In [57]:
# Export to CSV file
question4_df_contribution_measures.to_csv('question_4_contribution_measures.csv', index=False)

### Dataframe question 5
Com afecten la pertinença a un col·lectiu minoritzat i l'afiliació política a la sensibilització envers la manca de diversitat i la discriminació en el sector? Per tractar de determinar si el conjunt de la ciutadania, pertanyen o no a col·lectius minoritzats, comparteix impressions i l'objectiu de posar fi a la infrarepresentació, o si pel contrari la ideologia política i/o la pertinença a col·lectius privilegiats tenen un efecte determinant sobre la preocupació i el compromís envers aquests temes.

In [58]:
# Generate a new dataframe with the appropriate questions from the original one
question5_df = original_df_with_defined_political_affiliation[['REASON1a', 'REASON1d', 'REASON1f', 'REASON1g',
                                                               'REASON2a', 'REASON2d', 'REASON2f', 'REASON2g',
                                                               'TECH3', 'TECH6',
                                                               'GEND5', 'ETHN5',
                                                               'GENDJOB1', 'ETHNJOB1', 'GENDDISC_i', 'ETHNDISC_i',
                                                               'REGION',
                                                               'STEM_PROFESSION',
                                                               'GENDER', 
                                                               'ETHNICITY', 'ETHNICITY_GROUPED', 'ETHNICITY_GROUPED_BLACK-HISPANICS',
                                                               'RACE', 'RACE_GROUPED',
                                                               'IDEOLOGY', 'CONSERVATIVE-LIBERAL', 'PARTY']]

In [59]:
# Filter non-defined values
question5_df = question5_df[~(question5_df == 9.0).any(axis=1)]
question5_df = question5_df.dropna()

In [60]:
# Rename columns
question5_df.rename(columns={'REASON1a': 'woman_not_encouraged_to_STEM'}, inplace=True)
question5_df.rename(columns={'REASON1d': 'woman_face_discrimination_hiring_in_STEM'}, inplace=True)
question5_df.rename(columns={'REASON1f': 'woman_less_interested_in_STEM'}, inplace=True)
question5_df.rename(columns={'REASON1g': 'woman_poor_family_conciliation_in_STEM'}, inplace=True)

question5_df.rename(columns={'REASON2a': 'black_and_hispanics_not_encouraged_to_STEM'}, inplace=True)
question5_df.rename(columns={'REASON2d': 'black_and_hispanics_face_discrimination_hiring_in_STEM'}, inplace=True)
question5_df.rename(columns={'REASON2f': 'black_and_hispanics_less_interested_in_STEM'}, inplace=True)
question5_df.rename(columns={'REASON2g': 'black_and_hispanics_poor_access_to_quality_education'}, inplace=True)

question5_df.rename(columns={'TECH3': 'women_discrimination_size_problem_in_STEM'}, inplace=True)
question5_df.rename(columns={'TECH6': 'black_and_hispanics_discrimination_size_problem_in_STEM'}, inplace=True)

question5_df.rename(columns={'GEND5': 'importance_of_gender_diversity'}, inplace=True)
question5_df.rename(columns={'ETHN5': 'importance_of_ethnic_diversity'}, inplace=True)

question5_df.rename(columns={'GENDJOB1': 'influence_of_gender_in_own_personal_success'}, inplace=True)
question5_df.rename(columns={'ETHNJOB1': 'influence_of_ethnicity_in_own_personal_success'}, inplace=True)
question5_df.rename(columns={'GENDDISC_i': 'experienced_gender_discrimination'}, inplace=True)
question5_df.rename(columns={'ETHNDISC_i': 'experienced_race_discrimination'}, inplace=True)

In [61]:
# Replace values
question5_df[['experienced_gender_discrimination',
              'experienced_race_discrimination']] =  question5_df[['experienced_gender_discrimination',
                                                                    'experienced_race_discrimination']].replace({0.: 'yes', 1.: 'no'})
question5_df[['influence_of_gender_in_own_personal_success',
              'influence_of_ethnicity_in_own_personal_success']] =  question5_df[['influence_of_gender_in_own_personal_success',
                                                                                  'influence_of_ethnicity_in_own_personal_success'
                                                                                   ]].replace({1.: 'influence',
                                                                                               2.: 'influence',
                                                                                               3.: 'no influence'})

In [62]:
# Make a copy for numerical values
question5_df_numeric_values = question5_df.copy()

In [63]:
# Replace values
question5_df_numeric_values[['woman_not_encouraged_to_STEM',
              'woman_face_discrimination_hiring_in_STEM',
              'woman_less_interested_in_STEM',
              'woman_poor_family_conciliation_in_STEM',
              'black_and_hispanics_not_encouraged_to_STEM',
              'black_and_hispanics_face_discrimination_hiring_in_STEM',
              'black_and_hispanics_less_interested_in_STEM',
              'black_and_hispanics_poor_access_to_quality_education']] = question5_df_numeric_values[['woman_not_encouraged_to_STEM',
                                                                            'woman_face_discrimination_hiring_in_STEM',
                                                                            'woman_less_interested_in_STEM',
                                                                            'woman_poor_family_conciliation_in_STEM',
                                                                            'black_and_hispanics_not_encouraged_to_STEM',
                                                                            'black_and_hispanics_face_discrimination_hiring_in_STEM',
                                                                            'black_and_hispanics_less_interested_in_STEM',
                                                                            'black_and_hispanics_poor_access_to_quality_education']].replace({1.: 3.,
                                                                                                                                              2.: 2.,
                                                                                                                                              3.: 1.})
question5_df_numeric_values[['importance_of_gender_diversity',
              'importance_of_ethnic_diversity']] =  question5_df_numeric_values[['importance_of_gender_diversity',
                                                                                 'importance_of_ethnic_diversity']].replace({1.: 5.,
                                                                                                                             2.: 4.,
                                                                                                                             3.: 3.,
                                                                                                                             4.: 2.,
                                                                                                                             5.: 1.})
question5_df_numeric_values[['women_discrimination_size_problem_in_STEM',
              'black_and_hispanics_discrimination_size_problem_in_STEM']] =  question5_df_numeric_values[['women_discrimination_size_problem_in_STEM',
                                                                                           'black_and_hispanics_discrimination_size_problem_in_STEM'
                                                                                            ]].replace({1.: 'major problem',
                                                                                                        2.: 'minor problem',
                                                                                                        3.: 'no problem'})

In [64]:
# Normalize values over 10
question5_df_numeric_values['CONSERVATIVE-LIBERAL'] = question5_df_numeric_values['CONSERVATIVE-LIBERAL']/5 * 10
question5_df_numeric_values[['woman_not_encouraged_to_STEM',
              'woman_face_discrimination_hiring_in_STEM',
              'woman_less_interested_in_STEM',
              'woman_poor_family_conciliation_in_STEM',
              'black_and_hispanics_not_encouraged_to_STEM',
              'black_and_hispanics_face_discrimination_hiring_in_STEM',
              'black_and_hispanics_less_interested_in_STEM',
              'black_and_hispanics_poor_access_to_quality_education']] = question5_df_numeric_values[['woman_not_encouraged_to_STEM',
                                                                            'woman_face_discrimination_hiring_in_STEM',
                                                                            'woman_less_interested_in_STEM',
                                                                            'woman_poor_family_conciliation_in_STEM',
                                                                            'black_and_hispanics_not_encouraged_to_STEM',
                                                                            'black_and_hispanics_face_discrimination_hiring_in_STEM',
                                                                            'black_and_hispanics_less_interested_in_STEM',
                                                                            'black_and_hispanics_poor_access_to_quality_education']]/3 * 10
question5_df_numeric_values[['importance_of_gender_diversity',
                             'importance_of_ethnic_diversity']] = question5_df_numeric_values[[
                                                                            'importance_of_gender_diversity',
                                                                            'importance_of_ethnic_diversity']]/5 * 10
question5_df[['experienced_gender_discrimination',
              'experienced_race_discrimination']] =  question5_df[['experienced_gender_discrimination',
                                                                    'experienced_race_discrimination']].replace({0.: 'yes', 1.: 'no'})

In [65]:
question5_df_numeric_values.head()

Unnamed: 0,woman_not_encouraged_to_STEM,woman_face_discrimination_hiring_in_STEM,woman_less_interested_in_STEM,woman_poor_family_conciliation_in_STEM,black_and_hispanics_not_encouraged_to_STEM,black_and_hispanics_face_discrimination_hiring_in_STEM,black_and_hispanics_less_interested_in_STEM,black_and_hispanics_poor_access_to_quality_education,women_discrimination_size_problem_in_STEM,black_and_hispanics_discrimination_size_problem_in_STEM,...,STEM_PROFESSION,GENDER,ETHNICITY,ETHNICITY_GROUPED,ETHNICITY_GROUPED_BLACK-HISPANICS,RACE,RACE_GROUPED,IDEOLOGY,CONSERVATIVE-LIBERAL,PARTY
0,10.0,10.0,3.333333,10.0,10.0,10.0,3.333333,10.0,major problem,major problem,...,yes,female,White,White,Black/Hispanic,White,White,Very liberal,10.0,Democrat
1,6.666667,6.666667,3.333333,10.0,10.0,10.0,3.333333,10.0,major problem,minor problem,...,yes,female,White,White,Black/Hispanic,White,White,Liberal,8.0,Democrat
2,6.666667,6.666667,3.333333,10.0,6.666667,10.0,3.333333,10.0,major problem,major problem,...,no,female,Asian,Non-White,Black/Hispanic,Asian,Non-White,Liberal,8.0,Democrat
3,6.666667,6.666667,3.333333,10.0,10.0,6.666667,6.666667,10.0,major problem,minor problem,...,yes,female,White,White,Black/Hispanic,White,White,Liberal,8.0,Democrat
4,10.0,10.0,10.0,10.0,10.0,10.0,3.333333,10.0,minor problem,minor problem,...,no,male,Asian,Non-White,Black/Hispanic,Asian,Non-White,Liberal,8.0,Democrat


In [66]:
question5_df_numeric_values.to_csv('question_5_numeric_values.csv', index=False)

In [67]:
# Replace values
question5_df[['woman_not_encouraged_to_STEM',
              'woman_face_discrimination_hiring_in_STEM',
              'woman_less_interested_in_STEM',
              'woman_poor_family_conciliation_in_STEM',
              'women_discrimination_size_problem_in_STEM',
              'black_and_hispanics_not_encouraged_to_STEM',
              'black_and_hispanics_face_discrimination_hiring_in_STEM',
              'black_and_hispanics_less_interested_in_STEM',
              'black_and_hispanics_poor_access_to_quality_education',
              'black_and_hispanics_discrimination_size_problem_in_STEM']] = question5_df[['woman_not_encouraged_to_STEM',
                                                                            'woman_face_discrimination_hiring_in_STEM',
                                                                            'woman_less_interested_in_STEM',
                                                                            'woman_poor_family_conciliation_in_STEM',
                                                                            'women_discrimination_size_problem_in_STEM',
                                                                            'black_and_hispanics_not_encouraged_to_STEM',
                                                                            'black_and_hispanics_face_discrimination_hiring_in_STEM',
                                                                            'black_and_hispanics_less_interested_in_STEM',
                                                                            'black_and_hispanics_poor_access_to_quality_education',
                                                                            'black_and_hispanics_discrimination_size_problem_in_STEM']].replace({1.: 'major problem',
                                                                                                                                                 2.: 'minor problem',
                                                                                                                                                 3.: 'no problem'})
question5_df[['importance_of_gender_diversity',
              'importance_of_ethnic_diversity']] =  question5_df[['importance_of_gender_diversity',
                                                                  'importance_of_ethnic_diversity']].replace({1.: 'extremely important',
                                                                                                              2.: 'very important',
                                                                                                              3.: 'somewhat important',
                                                                                                              4.: 'not too important',
                                                                                                              5.: 'not important'})
question5_df[['experienced_gender_discrimination',
              'experienced_race_discrimination']] =  question5_df[['experienced_gender_discrimination',
                                                                    'experienced_race_discrimination']].replace({0.: 'yes', 1.: 'no'})

In [68]:
question5_df.head()

Unnamed: 0,woman_not_encouraged_to_STEM,woman_face_discrimination_hiring_in_STEM,woman_less_interested_in_STEM,woman_poor_family_conciliation_in_STEM,black_and_hispanics_not_encouraged_to_STEM,black_and_hispanics_face_discrimination_hiring_in_STEM,black_and_hispanics_less_interested_in_STEM,black_and_hispanics_poor_access_to_quality_education,women_discrimination_size_problem_in_STEM,black_and_hispanics_discrimination_size_problem_in_STEM,...,STEM_PROFESSION,GENDER,ETHNICITY,ETHNICITY_GROUPED,ETHNICITY_GROUPED_BLACK-HISPANICS,RACE,RACE_GROUPED,IDEOLOGY,CONSERVATIVE-LIBERAL,PARTY
0,major problem,major problem,no problem,major problem,major problem,major problem,no problem,major problem,major problem,major problem,...,yes,female,White,White,Black/Hispanic,White,White,Very liberal,5.0,Democrat
1,minor problem,minor problem,no problem,major problem,major problem,major problem,no problem,major problem,major problem,minor problem,...,yes,female,White,White,Black/Hispanic,White,White,Liberal,4.0,Democrat
2,minor problem,minor problem,no problem,major problem,minor problem,major problem,no problem,major problem,major problem,major problem,...,no,female,Asian,Non-White,Black/Hispanic,Asian,Non-White,Liberal,4.0,Democrat
3,minor problem,minor problem,no problem,major problem,major problem,minor problem,minor problem,major problem,major problem,minor problem,...,yes,female,White,White,Black/Hispanic,White,White,Liberal,4.0,Democrat
4,major problem,major problem,major problem,major problem,major problem,major problem,no problem,major problem,minor problem,minor problem,...,no,male,Asian,Non-White,Black/Hispanic,Asian,Non-White,Liberal,4.0,Democrat


In [69]:
question5_df.shape

(3990, 27)

In [70]:
# Export to CSV file
question5_df.to_csv('question_5.csv', index=False)