In [1]:
# Libraries used
import pandas as pd
import numpy as np
import pyreadr

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the RData file that holds the survey data
rdata_path = '../01_data/minorities_discrimination_survey.RData'
data = pyreadr.read_r(rdata_path)

# Show the names of the objects stored in the file
print(data.keys())

odict_keys(['min_dis'])


In [3]:
# The survey data frame is stored in the RData file under the key "min_dis"
df_full = data["min_dis"]

# Preview the first 5 rows of the dataset
df_full.head(n=5)

Unnamed: 0,za_nr,version,doi,country,DEGURBA,ALTURBA,C1,C2,C3,typint,...,dis5_4_other,dis12overall10,dis5overall9,redisOverall,res_stat,sec_res,SI03_2_H_stat,SI03_3_H,pweightadj,hweightadj
0,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,1.0,1.0,,3.0,1.0,0.0,15.0,0.075758,0.115287
1,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,15.0,,0.115287
2,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,1.0,1.0,,5.0,2.0,0.0,16.0,0.075758,0.057643
3,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,16.0,,0.057643
4,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,16.0,,0.057643


In [4]:
# Keep only respondents who are looking for work now (EU05 == 1)
# or who looked for work in the past 5 years (EU07 == 1)
is_job_seeker = df_full['EU05'].eq(1) | df_full['EU07'].eq(1)
df_job_seekers = df_full.loc[is_job_seeker]

df_job_seekers.head()

Unnamed: 0,za_nr,version,doi,country,DEGURBA,ALTURBA,C1,C2,C3,typint,...,dis5_4_other,dis12overall10,dis5overall9,redisOverall,res_stat,sec_res,SI03_2_H_stat,SI03_3_H,pweightadj,hweightadj
0,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,1.0,1.0,,3.0,1.0,0.0,15.0,0.075758,0.115287
12,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,0.0,1.0,,5.0,2.0,0.0,19.0,0.075758,0.057643
16,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,1.0,1.0,,5.0,2.0,0.0,17.0,0.075758,0.230573
17,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,1.0,1.0,,3.0,1.0,0.0,14.0,0.075758,0.046115
22,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,1.0,1.0,,5.0,2.0,0.0,10.0,0.075758,0.230573


In [5]:
# Number of missing values in each column
NA_count = df_job_seekers.isna().sum()

# Uncomment to make pandas display every row of the result below
# pd.set_option('display.max_rows', None)

# Share of missing values in each column (missing count / number of rows)
NA_count / len(df_job_seekers)

za_nr            0.000000
version          0.000000
doi              0.000000
country          0.000000
DEGURBA          0.000000
                   ...   
sec_res          0.297404
SI03_2_H_stat    0.000000
SI03_3_H         0.000000
pweightadj       0.000000
hweightadj       0.000000
Length: 1283, dtype: float64

In [6]:
# Compare the number of rows before and after restricting to job seekers
print("Number of observations before filtering:", len(df_full))
print("Number of observations after filtering for job seekers:", len(df_job_seekers))

Number of observations before filtering: 77656
Number of observations after filtering for job seekers: 13406


In [7]:
# Peek at SI01_04 (income from education allowance) for the first job seekers
df_job_seekers.loc[:, 'SI01_04'].head(n=5)

0     2.0
12    1.0
16    2.0
17    2.0
22    2.0
Name: SI01_04, dtype: float64

In [8]:
# Manually select variables of interest.
# The list holds the survey column names to keep, grouped by intended use:
# label candidates first, then features used as-is, then features that are
# recoded into discrete/binary variables. The trailing comment on each entry
# is the survey question / description of that column.

variables_of_interest = [

    # To be used for LABEL
    'EUD01_01', ### Discriminated when looking for work in the past 5 years: Skin colour
    'EUD01_02', ### Discriminated when looking for work in the past 5 years: [ethnic origin or immigrant background / ethnic origin (tailored to target group)]
    'EUD01_03', ### Discriminated when looking for work in the past 5 years: Religion or religious beliefs
    'EUD01_04', ### Discriminated when looking for work in the past 5 years: Age (such as being too young or too old)
    'EUD01_05', ### Discriminated when looking for work in the past 5 years: Sex/gender (such as being a man or a woman)
    'EUD01_06', ### Discriminated when looking for work in the past 5 years: Disability
    'EUD01_07', ### Discriminated when looking for work in the past 5 years: Sexual orientation (such as being gay, lesbian or bisexual)
    'EUD01_08', ### Discriminated when looking for work in the past 5 years: Other (please specify)
    'EUD01_09', ### Discriminated when looking for work in the past 5 years: I haven't felt discriminated against on any ground when looking for work in the past 5 years
    'EUD01_96', ### Discriminated when looking for work in the past 5 years: Refused
    'EUD01_97', ### Discriminated when looking for work in the past 5 years: Doesn't understand the question
    'EUD01_99', ### Discriminated when looking for work in the past 5 years: Don't know


    # To be used as FEATURES DIRECTLY
    'id', # unique identification number for a respondent
    'EU05', # Are you currently looking for work?  (this doesn't need to be modified - can be removed from Leti's doc)
    'EU07', # Have you ever looked for work in the past 5 years in [COUNTRY] (or since you have been in [country])?
    'Generation', # 1st or 2nd generation migrant <------------
    'HH02', # Age
    'HH03', # Gender
    'HH04', # Current situation (Job)
    'IN02', # Target group
    'IN05', # COUNTRY OF BIRTH (immigrants and their descendants) <------------
    'SI03_3_H', # Household combined net monthly income (in income bands)  <------------


    # To be used as FEATURES AFTER TRANSFORMATION (Discrete/Binary)

    # Religion (small change to variable)
    'PB01', # What is your religion?

    # Current marital status
    'S01', # What is your current marital status?

    ## Frequency of specific experiences of harassment in country in past 5 years
    'VH01a_1', ### In the past 5 years in [COUNTRY], how many times has somebody made offensive or threatening comments to you in person such as insulting you or calling you names?
    'VH01a_2', ### In the past 5 years in [COUNTRY], how many times has somebody threatened you with violence in person?
    'VH01a_3', ### In the past 5 years in [COUNTRY], how many times has somebody made offensive gestures to you or stared at you inappropriately?
    'VH01a_4', ### In the past 5 years in [COUNTRY], how many times has somebody sent you emails or text messages (SMS) that were offensive or threatening?
    'VH01a_5', ### In the past 5 years in [COUNTRY], how many times has somebody posted offensive comments about you on the internet, for example on Facebook or Twitter?

    ## Extent of discrimination on different grounds in country
    'RA03_1', # Prevalence of discrimination on the basis of skin colour in [country]
    'RA03_2', # Prevalence of discrimination on the basis of ethnic origin or immigrant background in [country]
    'RA03_3', # Prevalence of discrimination on the basis of religion or religious beliefs in [country]

    # Ability of household to make ends meet
    'SI06', # Thinking of your household's total income, is your household able to make ends meet?

    # Housing
    'HLS02', # Do you own or rent this accommodation?

    # Urbanisation
    'DEGURBA', # degree of urbanisation

    # Health
    'DHE01', ### How is your health in general?
    'DHE02', ### Do you have any longstanding illness or health problem?
    'DHE03', ### For at least the past six months, to what extent have you been limited because of a health problem in activities people usually do?
    'DHE04', ### Does the [NATIONAL BASIC HEALTH INSURANCE SCHEME] currently cover your health care expenses?

    # Trying to rent or buy an apartment or a house in country in past 5 years
    'DHO01', ### In the past 5 years in [COUNTRY] have you ever tried to rent or buy an apartment or a house?

    # Awareness of experiences of violence among friends and family because of ethnic minority background in past 12 months
    'VV10_1', ### In the past 12 months, have you heard of anyone in your circle of family or friends being insulted or called names because of their [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background?
    'VV10_2', ### In the past 12 months, have you heard of anyone in your circle of family or friends being physically attacked because of their [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background?

    # Awareness of support organisations in country
    'RA04', # Do you know of any organisations in [COUNTRY] that offer support or advice to people who have been discriminated against - for whatever reason?

    # Interethnic relationships + neighbourhood
    'PB10_1', # Do you have friends who are of another ethnic minority background than you?
    'PB10_2', # Do you have friends who do not have a minority background?
    'PB11', # In the neighbourhood where you live, how many of the residents would you say are of the same [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background as you: all of the residents, most of them, some or none of them?

    # Social transfers + poverty
    'arop', # At risk of poverty after social transfers
    'SI01_04', # Income from education allowance (grants, stipends)  <------------
    'SI01_05', # Pensions                                            <------------
    'SI01_06', # Unemployment benefits                               <------------
    'SI01_07', # Child benefits (including alimonies)                <------------
    'SI01_08', # Other social benefits (social assistance, rent support, donations from charity) <------------

    # Reasons for coming to country (**asked only to non-EU/non-EFTA candidates**)
    'PR01_01', ### Reasons for coming to [COUNTRY]: Employment/work
    'PR01_02', ### Reasons for coming to [COUNTRY]: Family reasons/joined a partner/marriage
    'PR01_03', ### Reasons for coming to [COUNTRY]: Came with my parents when I was a child
    'PR01_04', ### Reasons for coming to [COUNTRY]: Study
    'PR01_05', ### Reasons for coming to [COUNTRY]: Seeking asylum/protection
    'PR01_06', ### Reasons for coming to [COUNTRY]: Other (please specify)
    'PR01_07', ### Reasons for coming to [COUNTRY]: I did not plan to stay in [COUNTRY]
    'PR01_96', ### Reasons for coming to [COUNTRY]: Refused
    'PR01_97', ### Reasons for coming to [COUNTRY]: Doesn't understand the question
    'PR01_99', ### Reasons for coming to [COUNTRY]: Don't know

    # Residence (needs to be imputed)
    'res_stat', ### Residence and citizenship status

    # Institutional experiences
    'DO24', ### In the past 5 years did any government official in [COUNTRY], for example a customs or police officer, a judge or an inspector, ask you or expect you to pay a bribe for his or her services?
    'DO27' ### In the past 5 years in [COUNTRY] (or since you have been in [country]) have you ever been stopped, searched or questioned by the police?
]
    

#   'HH09_1', # Highest level of education completed? RECODE
#   'EA01', ### What is your current job or occupation?


### Not to be used:
    
#    'country', # country of interview
#    'country2', # don't know what this is, but it's almost 
#    'SI02', # And which of these is the main source of income in your household? 
#    'activ', # Self-declared main activity status 
#    'Encount_police_5y', # Stopped by the police in the past 5 years


## Pending: Review the labels to include and create new variables of the composed questions

In [19]:
# Subset the dataframe with selected variables.
# .copy() makes the subset an independent frame: the cells below add *_mod
# columns to it, and assigning into a selection taken from df_job_seekers
# is the pattern pandas flags with SettingWithCopyWarning.
df_subset_job_seekers = df_job_seekers[variables_of_interest].copy()

# Perform exploratory data analysis on the subset dataframe
df_subset_job_seekers.head()

Unnamed: 0,EUD01_01,EUD01_02,EUD01_03,EUD01_04,EUD01_05,EUD01_06,EUD01_07,EUD01_08,EUD01_09,EUD01_96,...,PR01_04,PR01_05,PR01_06,PR01_07,PR01_96,PR01_97,PR01_99,res_stat,DO24,DO27
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,3.0,1.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,1.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,1.0
17,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0
22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,1.0


### Variables to be used as FEATURES AFTER TRANSFORMATION (into discrete/binary)

**Religion**   
**PB01** - What is your religion?  
**PB01_mod** - Categorical  
0 - No religion  
1 - Christians  
2 - Muslims

In [75]:
# Check for missing values in variable PB01
print("Number of missing values in PB01:", df_subset_job_seekers['PB01'].isnull().sum())
print(df_subset_job_seekers['PB01'].value_counts())  # Print frequency table

# Create new variable PB01_mod: codes 96/97/99 are folded into category 0,
# every other value (including NaN) is kept unchanged.
# NOTE(review): 96/97/99 are presumably refused / doesn't understand /
# don't know - confirm against the codebook.
# A vectorised isin/mask replaces the per-row lambda.
PB01_zero_codes = [96, 97, 99]
df_subset_job_seekers.loc[:, 'PB01_mod'] = df_subset_job_seekers['PB01'].mask(
    df_subset_job_seekers['PB01'].isin(PB01_zero_codes), 0
)
print(df_subset_job_seekers['PB01_mod'].value_counts())  # Print frequency table

Number of missing values in PB01: 0
PB01
2.0     6384
1.0     5528
0.0     1095
99.0     399
Name: count, dtype: int64
PB01_mod
2.0    6384
1.0    5528
0.0    1494
Name: count, dtype: int64


**Current marital status**  
**S01** - What is your current marital status?  
**S01_mod** - Binary  
1 - Married  
0 - Not married (every other answer code)

In [76]:
# Inspect S01: number of missing values and frequency of each answer code
s01 = df_subset_job_seekers['S01']
print("Number of missing values in S01:", s01.isna().sum())
print(s01.value_counts())  # Print frequency table

# S01_mod is 1 where S01 equals 1 and 0 for every other value
is_code_one = s01.eq(1)
df_subset_job_seekers.loc[:, 'S01_mod'] = np.where(is_code_one, 1, 0)
print(df_subset_job_seekers['S01_mod'].value_counts())  # Print frequency table

Number of missing values in S01: 0
S01
1.0     6728
5.0     5124
3.0      792
2.0      392
4.0      264
96.0      91
99.0       9
97.0       6
Name: count, dtype: int64
S01_mod
1    6728
0    6678
Name: count, dtype: int64


**Frequency of specific experiences of harassment in country in past 5 years**  
**VH01a_1** - Made offensive or threatening comments to you in person such as insulting you or calling you names?  
**VH01a_2** - Threatened you with violence in person?  
**VH01a_3** - Made offensive gestures to you or stared at you inappropriately?  
**VH01a_4** - Sent you emails or text messages (SMS) that were offensive or threatening?  
**VH01a_5** - Posted offensive comments about you on the internet, for example on Facebook or Twitter?  

**VH01a_1_mod, VH01a_2_mod, VH01a_3_mod, VH01a_4_mod, VH01a_5_mod** - Categorical    

0 - Did not happen/0 times  
1 - Once  
2 - Twice  
3 - Three times  
4 - Four times  
5 - Five times  
6 - 6 to 10 times  
7 - More than 10 times  
8 - All the time (daily)  

Create a new variable to get rid of any possible correlation:  
**VH01a** - Categorical (frequency of harassment experiences)  ------ doesn't work

In [77]:
# The five harassment-frequency items get exactly the same treatment, so they
# are processed in loops instead of five copy-pasted statements each.
VH01a_items = ['VH01a_1', 'VH01a_2', 'VH01a_3', 'VH01a_4', 'VH01a_5']

# Codes folded into category 0 of the *_mod variables.
# NOTE(review): presumably refused / doesn't understand / don't know -
# confirm against the codebook.
VH01a_zero_codes = [96, 97, 99]

# Check for missing values in variables
for item in VH01a_items:
    print(f"Number of missing values in {item}:", df_subset_job_seekers[item].isnull().sum())

# Concatenate frequency tables of the raw items horizontally
VH01a_values = pd.concat(
    [df_subset_job_seekers[item].value_counts().rename(f'{item}_count') for item in VH01a_items],
    axis=1,
)

# Display horizontally
display(VH01a_values)

# Create new variables VH01a_<k>_mod: listed codes become 0, everything else
# (including NaN) is kept. Vectorised isin/mask replaces the per-row lambda.
for item in VH01a_items:
    raw_item = df_subset_job_seekers[item]
    df_subset_job_seekers.loc[:, f'{item}_mod'] = raw_item.mask(raw_item.isin(VH01a_zero_codes), 0)

# Concatenate frequency tables of the recoded items horizontally
VH01a_mod_values = pd.concat(
    [df_subset_job_seekers[f'{item}_mod'].value_counts().rename(f'{item}_mod_count') for item in VH01a_items],
    axis=1,
)

# Display horizontally
display(VH01a_mod_values)

Number of missing values in VH01a_1: 0
Number of missing values in VH01a_2: 0
Number of missing values in VH01a_3: 0
Number of missing values in VH01a_4: 0
Number of missing values in VH01a_5: 0


Unnamed: 0,VH01a_1_count,VH01a_2_count,VH01a_3_count,VH01a_4_count,VH01a_5_count
0.0,8499,11580,8406,12771,12673
1.0,1082,737,766,172,147
2.0,881,335,766,104,86
7.0,756,124,1023,67,95
3.0,593,203,554,53,51
6.0,401,80,466,35,40
5.0,352,94,392,32,33
4.0,300,77,277,23,20
99.0,246,84,251,67,143
8.0,209,36,430,15,30


Unnamed: 0,VH01a_1_mod_count,VH01a_2_mod_count,VH01a_3_mod_count,VH01a_4_mod_count,VH01a_5_mod_count
0.0,8832,11720,8732,12905,12904
1.0,1082,737,766,172,147
2.0,881,335,766,104,86
7.0,756,124,1023,67,95
3.0,593,203,554,53,51
6.0,401,80,466,35,40
5.0,352,94,392,32,33
4.0,300,77,277,23,20
8.0,209,36,430,15,30


**Extent of discrimination on different grounds in country**  
**RA03_1** - Prevalence of discrimination on the basis of skin colour  
**RA03_2** - Prevalence of discrimination on the basis of ethnic origin or immigrant background  
**RA03_3** - Prevalence of discrimination on the basis of religion or religious beliefs  

**RA03_1_mod**, **RA03_2_mod**, **RA03_3_mod** - Categorical  

0 - No discrimination  
1 - Very rare  
2 - Fairly rare  
3 - Fairly widespread  
4 - Very widespread  

In [78]:
# The three perceived-discrimination items get exactly the same treatment, so
# they are processed in loops instead of three copy-pasted statements each.
RA03_items = ['RA03_1', 'RA03_2', 'RA03_3']

# Codes folded into category 0 of the *_mod variables.
# NOTE(review): 5 is presumably the "no discrimination" answer and 96-99 the
# non-response codes - confirm against the codebook.
RA03_zero_codes = [5, 96, 97, 98, 99]

# Check for missing values in variables
for item in RA03_items:
    print(f"Number of missing values in {item}:", df_subset_job_seekers[item].isnull().sum())

# Concatenate frequency tables of the raw items horizontally
RA03_values = pd.concat(
    [df_subset_job_seekers[item].value_counts().rename(f'{item}_count') for item in RA03_items],
    axis=1,
)

# Display horizontally
display(RA03_values)

# Create new variables RA03_<k>_mod: listed codes become 0, everything else
# (including NaN) is kept. Vectorised isin/mask replaces the per-row lambda.
for item in RA03_items:
    raw_item = df_subset_job_seekers[item]
    df_subset_job_seekers.loc[:, f'{item}_mod'] = raw_item.mask(raw_item.isin(RA03_zero_codes), 0)

# Concatenate frequency tables of the recoded items horizontally
RA03_mod_values = pd.concat(
    [df_subset_job_seekers[f'{item}_mod'].value_counts().rename(f'{item}_mod_count') for item in RA03_items],
    axis=1,
)

# Display horizontally
display(RA03_mod_values)

Number of missing values in RA03_1: 0
Number of missing values in RA03_2: 0
Number of missing values in RA03_3: 0


Unnamed: 0,RA03_1_count,RA03_2_count,RA03_3_count
3.0,3902,4211,2593
2.0,3286,3170,3006
1.0,3137,2771,4187
4.0,1974,2262,1845
5.0,908,774,1390
99.0,139,138,309
98.0,24,21,25
97.0,18,40,32
96.0,18,19,19


Unnamed: 0,RA03_1_mod_count,RA03_2_mod_count,RA03_3_mod_count
3.0,3902,4211,2593
2.0,3286,3170,3006
1.0,3137,2771,4187
4.0,1974,2262,1845
0.0,1107,992,1775


**Ability of household to make ends meet**  
**SI06** - Thinking of your household's total income, is your household able to make ends meet?  

**SI06_mod** - Binary  
0 - No  
1 - Yes  

In [79]:
# Inspect SI06: number of missing values and frequency of each answer code
si06 = df_subset_job_seekers['SI06']
print("Number of missing values in SI06:", si06.isna().sum())
print(si06.value_counts())  # Print frequency table

# SI06_mod is 1 for codes 4 to 6 inclusive (fairly easily, easily or very easily)
# and 0 for every other value
in_easy_range = si06.between(4, 6)
df_subset_job_seekers.loc[:, 'SI06_mod'] = in_easy_range.astype('int64')
print(df_subset_job_seekers['SI06_mod'].value_counts())  # Print frequency table

Number of missing values in SI06: 0
SI06
1.0     3934
3.0     3192
2.0     2776
4.0     2102
5.0      945
6.0      262
96.0      95
99.0      93
97.0       7
Name: count, dtype: int64
SI06_mod
0    10097
1     3309
Name: count, dtype: int64


In [91]:
#tab = pd.concat([df_subset_job_seekers['SI06'].value_counts().rename('SI06'),
#                 df_subset_job_seekers['SI06_mod'].value_counts().rename('SI06_mod')], axis = 1)
#tab = tab.fillna('-')
#display(tab)

**Housing**  
HLS02 and DHO01

**HLS02** - Do you own or rent this accommodation?  
**HLS02_mod** - Binary  
0 - Rent  
1 - Own  

In [92]:
# Inspect HLS02: number of missing values and frequency of each answer code
hls02 = df_subset_job_seekers['HLS02']
print("Number of missing values in HLS02:", hls02.isna().sum())
print(hls02.value_counts())  # Print frequency table

# HLS02_mod is 1 for codes 1 and 2 (own with or without mortgage) and 0 for
# every other value; missing values therefore also end up as 0
owns_home = hls02.isin([1, 2])
df_subset_job_seekers.loc[:, 'HLS02_mod'] = owns_home.astype('int64')
print(df_subset_job_seekers['HLS02_mod'].value_counts())  # Print frequency table

Number of missing values in HLS02: 2
HLS02
4.0     4212
3.0     4188
1.0     2667
2.0     1048
5.0      985
6.0      178
96.0      84
99.0      39
97.0       3
Name: count, dtype: int64
HLS02_mod
0    9691
1    3715
Name: count, dtype: int64


**DHO01** - In the past 5 years, have you ever tried to rent or buy an apartment or a house?  
**DHO01_mod** - Binary  
0 - No  
1 - Yes  

In [93]:
# Inspect DHO01: number of missing values and frequency of each answer code
dho01 = df_subset_job_seekers['DHO01']
print("Number of missing values in DHO01:", dho01.isna().sum())
print(dho01.value_counts())  # Print frequency table

# DHO01_mod is 1 where DHO01 equals 1 and 0 for every other value
dho01_is_one = dho01 == 1
df_subset_job_seekers.loc[:, 'DHO01_mod'] = np.where(dho01_is_one, 1, 0)
print(df_subset_job_seekers['DHO01_mod'].value_counts())  # Print frequency table

Number of missing values in DHO01: 0
DHO01
2.0     8148
1.0     5216
96.0      17
97.0      14
99.0      11
Name: count, dtype: int64
DHO01_mod
0    8190
1    5216
Name: count, dtype: int64


**Urbanisation**  
**DEGURBA** - degree of urbanisation  
**DEGURBA_mod** - Discrete  


In [95]:
# Inspect DEGURBA: number of missing values and frequency of each code
degurba = df_subset_job_seekers['DEGURBA']
print("Number of missing values in DEGURBA:", degurba.isna().sum())
print(degurba.value_counts())  # Print frequency table

# Recoding table: codes 1 and 3 are swapped, any other value is left as is
# NOTE(review): presumably reverses the direction of the scale - confirm
map_DEGURBA = {3: 1, 1: 3}

# Create DEGURBA_mod: apply the table, then fall back to the original value
# wherever the code has no entry in it
degurba_swapped = degurba.map(map_DEGURBA)
df_subset_job_seekers.loc[:, 'DEGURBA_mod'] = degurba_swapped.fillna(degurba)
print(df_subset_job_seekers['DEGURBA_mod'].value_counts())

Number of missing values in DEGURBA: 0
DEGURBA
1.0    8233
2.0    2789
3.0    2384
Name: count, dtype: int64
DEGURBA_mod
3.0    8233
2.0    2789
1.0    2384
Name: count, dtype: int64


**Health**  
DHE01, DHE02, DHE03, DHE04

**DHE01** - How is your health in general?  
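**DHE01_mod** - Discrete (reverse-coded from DHE01, which runs from 1 - Very good to 5 - Very bad, so that a higher value means better health)  

0 - No valid answer (codes 96, 97, 99)  
1 - Very bad  
2 - Bad  
3 - Fair  
4 - Good  
5 - Very good  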

In [97]:
# Inspect DHE01: number of missing values and frequency of each answer code
dhe01 = df_subset_job_seekers['DHE01']
print("Number of missing values in DHE01:", dhe01.isna().sum())
print(dhe01.value_counts())  # Print frequency table

# Recoding table: the 1-5 scale is reversed (3 has no entry and stays 3),
# and codes 96/97/99 become 0
map_DHE01 = {1: 5, 2: 4, 4: 2, 5: 1, 96: 0, 97: 0, 99: 0}

# Create DHE01_mod: apply the table, then fall back to the original value
# wherever the code has no entry in it.
# Result grows from non-existing answer (0) and very bad health (1)
# to very good health (5).
dhe01_recoded = dhe01.map(map_DHE01)
df_subset_job_seekers.loc[:, 'DHE01_mod'] = dhe01_recoded.fillna(dhe01)
print(df_subset_job_seekers['DHE01_mod'].value_counts())  # Print frequency table

Number of missing values in DHE01: 0
DHE01
1.0     5642
2.0     5230
3.0     1807
4.0      596
5.0      120
96.0       7
99.0       3
97.0       1
Name: count, dtype: int64
DHE01_mod
5.0    5642
4.0    5230
3.0    1807
2.0     596
1.0     120
0.0      11
Name: count, dtype: int64


**DHE02** - Do you have any longstanding illness or health problem?  
**DHE02_mod** - Binary  
0 - No  
1 - Yes  

In [105]:
# Inspect DHE02: number of missing values and frequency of each answer code
dhe02 = df_subset_job_seekers['DHE02']
print("Number of missing values in DHE02:", dhe02.isna().sum())
print(dhe02.value_counts())  # Print frequency table

# DHE02_mod is 1 where DHE02 equals 1 and 0 for every other value
dhe02_is_one = dhe02.eq(1)
df_subset_job_seekers.loc[:, 'DHE02_mod'] = np.where(dhe02_is_one, 1, 0)
print(df_subset_job_seekers['DHE02_mod'].value_counts())  # Print frequency table

Number of missing values in DHE02: 0
DHE02
2.0     11299
1.0      2033
96.0       48
99.0       20
97.0        6
Name: count, dtype: int64
DHE02_mod
0    11373
1     2033
Name: count, dtype: int64


**DHE03** - For at least the past six months, to what extent have you been limited because of a health problem in activities people usually do?  
**DHE03_mod** - Discrete  
1 - Severely limited  
2 - Limited but not severely  
3 - Not limited at all  

In [103]:
# Inspect DHE03: number of missing values and frequency of each answer code
dhe03 = df_subset_job_seekers['DHE03']
print("Number of missing values in DHE03:", dhe03.isna().sum())
print(dhe03.value_counts())  # Print frequency table

# Recoding table: codes 96/97/99 are assigned to category 3,
# any other value is left as is
map_DHE03 = {code: 3 for code in (96, 97, 99)}

# Create DHE03_mod: apply the table, then fall back to the original value
# wherever the code has no entry in it
dhe03_recoded = dhe03.map(map_DHE03)
df_subset_job_seekers.loc[:, 'DHE03_mod'] = dhe03_recoded.fillna(dhe03)
print(df_subset_job_seekers['DHE03_mod'].value_counts())  # Print frequency table

Number of missing values in DHE03: 0
DHE03
3.0     11180
2.0      1638
1.0       521
96.0       35
99.0       22
97.0       10
Name: count, dtype: int64
DHE03_mod
3.0    11247
2.0     1638
1.0      521
Name: count, dtype: int64


**DHE04** - Does the [NATIONAL BASIC HEALTH INSURANCE SCHEME] currently cover your health care expenses?  
**DHE04_mod** - Binary  
0 - No  
1 - Yes  

In [104]:
# Inspect DHE04: number of missing values and frequency of each answer code
dhe04 = df_subset_job_seekers['DHE04']
print("Number of missing values in DHE04:", dhe04.isna().sum())
print(dhe04.value_counts())  # Print frequency table

# DHE04_mod is 1 where DHE04 equals 1 and 0 for every other value
dhe04_is_one = dhe04 == 1
df_subset_job_seekers.loc[:, 'DHE04_mod'] = np.where(dhe04_is_one, 1, 0)
print(df_subset_job_seekers['DHE04_mod'].value_counts())  # Print frequency table

Number of missing values in DHE04: 0
DHE04
1.0     11009
2.0      1979
99.0      299
97.0       76
96.0       43
Name: count, dtype: int64
DHE04_mod
1    11009
0     2397
Name: count, dtype: int64


**Awareness of experiences of violence among friends and family because of ethnic minority background in past 12 months**  
**VV10_1** - Have you heard of anyone in your circle of family or friends being insulted or called names?  
**VV10_2** - Have you heard of anyone in your circle of family or friends being physically attacked?  

**Awareness of support organisations in country**  
**RA04** - Do you know of any organisations in [COUNTRY] that offer support or advice to people who have been discriminated against - for whatever reason?  
  

  

  



**Interethnic relationships + neighbourhood**  
**PB10_1** - Do you have friends who are of another ethnic minority background than you?  
**PB10_2** - Do you have friends who do not have a minority background?   
**PB11** - In the neighbourhood where you live how many of the residents would you say are [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background as you: all of the residents most of them some or none of them?  

**Social transfers + poverty** 
**arop** - At risk of poverty after social transfers  
**SI01_04** - Income from education allowance (grants, stipends)  <------------  
**SI01_05** - Pensions                                            <------------  
**SI01_06** - Unemployment benefits                               <------------  
**SI01_07** - Child benefits (including alimonies)                <------------  
**SI01_08** - Other social benefits (social assistance, rent support, donations from charity) <------------  

**Reasons for coming to country** (**asked only to non-EU/non-EFTA candidates**)  
**PR01_01** - Employment/work   
**PR01_02** - Family reasons/joined a partner/marriage   
**PR01_03** - Came with my parents when I was a child  
**PR01_04** - Study  
**PR01_05** - Seeking asylum/protection  
**PR01_06** - Other (please specify)  
**PR01_07** - I did not plan to stay in  [COUNTRY]  
**PR01_96** - Refused  
**PR01_97** - Doesn't understand the question  
**PR01_99** - Don't know  

**Residence** (needs to be imputed)  
**res_stat** - Residence and citizenship status  




**Institutional experiences**  
**DO24** - In the past 5 years did any government official in  [COUNTRY] for example a customs or police officer a judge or an inspector ask you or expect you to pay a bribe for his or her services?    
**DO27** - In the past 5 years in [COUNTRY] (or since you have been in [country]) have you ever been stopped searched or questioned by the police?  