In [1]:
# Libraries used
import pandas as pd
import numpy as np
import pyreadr

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None  # Suppress the warnings

In [2]:
# Read the RData file; the result maps each stored R object name to a data frame
rdata_path = '../01_data/minorities_discrimination_survey.RData'
data = pyreadr.read_r(rdata_path)

# Show which objects the file contains
print(data.keys())

odict_keys(['min_dis'])


In [3]:
# The survey is stored under the 'min_dis' key of the loaded RData content
df_full = data["min_dis"]

# Preview the first 5 rows
df_full.head(n=5)

Unnamed: 0,za_nr,version,doi,country,DEGURBA,ALTURBA,C1,C2,C3,typint,...,dis5_4_other,dis12overall10,dis5overall9,redisOverall,res_stat,sec_res,SI03_2_H_stat,SI03_3_H,pweightadj,hweightadj
0,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,1.0,1.0,,3.0,1.0,0.0,15.0,0.075758,0.115287
1,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,15.0,,0.115287
2,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,1.0,1.0,,5.0,2.0,0.0,16.0,0.075758,0.057643
3,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,16.0,,0.057643
4,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,16.0,,0.057643


In [4]:
#'EU05', # Are you currently looking for work?  (this doesn't need to be modified - can be removed from Leti's doc)
#'EU07', # Have you ever looked for work in the past 5 years in  [COUNTRY] (or since you have been in [country])?

# Keep only job seekers: respondents who are currently looking for work (EU05 == 1)
# or who looked for work in the past 5 years (EU07 == 1).
is_job_seeker = (df_full['EU05'] == 1) | (df_full['EU07'] == 1)

# .copy() makes the subset an independent frame, so modifying it afterwards is
# unambiguous and does not rely on pandas' SettingWithCopy warning being silenced.
df_job_seekers = df_full[is_job_seeker].copy()

df_job_seekers.head()

Unnamed: 0,za_nr,version,doi,country,DEGURBA,ALTURBA,C1,C2,C3,typint,...,dis5_4_other,dis12overall10,dis5overall9,redisOverall,res_stat,sec_res,SI03_2_H_stat,SI03_3_H,pweightadj,hweightadj
0,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,1.0,1.0,,3.0,1.0,0.0,15.0,0.075758,0.115287
12,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,0.0,1.0,,5.0,2.0,0.0,19.0,0.075758,0.057643
16,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,1.0,1.0,,5.0,2.0,0.0,17.0,0.075758,0.230573
17,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,1.0,1.0,,3.0,1.0,0.0,14.0,0.075758,0.046115
22,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,,,,2.0,...,0.0,1.0,1.0,,5.0,2.0,0.0,10.0,0.075758,0.230573


In [5]:
# Drop the two variables that were only needed to filter for job seekers.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises a
# KeyError for the columns that are already gone.
df_job_seekers = df_job_seekers.drop(columns=['EU05', 'EU07'], errors='ignore')

# Number of missing values per remaining column
df_job_seekers.isnull().sum()

za_nr               0
version             0
doi                 0
country             0
DEGURBA             0
                 ... 
sec_res          3987
SI03_2_H_stat       0
SI03_3_H            0
pweightadj          0
hweightadj          0
Length: 1281, dtype: int64

In [6]:
# Absolute number of missing values per column
NA_count = df_job_seekers.isnull().sum()

# Proportion of missing values per column: missing count divided by the number of rows
NA_count / df_job_seekers.shape[0]

za_nr            0.000000
version          0.000000
doi              0.000000
country          0.000000
DEGURBA          0.000000
                   ...   
sec_res          0.297404
SI03_2_H_stat    0.000000
SI03_3_H         0.000000
pweightadj       0.000000
hweightadj       0.000000
Length: 1281, dtype: float64

In [7]:
# Compare the number of rows before and after restricting to job seekers
n_rows_full = len(df_full)
n_rows_job_seekers = len(df_job_seekers)
print("Number of observations before filtering:", n_rows_full)
print("Number of observations after filtering for job seekers:", n_rows_job_seekers)

# (rows, columns) of the unfiltered data
df_full.shape

Number of observations before filtering: 77656
Number of observations after filtering for job seekers: 13406


(77656, 1283)

In [8]:
# Manually select variables of interest

# Column names kept for the analysis, grouped by their intended role:
#   - EUD01_* answer flags, used to build the label
#   - features used directly
#   - features that are recoded (discrete/binary) before use
# The trailing comments give the survey question behind each variable.
variables_of_interest = [

    # To be used for LABEL
    'EUD01_01', ### Discriminated when looking for work in the past 5 years: Skin colour
    'EUD01_02', ### Discriminated when looking for work in the past 5 years: [ethnic origin or immigrant background / ethnic origin (tailored to target group)]
    'EUD01_03', ### Discriminated when looking for work in the past 5 years: Religion or religious beliefs
    'EUD01_04', ### Discriminated when looking for work in the past 5 years: Age (such as being too young or too old)
    'EUD01_05', ### Discriminated when looking for work in the past 5 years: Sex/gender (such as being a man or a woman)
    'EUD01_06', ### Discriminated when looking for work in the past 5 years: Disability
    'EUD01_07', ### Discriminated when looking for work in the past 5 years: Sexual orientation (such as being gay, lesbian or bisexual)
    'EUD01_08', ### Discriminated when looking for work in the past 5 years: Other (please specify)
    'EUD01_09', ### Discriminated when looking for work in the past 5 years: I haven't felt discriminated against on any ground when looking for work in the past 5 years
    'EUD01_96', ### Discriminated when looking for work in the past 5 years: Refused
    'EUD01_97', ### Discriminated when looking for work in the past 5 years: Doesn't understand the question
    'EUD01_99', ### Discriminated when looking for work in the past 5 years: Don't know


    # To be used as FEATURES DIRECTLY
    'id', # unique identification number for a respondent
    'country', # country of interview
    'HH02', # Age
    'HH03', # Gender
    'HH04', # Current situation (Job)
    'IN02', # Target group
    'SI03_3_H', # Household combined net monthly income (in income bands)  <------------
    'SI03_2_H_stat',


    # To be used as FEATURES AFTER TRANSFORMATION (Discrete/Binary)

    # Religion (small change to variable)
    'PB01', # What is your religion?

    # Current marital status
    'S01', # What is your current marital status?

    ## Frequency of specific experiences of harassment in country in past 5 years
    'VH01a_1', ### In the past 5 years in [COUNTRY], how many times has somebody made offensive or threatening comments to you in person such as insulting you or calling you names?
    'VH01a_2', ### In the past 5 years in [COUNTRY], how many times has somebody threatened you with violence in person?
    'VH01a_3', ### In the past 5 years in [COUNTRY], how many times has somebody made offensive gestures to you or stared at you inappropriately?
    'VH01a_4', ### In the past 5 years in [COUNTRY], how many times has somebody sent you emails or text messages (SMS) that were offensive or threatening?
    'VH01a_5', ### In the past 5 years in [COUNTRY], how many times has somebody posted offensive comments about you on the internet, for example on Facebook or Twitter?

    ## Extent of discrimination on different grounds in country
    'RA03_1', # Prevalence of discrimination on the basis of skin colour in [country]
    'RA03_2', # Prevalence of discrimination on the basis of ethnic origin or immigrant background in [country]
    'RA03_3', # Prevalence of discrimination on the basis of religion or religious beliefs in [country]

    # Ability of household to make ends meet
    'SI06', # Thinking of your household's total income, is your household able to make ends meet?

    # Housing
    'HLS02', # Do you own or rent this accommodation?

    # Urbanisation
    'DEGURBA', # degree of urbanisation

    # Health
    'DHE01', ### How is your health in general?
    'DHE02', ### Do you have any longstanding illness or health problem?
    'DHE03', ### For at least the past six months, to what extent have you been limited because of a health problem in activities people usually do?
    'DHE04', ### Does the [NATIONAL BASIC HEALTH INSURANCE SCHEME] currently cover your health care expenses?

    # Trying to rent or buy an apartment or a house in country in past 5 years
    'DHO01', ### In the past 5 years in [COUNTRY] have you ever tried to rent or buy an apartment or a house?

    # Awareness of experiences of violence among friends and family because of ethnic minority background in past 12 months
    'VV10_1', ### In the past 12 months, have you heard of anyone in your circle of family or friends being insulted or called names because of their [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background?
    'VV10_2', ### In the past 12 months, have you heard of anyone in your circle of family or friends being physically attacked because of their [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background?

    # Awareness of support organisations in country
    'RA04', # Do you know of any organisations in [COUNTRY] that offer support or advice to people who have been discriminated against - for whatever reason?

    # Interethnic relationships + neighbourhood
    'PB10_1', # Do you have friends who are of another ethnic minority background than you?
    'PB10_2', # Do you have friends who do not have a minority background?
    'PB11', # In the neighbourhood where you live, how many of the residents would you say are of the same [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background as you: all of the residents, most of them, some or none of them?

    # Social transfers + poverty
    'arop', # At risk of poverty after social transfers
    'SI01_04', # Income from education allowance (grants, stipends)  <------------
    'SI01_05', # Pensions                                            <------------
    'SI01_06', # Unemployment benefits                               <------------
    'SI01_07', # Child benefits (including alimonies)                <------------
    'SI01_08', # Other social benefits (social assistance, rent support, donations from charity) <------------

    # Reasons for coming to country (**asked only to non-EU/non-EFTA candidates**)
    'PR01_01', ### Reasons for coming to [COUNTRY]: Employment/work
    'PR01_02', ### Reasons for coming to [COUNTRY]: Family reasons/joined a partner/marriage
    'PR01_03', ### Reasons for coming to [COUNTRY]: Came with my parents when I was a child
    'PR01_04', ### Reasons for coming to [COUNTRY]: Study
    'PR01_05', ### Reasons for coming to [COUNTRY]: Seeking asylum/protection
    'PR01_06', ### Reasons for coming to [COUNTRY]: Other (please specify)
    'PR01_07', ### Reasons for coming to [COUNTRY]: I did not plan to stay in [COUNTRY]
    'PR01_96', ### Reasons for coming to [COUNTRY]: Refused
    'PR01_97', ### Reasons for coming to [COUNTRY]: Doesn't understand the question
    'PR01_99', ### Reasons for coming to [COUNTRY]: Don't know

    # Residence (needs to be imputed)
    'res_stat', ### Residence and citizenship status

    # Institutional experiences
    'DO27' ### In the past 5 years in [COUNTRY] (or since you have been in [country]) have you ever been stopped, searched or questioned by the police?
]
    

In [9]:
# Keep only the manually selected variables.
# .copy() makes the subset an independent frame, so columns assigned to it afterwards
# are written to this frame itself rather than to a temporary slice of df_job_seekers
# (which is what pandas' SettingWithCopy warning is about).
df_subset_job_seekers = df_job_seekers[variables_of_interest].copy()

# First look at the subset
df_subset_job_seekers.head()

Unnamed: 0,EUD01_01,EUD01_02,EUD01_03,EUD01_04,EUD01_05,EUD01_06,EUD01_07,EUD01_08,EUD01_09,EUD01_96,...,PR01_03,PR01_04,PR01_05,PR01_06,PR01_07,PR01_96,PR01_97,PR01_99,res_stat,DO27
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0
17,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0
22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0


In [10]:
# Remove the row limit so that the count is shown for every column
pd.options.display.max_rows = None
df_subset_job_seekers.isna().sum()

EUD01_01            0
EUD01_02            0
EUD01_03            0
EUD01_04            0
EUD01_05            0
EUD01_06            0
EUD01_07            0
EUD01_08            0
EUD01_09            0
EUD01_96            0
EUD01_97            0
EUD01_99            0
id                  0
country             0
HH02                0
HH03                0
HH04                0
IN02                0
SI03_3_H            0
SI03_2_H_stat       0
PB01                0
S01                 0
VH01a_1             0
VH01a_2             0
VH01a_3             0
VH01a_4             0
VH01a_5             0
RA03_1              0
RA03_2              0
RA03_3              0
SI06                0
HLS02               2
DEGURBA             0
DHE01               0
DHE02               0
DHE03               0
DHE04               0
DHO01               0
VV10_1              0
VV10_2              0
RA04                0
PB10_1              0
PB10_2              0
PB11                0
arop             2700
SI01_04   

# Label y:

Discriminated when looking for work in the past 5 years  
**EUD01_01, EUD01_02, EUD01_03, EUD01_04,  
EUD01_05, EUD01_06, EUD01_07, EUD01_08,  
EUD01_09, EUD01_96, EUD01_97, EUD01_99**  
  
**discrimination_occurred** - Binary  
0 - No   
1 - Yes

In [11]:
# Define the target variable
def check_discrimination(row):
    """Derive the binary discrimination label from the EUD01_* answer flags of one row.

    Returns 0 as soon as one of EUD01_09 (not discriminated), EUD01_96 (refused),
    EUD01_97 (doesn't understand the question) or EUD01_99 (don't know) equals 1;
    these are checked first and therefore take precedence over the ground flags.
    Otherwise returns 1 if one of EUD01_01 ... EUD01_08 equals 1.
    Returns None when none of the twelve flags equals 1.
    """
    no_discrimination_flags = ('EUD01_09', 'EUD01_96', 'EUD01_97', 'EUD01_99')
    ground_flags = ('EUD01_01', 'EUD01_02', 'EUD01_03', 'EUD01_04',
                    'EUD01_05', 'EUD01_06', 'EUD01_07', 'EUD01_08')

    for flag in no_discrimination_flags:
        if row[flag] == 1:
            return 0  # No discrimination occurred

    for flag in ground_flags:
        if row[flag] == 1:
            return 1  # Discrimination occurred

    return None  # no flag set at all

# Label every respondent (row-wise) and store the result as a new column
discrimination_labels = df_subset_job_seekers.apply(check_discrimination, axis = 1)
df_subset_job_seekers.loc[:, 'discrimination_occurred'] = discrimination_labels

# Class balance of the label
label_counts = df_subset_job_seekers['discrimination_occurred'].value_counts()
print(label_counts)

discrimination_occurred
0    8236
1    5170
Name: count, dtype: int64


In [12]:
# Drop the EUD01_* answer flags that were used to construct discrimination_occurred.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution no longer raises a KeyError.
eud01_columns = ['EUD01_01', 'EUD01_02', 'EUD01_03', 'EUD01_04',
                 'EUD01_05', 'EUD01_06', 'EUD01_07', 'EUD01_08',
                 'EUD01_09', 'EUD01_96', 'EUD01_97', 'EUD01_99']
df_subset_job_seekers = df_subset_job_seekers.drop(columns=eud01_columns, errors='ignore')

# Missing values per remaining column
df_subset_job_seekers.isnull().sum()

id                            0
country                       0
HH02                          0
HH03                          0
HH04                          0
IN02                          0
SI03_3_H                      0
SI03_2_H_stat                 0
PB01                          0
S01                           0
VH01a_1                       0
VH01a_2                       0
VH01a_3                       0
VH01a_4                       0
VH01a_5                       0
RA03_1                        0
RA03_2                        0
RA03_3                        0
SI06                          0
HLS02                         2
DEGURBA                       0
DHE01                         0
DHE02                         0
DHE03                         0
DHE04                         0
DHO01                         0
VV10_1                        0
VV10_2                        0
RA04                          0
PB10_1                        0
PB10_2                        0
PB11    

# Variables to be used as FEATURES AFTER TRANSFORMATION (into discrete/binary)

# I.

<span style="color:red">**Religion**</span>   
**PB01** - What is your religion?  
**PB01_mod** - Categorical  
0 - No religion (also assigned to the special answer codes 96, 97 and 99)  
1 - Christians  
2 - Muslims

In [13]:
# Missing values and raw code frequencies of PB01
pb01_raw = df_subset_job_seekers['PB01']
print("Number of missing values in PB01:", pb01_raw.isnull().sum())
print(pb01_raw.value_counts())  # Print frequency table

# PB01_mod: the special codes 96/97/99 are set to 0, all other values are kept
pb01_is_special = pb01_raw.isin([96, 97, 99])
df_subset_job_seekers.loc[:, 'PB01_mod'] = pb01_raw.where(~pb01_is_special, 0)
print(df_subset_job_seekers['PB01_mod'].value_counts())  # Print frequency table




Number of missing values in PB01: 0
PB01
2.0     6384
1.0     5528
0.0     1095
99.0     399
Name: count, dtype: int64
PB01_mod
2.0    6384
1.0    5528
0.0    1494
Name: count, dtype: int64


# II.

<span style="color:red">**Current marital status**</span>  
**S01** - What is your current marital status?  
**S01_mod** - Binary  
1 - Married  
0 - Not married (any other answer, including the special codes 96, 97 and 99)

In [14]:
# Missing values and raw code frequencies of S01
s01_raw = df_subset_job_seekers['S01']
print("Number of missing values in S01:", s01_raw.isnull().sum())
print(s01_raw.value_counts())  # Print frequency table

# S01_mod: 1 where S01 equals 1, 0 for every other value
df_subset_job_seekers.loc[:, 'S01_mod'] = s01_raw.eq(1).astype(int)
print(df_subset_job_seekers['S01_mod'].value_counts())  # Print frequency table



Number of missing values in S01: 0
S01
1.0     6728
5.0     5124
3.0      792
2.0      392
4.0      264
96.0      91
99.0       9
97.0       6
Name: count, dtype: int64
S01_mod
1    6728
0    6678
Name: count, dtype: int64


# III.

<span style="color:red">**Frequency of specific experiences of harassment in country in past 5 years**</span>  
**VH01a_1** - Made offensive or threatening comments to you in person such as insulting you or calling you names?  
**VH01a_2** - Threatened you with violence in person?  
**VH01a_3** - Made offensive gestures to you or stared at you inappropriately?  
**VH01a_4** - Sent you emails or text messages (SMS) that were offensive or threatening?  
**VH01a_5** - Posted offensive comments about you on the internet, for example on Facebook or Twitter?  

**VH01a_1_mod, VH01a_2_mod, VH01a_3_mod, VH01a_4_mod, VH01a_5_mod** - Categorical    

0 - Did not happen/0 times  
1 - Once  
2 - Twice  
3 - Three times  
4 - Four times  
5 - Five times  
6 - 6 to 10 times  
7 - More than 10 times  
8 - All the time (daily)  

Create a new variable to get rid of any possible correlation:  
**VH01a** - Categorical (frequency of harassment experiences)

In [15]:
# The five harassment items are all treated the same way
vh01a_items = ['VH01a_1', 'VH01a_2', 'VH01a_3', 'VH01a_4', 'VH01a_5']

# Missing values per item
for item in vh01a_items:
    print(f"Number of missing values in {item}:", df_subset_job_seekers[item].isnull().sum())

# Raw frequency tables side by side (one column per item)
VH01a_values = pd.concat(
    [df_subset_job_seekers[item].value_counts().rename(f'{item}_count') for item in vh01a_items],
    axis = 1,
)
display(VH01a_values)

# <item>_mod: the special codes 96/97/99 are set to 0, all other values are kept
for item in vh01a_items:
    answers = df_subset_job_seekers[item]
    df_subset_job_seekers.loc[:, f'{item}_mod'] = answers.where(~answers.isin([96, 97, 99]), 0)

# Frequency tables of the recoded items side by side
VH01a_mod_values = pd.concat(
    [df_subset_job_seekers[f'{item}_mod'].value_counts().rename(f'{item}_mod_count') for item in vh01a_items],
    axis = 1,
)
display(VH01a_mod_values)


Number of missing values in VH01a_1: 0
Number of missing values in VH01a_2: 0
Number of missing values in VH01a_3: 0
Number of missing values in VH01a_4: 0
Number of missing values in VH01a_5: 0


Unnamed: 0,VH01a_1_count,VH01a_2_count,VH01a_3_count,VH01a_4_count,VH01a_5_count
0.0,8499,11580,8406,12771,12673
1.0,1082,737,766,172,147
2.0,881,335,766,104,86
7.0,756,124,1023,67,95
3.0,593,203,554,53,51
6.0,401,80,466,35,40
5.0,352,94,392,32,33
4.0,300,77,277,23,20
99.0,246,84,251,67,143
8.0,209,36,430,15,30


Unnamed: 0,VH01a_1_mod_count,VH01a_2_mod_count,VH01a_3_mod_count,VH01a_4_mod_count,VH01a_5_mod_count
0.0,8832,11720,8732,12905,12904
1.0,1082,737,766,172,147
2.0,881,335,766,104,86
7.0,756,124,1023,67,95
3.0,593,203,554,53,51
6.0,401,80,466,35,40
5.0,352,94,392,32,33
4.0,300,77,277,23,20
8.0,209,36,430,15,30


In [16]:
# Combine the five recoded harassment items into a single score per respondent (row-wise sum)
vh01a_mod_columns = ['VH01a_1_mod', 'VH01a_2_mod', 'VH01a_3_mod', 'VH01a_4_mod', 'VH01a_5_mod']
vh01a_total = df_subset_job_seekers[vh01a_mod_columns].sum(axis = 1)
df_subset_job_seekers.loc[:, 'VH01a'] = vh01a_total
print(df_subset_job_seekers['VH01a'].value_counts())  # Print frequency table


VH01a
0.0     7282
2.0      791
1.0      781
3.0      531
7.0      499
6.0      424
4.0      380
5.0      367
8.0      355
14.0     298
10.0     231
12.0     228
9.0      194
13.0     168
11.0     145
15.0     137
16.0     125
17.0      76
21.0      72
19.0      51
18.0      51
20.0      41
24.0      34
23.0      33
22.0      24
25.0      12
28.0      11
35.0      10
30.0      10
29.0       9
26.0       7
32.0       6
27.0       5
34.0       4
31.0       4
40.0       4
33.0       3
37.0       2
39.0       1
Name: count, dtype: int64


# IV.

<span style="color:red">**Extent of discrimination on different grounds in country**</span>  
**RA03_1** - Prevalence of discrimination on the basis of skin colour  
**RA03_2** - Prevalence of discrimination on the basis of ethnic origin or immigrant background  
**RA03_3** - Prevalence of discrimination on the basis of religion or religious beliefs  

**RA03_1_mod**, **RA03_2_mod**, **RA03_3_mod** - Categorical  

0 - No discrimination  
1 - Very rare  
2 - Fairly rare  
3 - Fairly widespread  
4 - Very widespread  

Final variable:  
**RA03** - Discrete

In [17]:
# The three prevalence-of-discrimination items are all treated the same way
ra03_items = ['RA03_1', 'RA03_2', 'RA03_3']

# Missing values per item
for item in ra03_items:
    print(f"Number of missing values in {item}:", df_subset_job_seekers[item].isnull().sum())

# Raw frequency tables side by side (one column per item)
RA03_values = pd.concat(
    [df_subset_job_seekers[item].value_counts().rename(f'{item}_count') for item in ra03_items],
    axis = 1,
)
display(RA03_values)

# <item>_mod: code 5 and the special codes 96-99 are set to 0, the values 1-4 are kept
for item in ra03_items:
    answers = df_subset_job_seekers[item]
    df_subset_job_seekers.loc[:, f'{item}_mod'] = answers.where(~answers.isin([5, 96, 97, 98, 99]), 0)

# Frequency tables of the recoded items side by side
RA03_mod_values = pd.concat(
    [df_subset_job_seekers[f'{item}_mod'].value_counts().rename(f'{item}_mod_count') for item in ra03_items],
    axis = 1,
)
display(RA03_mod_values)


Number of missing values in RA03_1: 0
Number of missing values in RA03_2: 0
Number of missing values in RA03_3: 0


Unnamed: 0,RA03_1_count,RA03_2_count,RA03_3_count
3.0,3902,4211,2593
2.0,3286,3170,3006
1.0,3137,2771,4187
4.0,1974,2262,1845
5.0,908,774,1390
99.0,139,138,309
98.0,24,21,25
97.0,18,40,32
96.0,18,19,19


Unnamed: 0,RA03_1_mod_count,RA03_2_mod_count,RA03_3_mod_count
3.0,3902,4211,2593
2.0,3286,3170,3006
1.0,3137,2771,4187
4.0,1974,2262,1845
0.0,1107,992,1775


In [18]:
# Combine the three recoded prevalence items into a single score per respondent (row-wise sum)
ra03_mod_columns = ['RA03_1_mod', 'RA03_2_mod', 'RA03_3_mod']
ra03_total = df_subset_job_seekers[ra03_mod_columns].sum(axis = 1)
df_subset_job_seekers.loc[:, 'RA03'] = ra03_total
print(df_subset_job_seekers['RA03'].value_counts())  # Print frequency table


RA03
3.0     2164
6.0     1924
9.0     1896
8.0     1245
7.0     1195
5.0      989
12.0     863
10.0     788
0.0      769
4.0      660
11.0     504
2.0      262
1.0      147
Name: count, dtype: int64


# V.

<span style="color:red">**Ability of household to make ends meet**</span>  
**SI06** - Thinking of your household's total income, is your household able to make ends meet?  

**SI06_mod** - Binary  
0 - No  
1 - Yes  

In [19]:
# Missing values and raw code frequencies of SI06
si06_raw = df_subset_job_seekers['SI06']
print("Number of missing values in SI06:", si06_raw.isnull().sum())
print(si06_raw.value_counts())  # Print frequency table

# SI06_mod: 1 for the codes 4 to 6 (fairly easily, easily or very easily), 0 for everything else
df_subset_job_seekers.loc[:, 'SI06_mod'] = si06_raw.between(4, 6).astype(int)
print(df_subset_job_seekers['SI06_mod'].value_counts())  # Print frequency table



Number of missing values in SI06: 0
SI06
1.0     3934
3.0     3192
2.0     2776
4.0     2102
5.0      945
6.0      262
96.0      95
99.0      93
97.0       7
Name: count, dtype: int64
SI06_mod
0    10097
1     3309
Name: count, dtype: int64


# VI.

<span style="color:red">**Housing**</span>  
HLS02 and DHO01  
**HLS02** - Do you own or rent this accommodation?  
**HLS02_mod** - Binary  
0 - Rent  
1 - Own  

In [20]:
# Missing values and raw code frequencies of HLS02
hls02_raw = df_subset_job_seekers['HLS02']
print("Number of missing values in HLS02:", hls02_raw.isnull().sum())
print(hls02_raw.value_counts())  # Print frequency table

# HLS02_mod: 1 for the codes 1 and 2 (own with or without mortgage), 0 for everything else.
# Note that missing values and the special codes therefore also end up as 0.
df_subset_job_seekers.loc[:, 'HLS02_mod'] = hls02_raw.isin([1, 2]).astype(int)
print(df_subset_job_seekers['HLS02_mod'].value_counts())  # Print frequency table



Number of missing values in HLS02: 2
HLS02
4.0     4212
3.0     4188
1.0     2667
2.0     1048
5.0      985
6.0      178
96.0      84
99.0      39
97.0       3
Name: count, dtype: int64
HLS02_mod
0    9691
1    3715
Name: count, dtype: int64


**DHO01** - In the past 5 years, have you ever tried to rent or buy an apartment or a house?  
**DHO01_mod** - Binary  
0 - No  
1 - Yes  

In [21]:
# Missing values and raw code frequencies of DHO01
dho01_raw = df_subset_job_seekers['DHO01']
print("Number of missing values in DHO01:", dho01_raw.isnull().sum())
print(dho01_raw.value_counts())  # Print frequency table

# DHO01_mod: 1 where DHO01 equals 1, 0 for every other value
df_subset_job_seekers.loc[:, 'DHO01_mod'] = dho01_raw.eq(1).astype(int)
print(df_subset_job_seekers['DHO01_mod'].value_counts())  # Print frequency table


Number of missing values in DHO01: 0
DHO01
2.0     8148
1.0     5216
96.0      17
97.0      14
99.0      11
Name: count, dtype: int64
DHO01_mod
0    8190
1    5216
Name: count, dtype: int64


# VII.

<span style="color:red">**Urbanisation**</span>  
**DEGURBA** - degree of urbanisation  
**DEGURBA_mod** - Discrete  


In [22]:
# Missing values and raw code frequencies of DEGURBA
degurba_raw = df_subset_job_seekers['DEGURBA']
print("Number of missing values in DEGURBA:", degurba_raw.isnull().sum())
print(degurba_raw.value_counts())  # Print frequency table

# Recoding table: the codes 1 and 3 are swapped
map_DEGURBA = {1:3, 3:1}

# DEGURBA_mod: apply the recoding, values without an entry in the table stay unchanged
degurba_recoded = degurba_raw.map(map_DEGURBA)
df_subset_job_seekers.loc[:, 'DEGURBA_mod'] = degurba_recoded.combine_first(degurba_raw)
print(df_subset_job_seekers['DEGURBA_mod'].value_counts())


Number of missing values in DEGURBA: 0
DEGURBA
1.0    8233
2.0    2789
3.0    2384
Name: count, dtype: int64
DEGURBA_mod
3.0    8233
2.0    2789
1.0    2384
Name: count, dtype: int64


# VIII.

<span style="color:red">**Health**</span>  
DHE01, DHE02, DHE03, DHE04

**DHE01** - How is your health in general?  
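**DHE01_mod** - Discrete (scale reversed relative to DHE01, so that higher values mean better health)  

Original DHE01 coding:  
1 - Very good  
2 - Good  
3 - Fair  
4 - Bad  
5 - Very bad  

DHE01_mod coding:  
0 - No valid answer (special codes 96, 97, 99)  
1 - Very bad  
2 - Bad  
3 - Fair  
4 - Good  
5 - Very good  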

In [23]:
# Missing values and raw code frequencies of DHE01
dhe01_raw = df_subset_job_seekers['DHE01']
print("Number of missing values in DHE01:", dhe01_raw.isnull().sum())
print(dhe01_raw.value_counts())  # Print frequency table

# Recoding table: special codes become 0 and the 1-5 scale is reversed (3 has no entry and stays 3)
map_DHE01 = {96: 0, 97: 0, 99: 0, 1: 5, 5: 1, 2: 4, 4: 2}

# DHE01_mod: apply the recoding, values without an entry in the table stay unchanged.
# The result grows from non-existing answer (0) and very bad health (1) to very good health (5).
dhe01_recoded = dhe01_raw.map(map_DHE01)
df_subset_job_seekers.loc[:, 'DHE01_mod'] = dhe01_recoded.combine_first(dhe01_raw)
print(df_subset_job_seekers['DHE01_mod'].value_counts())  # Print frequency table

Number of missing values in DHE01: 0
DHE01
1.0     5642
2.0     5230
3.0     1807
4.0      596
5.0      120
96.0       7
99.0       3
97.0       1
Name: count, dtype: int64
DHE01_mod
5.0    5642
4.0    5230
3.0    1807
2.0     596
1.0     120
0.0      11
Name: count, dtype: int64


**DHE02** - Do you have any longstanding illness or health problem?  
**DHE02_mod** - Binary  
0 - No  
1 - Yes  

In [24]:
# Missing values and raw code frequencies of DHE02
dhe02_raw = df_subset_job_seekers['DHE02']
print("Number of missing values in DHE02:", dhe02_raw.isnull().sum())
print(dhe02_raw.value_counts())  # Print frequency table

# DHE02_mod: 1 where DHE02 equals 1, 0 for every other value
df_subset_job_seekers.loc[:, 'DHE02_mod'] = dhe02_raw.eq(1).astype(int)
print(df_subset_job_seekers['DHE02_mod'].value_counts())  # Print frequency table

Number of missing values in DHE02: 0
DHE02
2.0     11299
1.0      2033
96.0       48
99.0       20
97.0        6
Name: count, dtype: int64
DHE02_mod
0    11373
1     2033
Name: count, dtype: int64


**DHE03** - For at least the past six months, to what extent have you been limited because of a health problem in activities people usually do?  
**DHE03_mod** - Discrete  
1 - Severely limited  
2 - Limited but not severely  
3 - Not limited at all  

In [25]:
# Missing values and raw code frequencies of DHE03
dhe03_raw = df_subset_job_seekers['DHE03']
print("Number of missing values in DHE03:", dhe03_raw.isnull().sum())
print(dhe03_raw.value_counts())  # Print frequency table

# Recoding table: the special codes 96/97/99 are folded into code 3
map_DHE03 = {96:3, 97:3, 99:3}

# DHE03_mod: apply the recoding, values without an entry in the table stay unchanged
dhe03_recoded = dhe03_raw.map(map_DHE03)
df_subset_job_seekers.loc[:, 'DHE03_mod'] = dhe03_recoded.combine_first(dhe03_raw)
print(df_subset_job_seekers['DHE03_mod'].value_counts())  # Print frequency table

Number of missing values in DHE03: 0
DHE03
3.0     11180
2.0      1638
1.0       521
96.0       35
99.0       22
97.0       10
Name: count, dtype: int64
DHE03_mod
3.0    11247
2.0     1638
1.0      521
Name: count, dtype: int64


**DHE04** - Does the [NATIONAL BASIC HEALTH INSURANCE SCHEME] currently cover your health care expenses?  
**DHE04_mod** - Binary  
0 - No  
1 - Yes  

In [26]:
# Inspect the raw DHE04 answers first: missing count, then the frequency table
dhe04_answers = df_subset_job_seekers['DHE04']
print("Number of missing values in DHE04:", dhe04_answers.isna().sum())
print(dhe04_answers.value_counts())

# DHE04_mod is 1 only where the answer code equals 1; every other value becomes 0
df_subset_job_seekers.loc[:, 'DHE04_mod'] = dhe04_answers.eq(1).astype(int)
print(df_subset_job_seekers['DHE04_mod'].value_counts())

Number of missing values in DHE04: 0
DHE04
1.0     11009
2.0      1979
99.0      299
97.0       76
96.0       43
Name: count, dtype: int64
DHE04_mod
1    11009
0     2397
Name: count, dtype: int64


# IX.

<span style="color:red">**Awareness of experiences of violence among friends and family because of ethnic minority background in past 12 months**</span>  
**VV10_1** - Have you heard of anyone in your circle of family or friends being insulted or called names?  
**VV10_2** - Have you heard of anyone in your circle of family or friends being physically attacked?  
**VV10_1_mod, VV10_2_mod** - Binary  
0 - No  
1 - Yes  

Final variable:  
**VV10** - Discrete (0, 1, 2)

In [27]:
# Both VV10 items get the same treatment, so handle them in loops
vv10_items = ['VV10_1', 'VV10_2']

# Missing-value count per item
for item in vv10_items:
    print(f"Number of missing values in {item}:", df_subset_job_seekers[item].isnull().sum())

# Raw frequency tables side by side (one column per item)
VV10_values = pd.concat(
    [df_subset_job_seekers[item].value_counts().rename(item) for item in vv10_items],
    axis = 1,
)
display(VV10_values)

# <item>_mod: 1 where the answer code is 1, 0 for any other value
for item in vv10_items:
    df_subset_job_seekers.loc[:, f'{item}_mod'] = df_subset_job_seekers[item].isin([1]).astype('int64')

# Frequency tables of the recoded items side by side
VV10_mod_values = pd.concat(
    [df_subset_job_seekers[f'{item}_mod'].value_counts().rename(f'{item}_mod') for item in vv10_items],
    axis = 1,
)
display(VV10_mod_values)



Number of missing values in VV10_1: 0
Number of missing values in VV10_2: 0


Unnamed: 0,VV10_1,VV10_2
2.0,8833,11277
1.0,4194,1744
99.0,291,288
96.0,43,51
98.0,37,37
97.0,8,9


Unnamed: 0,VV10_1_mod,VV10_2_mod
0,9212,11662
1,4194,1744


In [28]:
# Combine the two binary VV10 indicators into one count (0, 1 or 2) per respondent
vv10_indicators = df_subset_job_seekers[['VV10_1_mod', 'VV10_2_mod']]
df_subset_job_seekers.loc[:, 'VV10'] = vv10_indicators.sum(axis = 'columns')
print(df_subset_job_seekers['VV10'].value_counts())


VV10
0    9052
1    2770
2    1584
Name: count, dtype: int64


# X.

<span style="color:red">**Awareness of support organisations in country**</span>  
**RA04** - Do you know of any organisations in [COUNTRY] that offer support or advice to people who have been discriminated against - for whatever reason?  
**RA04_mod** - Binary  
0 - No  
1 - Yes

In [29]:
# Inspect the raw RA04 answers first: missing count, then the frequency table
ra04_answers = df_subset_job_seekers['RA04']
print("Number of missing values in RA04:", ra04_answers.isna().sum())
print(ra04_answers.value_counts())

# RA04_mod is 1 only where the answer code equals 1; every other value becomes 0
df_subset_job_seekers.loc[:, 'RA04_mod'] = ra04_answers.eq(1).astype(int)
print(df_subset_job_seekers['RA04_mod'].value_counts())


Number of missing values in RA04: 0
RA04
2.0     10103
1.0      2903
99.0      348
97.0       39
96.0       13
Name: count, dtype: int64
RA04_mod
0    10503
1     2903
Name: count, dtype: int64


# XI.

<span style="color:red">**Interethnic relationships + neighbourhood**</span>  
**PB10_1, PB10_2, PB11**  
  
**PB10_1** - Do you have friends who are of another ethnic minority background than you?  
**PB10_1_mod** - Binary (0 - No, 1 - Yes)  

In [30]:
# Inspect the raw PB10_1 answers first: missing count, then the frequency table
pb10_1_answers = df_subset_job_seekers['PB10_1']
print("Number of missing values in PB10_1:", pb10_1_answers.isna().sum())
print(pb10_1_answers.value_counts())

# PB10_1_mod is 1 only where the answer code equals 1; every other value becomes 0
df_subset_job_seekers.loc[:, 'PB10_1_mod'] = pb10_1_answers.eq(1).astype(int)
print(df_subset_job_seekers['PB10_1_mod'].value_counts())

Number of missing values in PB10_1: 0
PB10_1
1.0     9798
2.0     3432
97.0      80
99.0      72
96.0      24
Name: count, dtype: int64
PB10_1_mod
1    9798
0    3608
Name: count, dtype: int64


**PB10_2** - Do you have friends who do not have a minority background?  
**PB10_2_mod** - Binary (0 - No, 1 - Yes)  
  

In [31]:
# Check for missing values in variable PB10_2
# (the missing-value count was previously not printed here, unlike the other recoding cells)
print("Number of missing values in PB10_2:", df_subset_job_seekers['PB10_2'].isnull().sum())
print(df_subset_job_seekers['PB10_2'].value_counts())  # Print frequency table

# Create new variable PB10_2_mod: 1 where PB10_2 equals 1, 0 for every other value
df_subset_job_seekers.loc[:, 'PB10_2_mod'] = np.where(df_subset_job_seekers['PB10_2'] == 1, 1, 0)
print(df_subset_job_seekers['PB10_2_mod'].value_counts())  # Print frequency table


PB10_2
1.0     10308
2.0      2873
97.0       98
99.0       97
96.0       30
Name: count, dtype: int64
PB10_2_mod
1    10308
0     3098
Name: count, dtype: int64


**PB11** - In the neighbourhood where you live, how many of the residents would you say are of the same [ethnic or immigrant/Roma/ethnic minority (tailored to target group)] background as you: all of the residents, most of them, some, or none of them?  
**PB11_mod** - Is the majority of your neighbourhood from the same ethnic background as you?  
Binary (0 - No, 1 - Yes)  

In [32]:
# Inspect the raw PB11 answers first: missing count, then the frequency table
pb11_answers = df_subset_job_seekers['PB11']
print("Number of missing values in PB11:", pb11_answers.isna().sum())
print(pb11_answers.value_counts())

# PB11_mod answers: is the majority of the neighbourhood from the same ethnic background?
# Codes 1 and 2 (all or most of the neighbourhood) become 1; every other value becomes 0
df_subset_job_seekers.loc[:, 'PB11_mod'] = pb11_answers.isin([1, 2]).astype('int64')
print(df_subset_job_seekers['PB11_mod'].value_counts())


Number of missing values in PB11: 0
PB11
3.0     6624
2.0     4482
1.0     1304
4.0      671
99.0     288
97.0      20
96.0      17
Name: count, dtype: int64
PB11_mod
0    7620
1    5786
Name: count, dtype: int64


# XII.

<span style="color:red">**Social transfers + poverty**</span>  
**arop** - At risk of poverty after social transfers  
**SI01_04** - Income from education allowance (grants, stipends)  <------------  
**SI01_05** - Pensions                                            <------------  
**SI01_06** - Unemployment benefits                               <------------  
**SI01_07** - Child benefits (including alimonies)                <------------  
**SI01_08** - Other social benefits (social assistance, rent support, donations from charity) <------------  

In [33]:
# All five social-transfer items get the same treatment, so handle them in loops
si01_items = ['SI01_04', 'SI01_05', 'SI01_06', 'SI01_07', 'SI01_08']

# Missing-value count per item
for item in si01_items:
    print(f"Number of missing values in {item}:", df_subset_job_seekers[item].isnull().sum())

# Raw frequency tables side by side (one column per item)
SI01_values = pd.concat(
    [df_subset_job_seekers[item].value_counts().rename(f'{item}_count') for item in si01_items],
    axis = 1,
)
display(SI01_values)

# <item>_mod (received social security transfer): 1 where the answer code is 1,
# 0 for any other value, including missing entries
for item in si01_items:
    df_subset_job_seekers.loc[:, f'{item}_mod'] = df_subset_job_seekers[item].isin([1]).astype('int64')

# Frequency tables of the recoded items side by side
SI01_mod_values = pd.concat(
    [df_subset_job_seekers[f'{item}_mod'].value_counts().rename(f'{item}_mod_count') for item in si01_items],
    axis = 1,
)
display(SI01_mod_values)

Number of missing values in SI01_04: 2
Number of missing values in SI01_05: 2
Number of missing values in SI01_06: 2
Number of missing values in SI01_07: 2
Number of missing values in SI01_08: 2


Unnamed: 0,SI01_04_count,SI01_05_count,SI01_06_count,SI01_07_count,SI01_08_count
2.0,11541,12161,10033,7633,8642
1.0,1759,1144,3251,5657,4583
96.0,54,59,57,48,64
99.0,38,37,55,58,93
97.0,12,3,8,8,22


Unnamed: 0,SI01_04_mod_count,SI01_05_mod_count,SI01_06_mod_count,SI01_07_mod_count,SI01_08_mod_count
0,11647,12262,10155,7749,8823
1,1759,1144,3251,5657,4583


In [34]:
# Combine the five binary transfer indicators into one count (0-5) per respondent
si01_indicators = df_subset_job_seekers[['SI01_04_mod', 'SI01_05_mod', 'SI01_06_mod', 'SI01_07_mod', 'SI01_08_mod']]
df_subset_job_seekers.loc[:, 'SI01'] = si01_indicators.sum(axis = 'columns')
print(df_subset_job_seekers['SI01'].value_counts())

SI01
1    4585
0    3903
2    3240
3    1404
4     253
5      21
Name: count, dtype: int64


**arop** - At risk of poverty after social transfers  

In [35]:
# Inspect arop: number of missing entries, then the frequency table of the observed values
arop_observed = df_subset_job_seekers['arop']
print("Number of missing values in arop:", arop_observed.isna().sum())
print(arop_observed.value_counts())


Number of missing values in arop: 2700
arop
1.0    7411
0.0    3295
Name: count, dtype: int64


In [36]:
# Simple imputation with mode:
# take the first entry of the mode as the most frequent arop value
most_frequent_arop = df_subset_job_seekers['arop'].mode().iloc[0]

# arop_mod_1 keeps observed values and fills missing ones with that most frequent value
arop_mode_filled = df_subset_job_seekers['arop'].fillna(most_frequent_arop)
df_subset_job_seekers.loc[:, 'arop_mod_1'] = arop_mode_filled
print(df_subset_job_seekers['arop_mod_1'].value_counts())


arop_mod_1
1.0    10111
0.0     3295
Name: count, dtype: int64


In [37]:
# Imputation using other variables:
# Taking the reference of Macedonia as the country with the lowest minimum wage at 360 euros per month (band 8 in our data)
def impute_arop(row, income_band_threshold=8):
    """Return 'arop' for one row, imputing it from 'SI03_3_H' when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``)
        Must provide the keys 'arop' and 'SI03_3_H'.
    income_band_threshold : int, default 8
        Lowest 'SI03_3_H' value that is imputed as 0. The default keeps the
        original hard-coded cut-off.

    Returns
    -------
    The original ``row['arop']`` when it is not missing. Otherwise 0 if
    ``row['SI03_3_H'] >= income_band_threshold``, else 1. A missing 'SI03_3_H'
    compares False against the threshold and therefore yields 1.
    """
    # Observed values are passed through untouched
    if not pd.isna(row['arop']):
        return row['arop']
    # Missing 'arop': decide from the 'SI03_3_H' value
    return 0 if row['SI03_3_H'] >= income_band_threshold else 1

# Run impute_arop over every row and store the result as arop_mod_2
arop_income_filled = df_subset_job_seekers.apply(impute_arop, axis='columns')
df_subset_job_seekers.loc[:, 'arop_mod_2'] = arop_income_filled
print(df_subset_job_seekers['arop_mod_2'].value_counts())


arop_mod_2
1.0    7612
0.0    5794
Name: count, dtype: int64


# XIII.

<span style="color:red">**Citizenship Status**</span>  
  
**Reasons for coming to country** (**asked only to non-EU/non-EFTA candidates**)  
**PR01_01, PR01_02, PR01_03, PR01_04, PR01_05, PR01_06, PR01_07, PR01_96, PR01_97, PR01_99** - Will be used for imputation only  

**Residence**  
**res_stat** - Residence and citizenship status  
**res_stat_mod_1** - simple imputation  
**res_stat_mod_2** - imputation using other variables

In [38]:
# Inspect res_stat: number of missing entries, then the frequency table of the observed codes
res_stat_observed = df_subset_job_seekers['res_stat']
print("Number of missing values in res_stat:", res_stat_observed.isna().sum())
print(res_stat_observed.value_counts())


Number of missing values in res_stat: 3987
res_stat
1.0     4196
5.0     2299
3.0     1417
99.0     629
6.0      366
2.0      288
4.0      224
Name: count, dtype: int64


In [39]:
# Simple imputation (0 - Non-EU citizen, 1 - EU citizen):
# res_stat codes 1 and 2 become 1; every other value, including missing entries, becomes 0
has_code_1_or_2 = df_subset_job_seekers['res_stat'].isin([1, 2])
df_subset_job_seekers.loc[:, 'res_stat_mod_1'] = has_code_1_or_2.astype('int64')
print(df_subset_job_seekers['res_stat_mod_1'].value_counts())

res_stat_mod_1
0    8922
1    4484
Name: count, dtype: int64


In [40]:
# Citizenship codes 1..27 are the ones counted as EU.
# NOTE(review): assumes HH07_3 codes 1-27 correspond to EU countries — confirm against the codebook.
eu_countries = list(range(1, 28))

def is_eu_citizen_2(citizenship):
    """Return 1 if `citizenship` is one of `eu_countries`, otherwise 0.

    A missing (NaN) value is not in the list and therefore also yields 0.
    """
    return 1 if citizenship in eu_countries else 0

# HH07_3 is read from df_full. Select exactly the job-seeker rows by index label so the
# function is not run over the whole survey, and so a label missing from df_full raises
# instead of being silently turned into NaN by index alignment on assignment.
citizenship_codes = df_full.loc[df_subset_job_seekers.index, 'HH07_3']
df_subset_job_seekers.loc[:, 'is_eu_citizen_2'] = citizenship_codes.apply(is_eu_citizen_2)
print(df_subset_job_seekers['is_eu_citizen_2'].value_counts())

is_eu_citizen_2
0    13135
1      271
Name: count, dtype: int64


In [41]:
# Imputation using other variables:
# is_eu_reason is 1 when at least one of the PR01 items is missing for the respondent,
# and 0 when all of them are filled in
pr01_items = ['PR01_01', 'PR01_02', 'PR01_03', 'PR01_04', 'PR01_05',
              'PR01_06', 'PR01_07', 'PR01_96', 'PR01_97', 'PR01_99']
any_pr01_missing = df_subset_job_seekers[pr01_items].isnull().any(axis='columns')
df_subset_job_seekers.loc[:, 'is_eu_reason'] = any_pr01_missing.astype(int)
df_subset_job_seekers['is_eu_reason'].value_counts()



is_eu_reason
0    6999
1    6407
Name: count, dtype: int64

In [42]:
# Fill missing res_stat values with the PR01-based indicator 'is_eu_reason'.
# This cell previously also assigned res_stat.fillna(is_eu_citizen_2) first, but that result was
# overwritten by the line below before being used, so it had no effect and has been removed.
# TODO(review): decide whether 'is_eu_citizen_2' is meant to take part in this imputation.
df_subset_job_seekers['res_stat_mod_2'] = df_subset_job_seekers['res_stat'].fillna(df_subset_job_seekers['is_eu_reason'])

# Show the remaining missing count (it was computed but never displayed before), then the frequency table
print("Number of missing values in res_stat_mod_2:", df_subset_job_seekers['res_stat_mod_2'].isnull().sum())
df_subset_job_seekers['res_stat_mod_2'].value_counts()

res_stat_mod_2
1.0     8183
5.0     2299
3.0     1417
99.0     629
6.0      366
2.0      288
4.0      224
Name: count, dtype: int64

In [43]:
# Collapse res_stat_mod_2 to a binary flag: codes 1 and 2 become 1, every other value becomes 0
has_code_1_or_2 = df_subset_job_seekers['res_stat_mod_2'].isin([1, 2])
df_subset_job_seekers.loc[:, 'res_stat_mod_2'] = has_code_1_or_2.astype('int64')
print(df_subset_job_seekers['res_stat_mod_2'].value_counts())

res_stat_mod_2
1.0    8471
0.0    4935
Name: count, dtype: int64


# XIV.

<span style="color:red">**Institutional experiences**</span>  
**DO27** - In the past 5 years in [COUNTRY] (or since you have been in [COUNTRY]), have you ever been stopped, searched, or questioned by the police?  

In [44]:
# Inspect the raw DO27 answers first: missing count, then the frequency table
do27_answers = df_subset_job_seekers['DO27']
print("Number of missing values in DO27:", do27_answers.isna().sum())
print(do27_answers.value_counts())

# DO27_mod is 1 only where the answer code equals 1; every other value becomes 0
df_subset_job_seekers.loc[:, 'DO27_mod'] = do27_answers.eq(1).astype(int)
print(df_subset_job_seekers['DO27_mod'].value_counts())


Number of missing values in DO27: 0
DO27
2.0     9039
1.0     4265
96.0      89
99.0       8
97.0       5
Name: count, dtype: int64
DO27_mod
0    9141
1    4265
Name: count, dtype: int64


In [45]:
# Missing-value count for every column of the working data frame (also lists all column names)
df_subset_job_seekers.isna().sum()

id                            0
country                       0
HH02                          0
HH03                          0
HH04                          0
IN02                          0
SI03_3_H                      0
SI03_2_H_stat                 0
PB01                          0
S01                           0
VH01a_1                       0
VH01a_2                       0
VH01a_3                       0
VH01a_4                       0
VH01a_5                       0
RA03_1                        0
RA03_2                        0
RA03_3                        0
SI06                          0
HLS02                         2
DEGURBA                       0
DHE01                         0
DHE02                         0
DHE03                         0
DHE04                         0
DHO01                         0
VV10_1                        0
VV10_2                        0
RA04                          0
PB10_1                        0
PB10_2                        0
PB11    

# **Export Clean Data:**

# Part 1: Clean Dataset 1 (with some transformation)

In [46]:
# Columns exported as clean dataset 1: the individual recoded items plus the
# *_mod_1 versions of arop and res_stat
clean_data_1_columns = [
    'discrimination_occurred', 'id', 'country', 'HH02', 'HH03', 'HH04', 'IN02', 'SI03_3_H', 'SI03_2_H_stat',
    'PB01_mod', 'S01_mod', 'VH01a_1_mod', 'VH01a_2_mod', 'VH01a_3_mod', 'VH01a_4_mod', 'VH01a_5_mod',
    'RA03_1_mod', 'RA03_2_mod', 'RA03_3_mod', 'SI06_mod', 'HLS02_mod', 'DHO01_mod', 'DEGURBA_mod',
    'DHE01_mod', 'DHE02_mod', 'DHE03_mod', 'DHE04_mod', 'VV10_1_mod', 'VV10_2_mod', 'RA04_mod',
    'PB10_1_mod', 'PB10_2_mod', 'PB11_mod', 'SI01_04_mod', 'SI01_05_mod', 'SI01_06_mod', 'SI01_07_mod',
    'SI01_08_mod', 'arop_mod_1', 'res_stat_mod_1', 'DO27_mod',
]
clean_data_1 = df_subset_job_seekers[clean_data_1_columns]

# Dimensions (rows, columns) of the selection
clean_data_1.shape

(13406, 41)

In [47]:
# Write clean dataset 1 to a CSV file, leaving out the DataFrame index
clean_data_1_path = 'clean_data_1.csv'
clean_data_1.to_csv(clean_data_1_path, index=False)

# Part 2: Clean Dataset 2 (with more transformation)

In [48]:
# Columns exported as clean dataset 2: the aggregated variables (VH01a, RA03, VV10, SI01)
# plus the *_mod_2 versions of arop and res_stat
clean_data_2_columns = [
    'discrimination_occurred', 'id', 'country', 'HH02', 'HH03', 'HH04', 'IN02', 'SI03_3_H', 'SI03_2_H_stat',
    'PB01_mod', 'S01_mod', 'VH01a', 'RA03', 'SI06_mod', 'HLS02_mod', 'DHO01_mod', 'DEGURBA_mod',
    'DHE01_mod', 'DHE02_mod', 'DHE03_mod', 'DHE04_mod', 'VV10', 'RA04_mod', 'PB10_1_mod', 'PB10_2_mod',
    'PB11_mod', 'SI01', 'arop_mod_2', 'res_stat_mod_2', 'DO27_mod',
]
clean_data_2 = df_subset_job_seekers[clean_data_2_columns]

# Dimensions (rows, columns) of the selection
clean_data_2.shape

(13406, 30)

In [49]:
# Write clean dataset 2 to a CSV file, leaving out the DataFrame index
clean_data_2_path = 'clean_data_2.csv'
clean_data_2.to_csv(clean_data_2_path, index=False)

# **Rename columns for more clarity:**

In [50]:
# Mapping from the survey variable codes (keys) to descriptive column names (values).
# NOTE(review): 'respodent_id' is misspelled and 'marital status' contains a space, unlike the
# other snake_case names. Both are kept unchanged here because renamed columns end up as file
# headers once the frame is exported — confirm with downstream users before correcting them.
data_2_new_names = {
    'id': 'respodent_id',
    'country': 'interview_country',
    'HH02': 'age',
    'HH03': 'gender',
    'HH04': 'job_situation',
    'IN02': 'ethnic_group',
    'SI03_3_H': 'household_income',
    'SI03_2_H_stat': 'income_imputed',
    'PB01_mod': 'religion',
    'S01_mod': 'marital status',
    'VH01a': 'past_discrim_exp',
    'RA03': 'extent_of_discrim',
    'SI06_mod': 'can_make_ends_meet',
    'HLS02_mod': 'rent_or_own_housing',
    'DHO01_mod': 'searched_for_housing',
    'DEGURBA_mod': 'urbanization_degree',
    'DHE01_mod': 'health_status',
    'DHE02_mod': 'has_illness',
    'DHE03_mod': 'health_impact_on_life',
    'DHE04_mod': 'insurance_coverage',
    'VV10': 'violence_exp_by_family',
    'RA04_mod': 'aware_of_support_org',
    'PB10_1_mod': 'friends_from_diff_ethnicity',
    'PB10_2_mod': 'friends_from_majority',
    'PB11_mod': 'neighbour_ethnicity',
    'SI01': 'social_transfers',
    'arop_mod_2': 'poverty_risk',
    'res_stat_mod_2': 'citizenship_status',
    'DO27_mod': 'stopped_by_police',
}

# Rebind clean_data_2 to the renamed frame instead of renaming in place
clean_data_2 = clean_data_2.rename(columns = data_2_new_names)
clean_data_2.head()

Unnamed: 0,discrimination_occurred,respodent_id,interview_country,age,gender,job_situation,ethnic_group,household_income,income_imputed,religion,...,insurance_coverage,violence_exp_by_family,aware_of_support_org,friends_from_diff_ethnicity,friends_from_majority,neighbour_ethnicity,social_transfers,poverty_risk,citizenship_status,stopped_by_police
0,1,165.0,1.0,37.0,1.0,1.0,8.0,15.0,0.0,0.0,...,1,1,0,1,1,0,2,1.0,0.0,1
12,0,317.0,1.0,30.0,1.0,1.0,8.0,19.0,0.0,2.0,...,1,1,1,0,1,1,3,0.0,0.0,1
16,0,321.0,1.0,35.0,1.0,1.0,8.0,17.0,0.0,0.0,...,1,0,1,1,1,0,0,0.0,0.0,1
17,1,322.0,1.0,42.0,1.0,4.0,8.0,14.0,0.0,2.0,...,1,2,1,1,1,0,2,1.0,0.0,1
22,0,327.0,1.0,39.0,1.0,5.0,8.0,10.0,0.0,1.0,...,1,0,0,1,0,0,0,1.0,0.0,1


In [51]:
# Missing-value count for every column of the renamed dataset
clean_data_2.isna().sum()

discrimination_occurred        0
respodent_id                   0
interview_country              0
age                            0
gender                         0
job_situation                  0
ethnic_group                   0
household_income               0
income_imputed                 0
religion                       0
marital status                 0
past_discrim_exp               0
extent_of_discrim              0
can_make_ends_meet             0
rent_or_own_housing            0
searched_for_housing           0
urbanization_degree            0
health_status                  0
has_illness                    0
health_impact_on_life          0
insurance_coverage             0
violence_exp_by_family         0
aware_of_support_org           0
friends_from_diff_ethnicity    0
friends_from_majority          0
neighbour_ethnicity            0
social_transfers               0
poverty_risk                   0
citizenship_status             0
stopped_by_police              0
dtype: int

In [52]:
# Dimensions (rows, columns) of clean_data_2
clean_data_2.shape

(13406, 30)

In [53]:
# Write the renamed dataset to its own CSV file, leaving out the DataFrame index
clean_data_2_renamed_path = 'clean_data_2_renamed.csv'
clean_data_2.to_csv(clean_data_2_renamed_path, index=False)