In [None]:
# pip install xgboost

In [20]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import pyreadr
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, classification_report

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

## 1. Data Preparation

In [21]:
# Read the database
result = pyreadr.read_r('../01_data/minorities_discrimination_survey.RData')  
print(result.keys())  

odict_keys(['min_dis'])


In [4]:
# Activate automatic conversion from R to pandas
df1 = result["min_dis"] # extract the pandas data frame for object df1

In [5]:
df1.head()

Unnamed: 0,za_nr,version,doi,country,DEGURBA,ALTURBA,C1,C2,C3,typint,...,dis5_4_other,dis12overall10,dis5overall9,redisOverall,res_stat,sec_res,SI03_2_H_stat,SI03_3_H,pweightadj,hweightadj
0,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,1.0,1.0,,3.0,1.0,0.0,15.0,0.075758,0.115287
1,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,2.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,15.0,,0.115287
2,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,1.0,1.0,,5.0,2.0,0.0,16.0,0.075758,0.057643
3,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,16.0,,0.057643
4,6703.0,1.0.0 (2020-07-29),doi:10.4232/1.13514,1.0,1.0,,1.0,3.0,2.0,1.0,...,0.0,,0.0,,99.0,2.0,0.0,16.0,,0.057643


In [10]:
# Count missing values in the subset dataframe
df1.isnull().sum()

za_nr                      0
version                    0
doi                        0
country                    0
DEGURBA                    0
ALTURBA                43030
C1                     10860
C2                     11767
C3                     10860
typint                     0
IN02                       0
IN03                       0
numints                    0
IN05                   48074
IN06                   76823
IN07                   36764
IN08                   36764
IN10                   74831
resp                       0
Generation             49322
HH01                       0
HH02                       0
HH02_1                 77538
HH03                       0
HH04                   22450
HH05                   55969
HH06                   54841
HH07_1                 60088
HH07_2                 45822
HH07_3                 68030
HH07_4                 77574
HH08                   61109
HH09_1                 55477
HH10                   55477
HH10a         

## Subset

In [12]:
# Filter the subset to include only people seeking for a job (EU05=1 or EU07=1)
df_subset_job_seekers = df1[(df1['EU05'] == 1) | (df1['EU07'] == 1)]

In [23]:
# Count the number of observations before filtering
before_filter_count = df1.shape[0]

# Count the number of observations after filtering
after_filter_count = df_subset_job_seekers.shape[0]

# Print the counts
print("Number of observations before filtering:", before_filter_count)
print("Number of observations after filtering for job seekers:", after_filter_count)

round((df_subset_job_seekers.shape[0] / df1.shape[0])*100, 1)

Number of observations before filtering: 77656
Number of observations after filtering for job seekers: 13406


17.3

## Target variable

In [54]:
# Define the target variable
discrimination_columns = ['EUD01_01', 'EUD01_02', 'EUD01_03', 'EUD01_04', 'EUD01_05', 'EUD01_06', 
                          'EUD01_07', 'EUD01_08']

# Define a function to count the discrimination level
def count_discrimination_level(row):
    discrimination_count = 0
    for col in discrimination_columns:
        # Check if discrimination occurred in each column 
        if row[col] == 1:
            discrimination_count += 1
    return discrimination_count

# Create a copy of the DataFrame
df_subset_copy = df_subset_job_seekers.copy()

# Create the discrimination level variable
df_subset_copy['discrimination_level'] = df_subset_copy.apply(count_discrimination_level, axis=1)

# Display the distribution of the discrimination level
print(df_subset_copy['discrimination_level'].value_counts().sort_index())

discrimination_level
0    8236
1    3021
2    1636
3     419
4      78
5      16
Name: count, dtype: int64


## Split the data set

In [56]:
# Select variables of interest 
db_filtered = [
    'country',
    'country2',
    'DEGURBA',
    'IN02', # Target group
    'IN05', # COUNTRY OF BIRTH (immigrants and their descendants)
    'IN06', # COUNTRY OF BIRTH (Recent immigrants)
    'Generation', # 1st or 2nd generation migrant
    'HH02', # Age
    'HH03', # Gender
    'HH04', # Current situation (Job)
    'HH05', # Is that full-time, part-time or just occasionally?
    'HH06', # Year arrived to live in country
    'HH09_1', # Highest level of education completed? RECODE
    'HH10', # Years spent in education?
    'HH10a', # Currently attending school or vocational training?
    'HH10c', # What is the main reason why you did not continue at school?
    'HLS02', # Do you own or rent this accommodation?
    'RA03_1', # Prevalance of discrimination on the basis of skin colour in [country]
    'RA03_2', # Prevalance of discrimination on the basis of ethnic origin or immigrant background in [country]
    'RA03_3', # Prevalance of discrimination on the basis of religion or religious beliefs in [country]
    'RA04', # Do you know of any organisations in [COUNTRY] that offer support or advice to people who have been discriminated against - for whatever reason?
    'EU01', # In what year was the last time you were in work?
    'EU02', # What was your last job or occupation?
    'EU03', # Are you currently registered as unemployed? 
    'EU05', # Are you currently looking for work?
    'EU07', # Have you ever looked for work in the past 5 years in  [COUNTRY] (or since you have been in [country])?
    'EUD01_01', ### Discriminated when looking for work in the past 5 years: Skin colour 
    'EUD01_02', ### Discriminated when looking for work in the past 5 years:  [ethnic origin or immigrant background / ethnic origin (tailored to target group)] 
    'EUD01_03', ### Discriminated when looking for work in the past 5 years: Religion or religious beliefs 
    'EUD01_04', ### Discriminated when looking for work in the past 5 years: Age (such as being too young or too old) 
    'EUD01_05', ### Discriminated when looking for work in the past 5 years: Sex/gender (such as being a man or a woman) 
    'EUD01_06', ### Discriminated when looking for work in the past 5 years: Disability 
    'EUD01_07', ### Discriminated when looking for work in the past 5 years: Sexual orientation (such as being gay lesbian or bisexual) 
    'EUD01_08', ### Discriminated when looking for work in the past 5 years: Other (please specify) 
    'EUD01_09', ### Discriminated when looking for work in the past 5 years: I haven't felt discriminated against on any ground when looking for work in the past 5 years
    'EUD01_96', ### Discriminated when looking for work in the past 5 years: Refused  
    'EUD01_97', ### Discriminated when looking for work in the past 5 years: Doesn't understand the question 
    'EUD01_99', ### Discriminated when looking for work in the past 5 years: Don't know  
    'EUD06_01', ### Report or complaint made to Police  
    'EA01', ### What is your current job or occupation?
    'EA02', ### "What kind of employment contract do you have in your main job? Is it a…?
    'EA04', ### To what extent does your current job or the work you do correspond to your level of education?
    'EA05', ### How many hours per week do you work in your (main) job? (allow from 0 to 999)
    'EAD01_01', ### Discriminated at work in the past 5 years: Skin colour 
    'EAD01_02', ### Discriminated at work in the past 5 years: [ethnic origin or immigrant background / ethnic origin (tailored to target group)] 
    'EAD01_03', ### Discriminated at work in the past 5 years: Religion or religious beliefs 
    'EAD01_04', ### Discriminated at work in the past 5 years: Age 
    'EAD01_05', ### Discriminated at work in the past 5 years: Sex/gender (such as being a man or a woman) 
    'EAD01_06', ### Discriminated at work in the past 5 years: Disability
    'EAD01_07', ### Discriminated at work in the past 5 years: Sexual orientation  (such as being gay, lesbian or bisexual)
    'EAD01_08', ### Discriminated at work in the past 5 years: Other (please specify)
    'EAD01_09', ### Discriminated at work in the past 5 years: I haven't felt discriminated against on any ground when at work in the past 5 years 
    'EAD01_96', ### Discriminated at work in the past 5 years: Refused  
    'EAD01_97', ### Discriminated at work in the past 5 years: Doesn't understand the question
    'EAD01_99', ### Discriminated at work in the past 5 years: Don't know
    'res_stat', ### Residence and citizenship status
    'id', # unique identification number for a respondent
    'arop', # At risk of poverty after social transfers
    'PB06_01', # What language do you mainly speak at home? Language 1
    'PB04', # Do you usually wear a headscarf or niqab outside the house? 
    'PB01', # What is your religion?

]

In [58]:
# Subset the dataframe with selected variables
df_subset_copy = df_subset_copy[db_filtered]

# Perform exploratory data analysis on the subset dataframe

df_subset_copy.describe()

Unnamed: 0,country,country2,DEGURBA,IN02,IN05,IN06,Generation,HH02,HH03,HH04,...,EAD01_08,EAD01_09,EAD01_96,EAD01_97,EAD01_99,res_stat,id,arop,PB04,PB01
count,13406.0,13406.0,13406.0,13406.0,8630.0,439.0,9069.0,13406.0,13406.0,13406.0,...,10555.0,10555.0,10555.0,10555.0,10555.0,9419.0,13406.0,10706.0,2342.0,13406.0
mean,14.800462,14.025138,1.563703,5.229077,109.799073,149.225513,1.22825,34.876399,1.420558,3.414068,...,0.015348,0.682994,0.031454,0.002084,0.010422,9.117847,33630.028495,0.692229,3.489752,4.311279
std,8.272275,8.042699,0.77566,2.167088,71.731662,73.269701,0.419728,11.58427,0.493667,4.416784,...,0.122939,0.465332,0.17455,0.045609,0.101558,24.108913,21088.958303,0.461593,11.78541,16.596859
min,1.0,1.0,1.0,1.0,1.0,29.0,1.0,16.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
25%,9.0,7.0,1.0,3.0,31.0,51.0,1.0,26.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,17049.0,0.0,1.0,1.0
50%,15.0,13.0,1.0,5.0,143.0,191.0,1.0,33.0,1.0,4.0,...,0.0,1.0,0.0,0.0,0.0,3.0,30337.5,1.0,3.0,2.0
75%,23.0,21.0,2.0,8.0,181.0,212.0,1.0,43.0,2.0,4.0,...,0.0,1.0,0.0,0.0,0.0,5.0,47817.5,1.0,3.0,2.0
max,28.0,28.0,3.0,8.0,226.0,233.0,2.0,83.0,2.0,99.0,...,1.0,1.0,1.0,1.0,1.0,99.0,77655.0,1.0,99.0,99.0


In [59]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Assuming df_subset contains your dataset with 'discrimination_level' as the target variable

# Separate features (X) and target variable (y)
X = df_subset_copy.drop('discrimination_level', axis=1)
y = df_subset_copy['discrimination_level']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numerical and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

KeyError: "['discrimination_level'] not found in axis"