In [45]:
import pandas as pd
import os

### Slavery data PreProcessing

In [46]:
slavery_2016 = pd.read_csv("../datasets/slavery_2016.csv")[["Country", "POPULATION", "ESTIMATED NUMBER IN MODERN SLAVERY"]]
slavery_2018 = pd.read_csv("../datasets/slavery_2018.csv")[["Country ", "Est. prevalence of population in modern slavery (victims per 1,000 population)"]]
slavery_2023 = pd.read_csv("../datasets/slavery_2023.csv")[["Country", "Estimated prevalence of modern slavery per 1,000 population"]]

In [47]:
slavery_2016["Estimated prevalence of modern slavery per 1,000 population"] = (slavery_2016["ESTIMATED NUMBER IN MODERN SLAVERY"]/slavery_2016["POPULATION"])*1000
slavery_2016 = slavery_2016[["Country", "Estimated prevalence of modern slavery per 1,000 population"]]

In [48]:
slavery_2016.columns = ["Country", "2016"]
slavery_2018.columns = ["Country", "2018"]
slavery_2023.columns = ["Country", "2023"]

In [49]:
# Find common entries in the common column  
# NOTE: I HAVE GIVEN MORE IMPORTANCE TO COUNTRIES IN THE SLAVERY DATASET. (Since slavery data is the main comparing data here)
common_entries = set(slavery_2016['Country']).intersection(slavery_2018['Country']).intersection(slavery_2023['Country'])

In [50]:
# Filter the DataFrames to keep only rows with common entries
slavery_2016_filtered = slavery_2016[slavery_2016['Country'].isin(common_entries)]
slavery_2018_filtered = slavery_2018[slavery_2018['Country'].isin(common_entries)]
slavery_2023_filtered = slavery_2023[slavery_2023['Country'].isin(common_entries)]

In [51]:
# Merge the filtered DataFrames on the common column
temp1 = pd.merge(slavery_2016_filtered, slavery_2018_filtered, on='Country', how='inner')
slavery = pd.merge(temp1, slavery_2023_filtered, on='Country', how='inner')

In [52]:
slavery.head()

Unnamed: 0,Country,2016,2018,2023
0,Afghanistan,11.30138,22.2,13.0
1,Albania,2.959394,6.9,11.8
2,Algeria,6.259611,2.7,1.9
3,Angola,6.382384,7.2,4.1
4,Armenia,4.671968,5.3,8.9


In [53]:
slavery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  145 non-null    object 
 1   2016     145 non-null    float64
 2   2018     145 non-null    float64
 3   2023     139 non-null    float64
dtypes: float64(3), object(1)
memory usage: 4.7+ KB


In [54]:
slavery[slavery.isnull().any(axis=1)]

Unnamed: 0,Country,2016,2018,2023
20,Cape Verde,4.606526,4.1,
44,Iceland,1.208459,2.1,
64,Luxembourg,0.177936,1.5,
72,Montenegro,4.019293,5.9,
122,Barbados,2.112676,2.7,
142,Suriname,4.604052,2.3,


### Other parameters data preprocessing

In [55]:
corruption = pd.read_csv("../Datasets/corruption.csv")
democracy = pd.read_csv("../Datasets/democracy.csv")
gdppercapita = pd.read_csv("../Datasets/gdppercapita.csv")
lfpr = pd.read_csv("../Datasets/lfpr.csv")
migration = pd.read_csv("../Datasets/migration.csv")

In [56]:
lfpr = lfpr[lfpr['Type_LFPR'].isin(['Total'])]  # removing male, female entries
lfpr = lfpr[["Country", "Year", "LFPR"]]

In [57]:
lfpr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6079 entries, 2 to 18172
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  6079 non-null   object 
 1   Year     6079 non-null   int64  
 2   LFPR     6079 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 190.0+ KB


In [58]:
years = [2016, 2018, 2023]

In [59]:
corruption = corruption[corruption['Year'].isin(years)]
democracy = democracy[democracy['Year'].isin(years)]
gdppercapita = gdppercapita[gdppercapita['Year'].isin(years)]
lfpr = lfpr[lfpr['Year'].isin(years)]
migration = migration[migration['Year'].isin(years)]

In [60]:
# Find common entries in the common column
# common_entries = set(corruption['Country']).intersection(democracy['Country']).intersection(gdppercapita['Country']).intersection(lfpr['Country']).intersection(migration['Country'])

In [61]:
# Filter the DataFrames to keep only rows with common countries
corruption_filtered = corruption[corruption['Country'].isin(common_entries)]
democracy_filtered = democracy[democracy['Country'].isin(common_entries)]
gdppercapita_filtered = gdppercapita[gdppercapita['Country'].isin(common_entries)]
lfpr_filtered = lfpr[lfpr['Country'].isin(common_entries)]
migration_filtered = migration[migration['Country'].isin(common_entries)]

In [62]:
# Create a complete DataFrame with all combinations of Country and Year
all_combinations = pd.DataFrame([(country, year) for country in common_entries for year in years], columns=['Country', 'Year'])

In [63]:
all_combinations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  435 non-null    object
 1   Year     435 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.9+ KB


In [64]:
# Merge each DataFrame with the complete combinations to fill missing rows with NaN
corruption_complete = pd.merge(all_combinations, corruption_filtered, on=['Country', 'Year'], how='left')
democracy_complete = pd.merge(all_combinations, democracy_filtered, on=['Country', 'Year'], how='left')
gdppercapita_complete = pd.merge(all_combinations, gdppercapita_filtered, on=['Country', 'Year'], how='left')
lfpr_complete = pd.merge(all_combinations, lfpr_filtered, on=['Country', 'Year'], how='left')
migration_complete = pd.merge(all_combinations, migration_filtered, on=['Country', 'Year'], how='left')

In [65]:
# Merge the filtered DataFrames on the common column
temp2 = pd.merge(corruption_complete, democracy_complete, on=['Country', 'Year'], how='inner')
temp3 = pd.merge(temp2, gdppercapita_complete, on=['Country', 'Year'], how='inner')
temp4 = pd.merge(temp3, lfpr_complete, on=['Country', 'Year'], how='inner')
comparing_data = pd.merge(temp4, migration_complete, on=['Country', 'Year'], how='inner')

In [66]:
comparing_data.head()

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
0,Rwanda,2016,54.0,3.07,1908.552,62.294,-2605.0
1,Rwanda,2018,56.0,3.35,2049.6235,62.081,-10794.0
2,Rwanda,2023,53.0,3.3,,,-8999.0
3,Malawi,2016,31.0,5.55,1437.0421,69.281,-5498.0
4,Malawi,2018,32.0,5.49,1478.4248,68.716,-3706.0


In [67]:
comparing_data.head()

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
0,Rwanda,2016,54.0,3.07,1908.552,62.294,-2605.0
1,Rwanda,2018,56.0,3.35,2049.6235,62.081,-10794.0
2,Rwanda,2023,53.0,3.3,,,-8999.0
3,Malawi,2016,31.0,5.55,1437.0421,69.281,-5498.0
4,Malawi,2018,32.0,5.49,1478.4248,68.716,-3706.0


In [68]:
comparing_data[comparing_data.isnull().any(axis=1)]

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
2,Rwanda,2023,53.0,3.30,,,-8999.0
5,Malawi,2023,34.0,5.85,,,-6000.0
8,Belgium,2023,73.0,7.64,,,23999.0
11,Armenia,2023,47.0,5.42,,,-5000.0
14,Morocco,2023,38.0,5.04,,,-39998.0
...,...,...,...,...,...,...,...
430,Kyrgyzstan,2018,29.0,5.11,5133.152,,
431,Kyrgyzstan,2023,26.0,3.70,,,
432,Cuba,2016,47.0,3.46,,66.418,-14454.0
433,Cuba,2018,47.0,3.00,,67.063,-14219.0


### lfpr and slavery preprocess

In [69]:
lfpr = pd.read_csv("../Datasets/lfpr.csv")

In [70]:
lfpr = lfpr[lfpr['Type_LFPR'].isin(['Total'])]  # removing male, female entries
lfpr = lfpr[["Country", "Year", "LFPR"]]

In [71]:
lfpr = lfpr[lfpr['Year'].isin([2016, 2018])]

In [72]:
common_lfpr_slavery = set(slavery['Country']).intersection(lfpr['Country'])

In [73]:
# Filter the DataFrames to keep only rows with common entries
slavery_filtered = slavery[slavery['Country'].isin(common_lfpr_slavery)]
lfpr_filtered = lfpr[lfpr['Country'].isin(common_lfpr_slavery)]

In [74]:
# Pivot the DataFrame
lfpr_pivot = lfpr_filtered.pivot(index='Country', columns='Year', values='LFPR').reset_index()

# Rename the columns for clarity
lfpr_pivot.columns.name = None  # Remove the columns' name
lfpr_pivot.columns = ['Country', 2016, 2018]

In [75]:
slavery_reduced = slavery_filtered[["Country", "2016", "2018"]]
slavery_reduced.columns = ["Country", 2016, 2018]

In [76]:
slavery_reduced = slavery_reduced.sort_values(by='Country')
lfpr_pivot = lfpr_pivot.sort_values(by='Country')

In [77]:
slavery_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 138 entries, 0 to 120
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 4.3+ KB


In [78]:
lfpr_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.4+ KB


In [79]:
# Standardize Data
from sklearn.preprocessing import StandardScaler

In [80]:
value_columns = [2016, 2018]
scaler = StandardScaler()

# Z-score normalization
lfpr_pivot.loc[:, value_columns] = scaler.fit_transform(lfpr_pivot[value_columns])
slavery_reduced.loc[:, value_columns] = scaler.fit_transform(slavery_reduced[value_columns])

In [81]:
slavery_reduced.head()

Unnamed: 0,Country,2016,2018
0,Afghanistan,1.425598,1.78606
1,Albania,-0.479608,0.101637
2,Algeria,0.274121,-0.360754
3,Angola,0.302161,0.134665
121,Argentina,-0.232309,-0.514885


In [82]:
lfpr_pivot.head()

Unnamed: 0,Country,2016,2018
0,Afghanistan,-1.692985,-1.829335
1,Albania,0.005041,0.172467
2,Algeria,-2.010203,-1.972802
3,Angola,0.899857,0.811514
4,Argentina,-0.021815,0.024332


### Finding correlation between lfpr and slavery

In [83]:
slavLfpr_2016_corr = slavery_reduced[2016].corr(lfpr_pivot[2016])

In [84]:
slavLfpr_2016_corr

np.float64(0.01981696460095905)

In [85]:
slavLfpr_2018_corr = slavery_reduced[2018].corr(lfpr_pivot[2018])

In [86]:
slavLfpr_2018_corr

np.float64(0.037305512020338306)

Regression model