In [1]:
import pandas as pd
import os

### Slavery data PreProcessing

In [2]:
slavery_2016 = pd.read_csv("../Datasets/slavery_2016.csv")[["Country", "POPULATION", "ESTIMATED NUMBER IN MODERN SLAVERY"]]
slavery_2018 = pd.read_csv("../Datasets/slavery_2018.csv")[["Country ", "Est. prevalence of population in modern slavery (victims per 1,000 population)"]]
slavery_2023 = pd.read_csv("../Datasets/slavery_2023.csv")[["Country", "Estimated prevalence of modern slavery per 1,000 population"]]

In [3]:
slavery_2016["Estimated prevalence of modern slavery per 1,000 population"] = (slavery_2016["ESTIMATED NUMBER IN MODERN SLAVERY"]/slavery_2016["POPULATION"])*1000
slavery_2016 = slavery_2016[["Country", "Estimated prevalence of modern slavery per 1,000 population"]]

In [4]:
slavery_2016.columns = ["Country", "2016"]
slavery_2018.columns = ["Country", "2018"]
slavery_2023.columns = ["Country", "2023"]

In [5]:
# Find common entries in the common column  
# NOTE: I HAVE GIVEN MORE IMPORTANCE TO COUNTRIES IN THE SALVERY DATASET. (Since slavery data is the main comparing data here)
common_entries = set(slavery_2016['Country']).intersection(slavery_2018['Country']).intersection(slavery_2023['Country'])

In [6]:
# Filter the DataFrames to keep only rows with common entries
slavery_2016_filtered = slavery_2016[slavery_2016['Country'].isin(common_entries)]
slavery_2018_filtered = slavery_2018[slavery_2018['Country'].isin(common_entries)]
slavery_2023_filtered = slavery_2023[slavery_2023['Country'].isin(common_entries)]

In [7]:
# Merge the filtered DataFrames on the common column
temp1 = pd.merge(slavery_2016_filtered, slavery_2018_filtered, on='Country', how='inner')
slavery = pd.merge(temp1, slavery_2023_filtered, on='Country', how='inner')

In [8]:
slavery.head()

Unnamed: 0,Country,2016,2018,2023
0,Afghanistan,11.30138,22.2,13.0
1,Albania,2.959394,6.9,11.8
2,Algeria,6.259611,2.7,1.9
3,Angola,6.382384,7.2,4.1
4,Armenia,4.671968,5.3,8.9


In [9]:
slavery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  145 non-null    object 
 1   2016     145 non-null    float64
 2   2018     145 non-null    float64
 3   2023     139 non-null    float64
dtypes: float64(3), object(1)
memory usage: 4.7+ KB


In [10]:
slavery[slavery.isnull().any(axis=1)]

Unnamed: 0,Country,2016,2018,2023
20,Cape Verde,4.606526,4.1,
44,Iceland,1.208459,2.1,
64,Luxembourg,0.177936,1.5,
72,Montenegro,4.019293,5.9,
122,Barbados,2.112676,2.7,
142,Suriname,4.604052,2.3,


### Other parameters data preprocessing

In [11]:
corruption = pd.read_csv("../Datasets/corruption.csv")
democracy = pd.read_csv("../Datasets/democracy.csv")
gdppercapita = pd.read_csv("../Datasets/gdppercapita.csv")
lfpr = pd.read_csv("../Datasets/lfpr.csv")
migration = pd.read_csv("../Datasets/migration.csv")

In [12]:
lfpr = lfpr[lfpr['Type_LFPR'].isin(['Total'])]  # removing male, female entries
lfpr = lfpr[["Country", "Year", "LFPR"]]

In [13]:
lfpr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6079 entries, 2 to 18172
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  6079 non-null   object 
 1   Year     6079 non-null   int64  
 2   LFPR     6079 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 190.0+ KB


In [14]:
years = [2016, 2018, 2023]

In [15]:
corruption = corruption[corruption['Year'].isin(years)]
democracy = democracy[democracy['Year'].isin(years)]
gdppercapita = gdppercapita[gdppercapita['Year'].isin(years)]
lfpr = lfpr[lfpr['Year'].isin(years)]
migration = migration[migration['Year'].isin(years)]

In [16]:
# Find common entries in the common column
# common_entries = set(corruption['Country']).intersection(democracy['Country']).intersection(gdppercapita['Country']).intersection(lfpr['Country']).intersection(migration['Country'])

In [17]:
# Filter the DataFrames to keep only rows with common countries
corruption_filtered = corruption[corruption['Country'].isin(common_entries)]
democracy_filtered = democracy[democracy['Country'].isin(common_entries)]
gdppercapita_filtered = gdppercapita[gdppercapita['Country'].isin(common_entries)]
lfpr_filtered = lfpr[lfpr['Country'].isin(common_entries)]
migration_filtered = migration[migration['Country'].isin(common_entries)]

In [18]:
# Create a complete DataFrame with all combinations of Country and Year
all_combinations = pd.DataFrame([(country, year) for country in common_entries for year in years], columns=['Country', 'Year'])

In [19]:
all_combinations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  435 non-null    object
 1   Year     435 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.9+ KB


In [20]:
# Merge each DataFrame with the complete combinations to fill missing rows with NaN
corruption_complete = pd.merge(all_combinations, corruption_filtered, on=['Country', 'Year'], how='left')
democracy_complete = pd.merge(all_combinations, democracy_filtered, on=['Country', 'Year'], how='left')
gdppercapita_complete = pd.merge(all_combinations, gdppercapita_filtered, on=['Country', 'Year'], how='left')
lfpr_complete = pd.merge(all_combinations, lfpr_filtered, on=['Country', 'Year'], how='left')
migration_complete = pd.merge(all_combinations, migration_filtered, on=['Country', 'Year'], how='left')

In [21]:
# Merge the filtered DataFrames on the common column
temp2 = pd.merge(corruption_complete, democracy_complete, on=['Country', 'Year'], how='inner')
temp3 = pd.merge(temp2, gdppercapita_complete, on=['Country', 'Year'], how='inner')
temp4 = pd.merge(temp3, lfpr_complete, on=['Country', 'Year'], how='inner')
comparing_data = pd.merge(temp4, migration_complete, on=['Country', 'Year'], how='inner')

In [22]:
comparing_data.head()

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
0,United Arab Emirates,2016,66.0,2.75,71244.586,81.627,-13520.0
1,United Arab Emirates,2018,70.0,2.76,71550.555,82.864,-8582.0
2,United Arab Emirates,2023,68.0,3.01,,,0.0
3,South Africa,2016,45.0,7.41,13844.276,61.764,-866101.0
4,South Africa,2018,43.0,7.24,13995.0625,62.202,27265.0


In [23]:
comparing_data.head()

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
0,United Arab Emirates,2016,66.0,2.75,71244.586,81.627,-13520.0
1,United Arab Emirates,2018,70.0,2.76,71550.555,82.864,-8582.0
2,United Arab Emirates,2023,68.0,3.01,,,0.0
3,South Africa,2016,45.0,7.41,13844.276,61.764,-866101.0
4,South Africa,2018,43.0,7.24,13995.0625,62.202,27265.0


In [24]:
comparing_data[comparing_data.isnull().any(axis=1)]

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
2,United Arab Emirates,2023,68.0,3.01,,,0.0
5,South Africa,2023,41.0,7.05,,,58496.0
8,Japan,2023,73.0,8.40,,,99994.0
11,France,2023,71.0,8.07,,,67761.0
12,Kosovo,2016,36.0,,10031.048,,-31383.0
...,...,...,...,...,...,...,...
422,Togo,2023,31.0,2.99,,,-2000.0
425,Libya,2023,18.0,,,,-2000.0
428,Nigeria,2023,25.0,4.23,,,-59996.0
431,Poland,2023,54.0,7.18,,,-910475.0


### lfpr (male, female, both) and slavery preprocess

In [25]:
lfpr = pd.read_csv("../Datasets/lfpr.csv")

In [26]:
lfpr.head()

Unnamed: 0,Country,Year,LFPR,Type_LFPR
0,Afghanistan,1991,16.101,Female
1,Afghanistan,1991,80.99,Male
2,Afghanistan,1991,48.039,Total
3,Afghanistan,1992,16.17,Female
4,Afghanistan,1992,80.94,Male


In [27]:
lfpr_total = lfpr[lfpr['Type_LFPR'].isin(['Total'])]  # removing male, female entries
lfpr_male = lfpr[lfpr['Type_LFPR'].isin(['Male'])]
lfpr_female = lfpr[lfpr['Type_LFPR'].isin(['Female'])]

In [28]:
lfpr_total = lfpr_total[["Country", "Year", "LFPR"]]
lfpr_male = lfpr_male[["Country", "Year", "LFPR"]]
lfpr_female = lfpr_female[["Country", "Year", "LFPR"]]

In [29]:
lfpr_total = lfpr_total[lfpr_total['Year'].isin([2016, 2018])]
lfpr_male = lfpr_male[lfpr_male['Year'].isin([2016, 2018])]
lfpr_female = lfpr_female[lfpr_female['Year'].isin([2016, 2018])]

In [30]:
common_lfpr_slavery = set(slavery['Country']).intersection(lfpr_total['Country'])

In [31]:
# Filter the DataFrames to keep only rows with common entries
slavery_filtered = slavery[slavery['Country'].isin(common_lfpr_slavery)]

lfpr_total_filtered = lfpr_total[lfpr_total['Country'].isin(common_lfpr_slavery)]
lfpr_male_filtered = lfpr_male[lfpr_male['Country'].isin(common_lfpr_slavery)]
lfpr_female_filtered = lfpr_female[lfpr_female['Country'].isin(common_lfpr_slavery)]

In [32]:
# Pivot the DataFrame
lfpr_total_pivot = lfpr_total_filtered.pivot(index='Country', columns='Year', values='LFPR').reset_index()
lfpr_male_pivot = lfpr_male_filtered.pivot(index='Country', columns='Year', values='LFPR').reset_index()
lfpr_female_pivot = lfpr_female_filtered.pivot(index='Country', columns='Year', values='LFPR').reset_index()

# Rename the columns for clarity
lfpr_total_pivot.columns.name = None
lfpr_total_pivot.columns = ['Country', 2016, 2018]

lfpr_male_pivot.columns.name = None
lfpr_male_pivot.columns = ['Country', 2016, 2018]

lfpr_female_pivot.columns.name = None
lfpr_female_pivot.columns = ['Country', 2016, 2018]

In [33]:
slavery_reduced = slavery_filtered[["Country", "2016", "2018"]]
slavery_reduced.columns = ["Country", 2016, 2018]

In [34]:
slavery_reduced = slavery_reduced.sort_values(by='Country')

lfpr_total_pivot = lfpr_total_pivot.sort_values(by='Country')
lfpr_male_pivot = lfpr_male_pivot.sort_values(by='Country')
lfpr_female_pivot = lfpr_female_pivot.sort_values(by='Country')

In [35]:
slavery_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 138 entries, 0 to 120
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 4.3+ KB


In [36]:
lfpr_total_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.4+ KB


In [37]:
lfpr_male_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.4+ KB


In [38]:
lfpr_female_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.4+ KB


In [39]:
# Standardize Data
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [40]:
value_columns = [2016, 2018]
scaler = StandardScaler()

# Z-score normalization
lfpr_total_pivot.loc[:, value_columns] = scaler.fit_transform(lfpr_total_pivot[value_columns])
lfpr_male_pivot.loc[:, value_columns] = scaler.fit_transform(lfpr_male_pivot[value_columns])
lfpr_female_pivot.loc[:, value_columns] = scaler.fit_transform(lfpr_female_pivot[value_columns])

slavery_reduced.loc[:, value_columns] = scaler.fit_transform(slavery_reduced[value_columns])

In [41]:
slavery_reduced.head()

Unnamed: 0,Country,2016,2018
0,Afghanistan,1.425598,1.78606
1,Albania,-0.479608,0.101637
2,Algeria,0.274121,-0.360754
3,Angola,0.302161,0.134665
121,Argentina,-0.232309,-0.514885


In [42]:
lfpr_total_pivot.head()

Unnamed: 0,Country,2016,2018
0,Afghanistan,-1.692985,-1.829335
1,Albania,0.005041,0.172467
2,Algeria,-2.010203,-1.972802
3,Angola,0.899857,0.811514
4,Argentina,-0.021815,0.024332


In [43]:
slavery_reduced.describe()

Unnamed: 0,2016,2018
count,138.0,138.0
mean,-1.415937e-16,5.792468000000001e-17
std,1.003643,1.003643
min,-1.11628,-0.6249776
25%,-0.4992309,-0.4240577
50%,-0.153779,-0.2341472
75%,0.3023753,0.09888431
max,7.918369,9.580649


In [44]:
lfpr_total_pivot.describe()

Unnamed: 0,2016,2018
count,138.0,138.0
mean,-1.0169e-15,6.307354e-16
std,1.003643,1.003643
min,-3.049879,-3.019356
25%,-0.5025409,-0.5712169
50%,0.08579451,0.1581477
75%,0.755458,0.7120841
max,2.055512,1.895326


### Finding correlation between lfpr and slavery

In [45]:
slavLfpr_total_2016_corr = slavery_reduced[2016].corr(lfpr_total_pivot[2016])
slavLfpr_male_2016_corr = slavery_reduced[2016].corr(lfpr_male_pivot[2016])
slavLfpr_female_2016_corr = slavery_reduced[2016].corr(lfpr_female_pivot[2016])

In [46]:
print("slavLfpr_total_2016_corr: ", slavLfpr_total_2016_corr)
print("slavLfpr_male_2016_corr: ", slavLfpr_male_2016_corr)
print("slavLfpr_female_2016_corr: ", slavLfpr_female_2016_corr)

slavLfpr_total_2016_corr:  0.01981696460095905
slavLfpr_male_2016_corr:  -0.022584362727162906
slavLfpr_female_2016_corr:  0.046248767279351725


In [47]:
slavLfpr_total_2018_corr = slavery_reduced[2018].corr(lfpr_total_pivot[2018])
slavLfpr_male_2018_corr = slavery_reduced[2018].corr(lfpr_male_pivot[2018])
slavLfpr_female_2018_corr = slavery_reduced[2018].corr(lfpr_female_pivot[2018])

In [48]:
print("slavLfpr_total_2016_corr: ", slavLfpr_total_2018_corr)
print("slavLfpr_male_2016_corr: ", slavLfpr_male_2018_corr)
print("slavLfpr_female_2016_corr: ", slavLfpr_female_2018_corr)

slavLfpr_total_2016_corr:  0.037305512020338306
slavLfpr_male_2016_corr:  0.005393190619142592
slavLfpr_female_2016_corr:  0.05980669007914195


### gdppercapita and slavery preprocess

In [49]:
gdppercapita = pd.read_csv("Datasets/gdppercapita.csv")

In [50]:
gdppercapita.head()

Unnamed: 0,Country,Year,GDP per capita
0,Afghanistan,2002.0,1280.4631
1,Afghanistan,2003.0,1292.3335
2,Afghanistan,2004.0,1260.0605
3,Afghanistan,2005.0,1352.3207
4,Afghanistan,2006.0,1366.9932


In [51]:
gdppercapita = gdppercapita[gdppercapita['Year'].isin([2016, 2018])]

In [52]:
common_gdppercapita_slavery = set(slavery['Country']).intersection(gdppercapita['Country'])

In [53]:
diff_set1 = common_lfpr_slavery - common_gdppercapita_slavery
diff_set2 = common_gdppercapita_slavery - common_lfpr_slavery
print("Countries in set1 but not in set2:", diff_set1)
print("Countries in set2 but not in set1:", diff_set2)

Countries in set1 but not in set2: {'South Sudan', 'Timor-Leste', 'Eritrea', 'Cuba'}
Countries in set2 but not in set1: {'Kyrgyzstan', 'Gambia', 'Slovakia', 'Egypt', 'Kosovo', 'Cape Verde'}


In [54]:
# Filter the DataFrames to keep only rows with common entries
slavery_filtered = slavery[slavery['Country'].isin(common_gdppercapita_slavery)]

gdppercapita_filtered = gdppercapita[gdppercapita['Country'].isin(common_gdppercapita_slavery)]

In [55]:
# Pivot the DataFrame
gdppercapita_pivot = gdppercapita_filtered.pivot(index='Country', columns='Year', values='GDP per capita').reset_index()

In [56]:
slavery_reduced = slavery_filtered[["Country", "2016", "2018"]]

slavery_reduced.columns = ["Country", 2016, 2018]
gdppercapita_pivot.columns = ["Country", 2016, 2018]

slavery_reduced = slavery_reduced.sort_values(by='Country')
gdppercapita_pivot = gdppercapita_pivot.sort_values(by='Country')

In [57]:
gdppercapita_pivot.describe()

Unnamed: 0,2016,2018
count,140.0,140.0
mean,20634.994686,21413.408378
std,21319.948995,21759.560787
min,764.3366,740.44824
25%,4643.7817,5049.1466
50%,13043.3545,14102.32775
75%,28704.707,31420.1795
max,116283.7,114164.47


In [58]:
slavery_reduced.describe()

Unnamed: 0,2016,2018
count,140.0,140.0
mean,5.0252,5.187143
std,4.344604,5.037555
min,0.171711,0.3
25%,2.904559,2.175
50%,4.542213,3.9
75%,6.382604,6.475
max,39.73012,40.0


In [59]:
value_columns = [2016, 2018]
scaler = StandardScaler()

# Z-score normalization
gdppercapita_pivot.loc[:, value_columns] = scaler.fit_transform(gdppercapita_pivot[value_columns])
slavery_reduced.loc[:, value_columns] = scaler.fit_transform(slavery_reduced[value_columns])

### Finding correlation between gdppercapita and slavery

In [60]:
slavGdp_2016_corr = slavery_reduced[2016].corr(gdppercapita_pivot[2016])
slavGdp_2018_corr = slavery_reduced[2018].corr(gdppercapita_pivot[2018])

print("slavGdp_2016_corr: ", slavGdp_2016_corr)
print("slavGdp_2018_corr: ", slavGdp_2018_corr)

slavGdp_2016_corr:  0.02820868460160493
slavGdp_2018_corr:  -0.09580752609885806


### migration and slavery preprocess

In [61]:
migration = pd.read_csv("Datasets/migration.csv")

In [62]:
migration.head()

Unnamed: 0,Country,Year,Migration
0,Afghanistan,1960,2606
1,Afghanistan,1961,6109
2,Afghanistan,1962,7016
3,Afghanistan,1963,6681
4,Afghanistan,1964,7079


In [63]:
migration = migration[migration['Year'].isin([2016, 2018, 2023])]

In [64]:
common_migration_slavery = set(slavery['Country']).intersection(migration['Country'])

In [65]:
diff_set1 = common_lfpr_slavery - common_migration_slavery
diff_set2 = common_migration_slavery - common_lfpr_slavery
print("Countries in set1 but not in set2:", diff_set1)
print("Countries in set2 but not in set1:", diff_set2)

Countries in set1 but not in set2: set()
Countries in set2 but not in set1: {'Kosovo'}


In [66]:
# Filter the DataFrames to keep only rows with common entries
slavery_filtered = slavery[slavery['Country'].isin(common_migration_slavery)]

migration_filtered = migration[migration['Country'].isin(common_migration_slavery)]

In [67]:
# Pivot the DataFrame
migration_pivot = migration_filtered.pivot(index='Country', columns='Year', values='Migration').reset_index()

In [68]:
slavery_reduced = slavery_filtered[["Country", "2016", "2018", "2023"]]

slavery_reduced.columns = ["Country", 2016, 2018, 2023]
migration_pivot.columns = ["Country", 2016, 2018, 2023]

slavery_reduced = slavery_reduced.sort_values(by='Country')
migration_pivot = migration_pivot.sort_values(by='Country')

In [69]:
migration_pivot.describe()

Unnamed: 0,2016,2018,2023
count,139.0,139.0,139.0
mean,-14073.24,-939.1439,-8797.396
std,229314.8,162420.1,187870.6
min,-2290411.0,-1309609.0,-910475.0
25%,-25762.0,-22330.5,-15006.5
50%,-982.0,-2024.0,-4000.0
75%,27768.5,24012.5,3359.0
max,432017.0,494364.0,1784718.0


In [70]:
value_columns = [2016, 2018, 2023]
scaler = StandardScaler()

# Z-score normalization
migration_pivot.loc[:, value_columns] = scaler.fit_transform(migration_pivot[value_columns])
slavery_reduced.loc[:, value_columns] = scaler.fit_transform(slavery_reduced[value_columns])

### Finding correlation between migration and slavery

In [71]:
slavMigration_2016_corr = slavery_reduced[2016].corr(migration_pivot[2016])
slavMigration_2018_corr = slavery_reduced[2018].corr(migration_pivot[2018])
slavMigration_2023_corr = slavery_reduced[2023].corr(migration_pivot[2023])

print("slavMigration_2016_corr: ", slavMigration_2016_corr)
print("slavMigration_2018_corr: ", slavMigration_2018_corr)
print("slavMigration_2023_corr: ", slavMigration_2023_corr)

slavMigration_2016_corr:  -0.1345895732639958
slavMigration_2018_corr:  -0.008146733551690358
slavMigration_2023_corr:  0.0040715348807804945
