In [3]:
import pandas as pd
import os

### Slavery data PreProcessing

In [4]:
slavery_2016 = pd.read_csv("../datasets/slavery_2016.csv")[["Country", "POPULATION", "ESTIMATED NUMBER IN MODERN SLAVERY"]]
slavery_2018 = pd.read_csv("../datasets/slavery_2018.csv")[["Country ", "Est. prevalence of population in modern slavery (victims per 1,000 population)"]]
slavery_2023 = pd.read_csv("../datasets/slavery_2023.csv")[["Country", "Estimated prevalence of modern slavery per 1,000 population"]]

In [5]:
slavery_2016["Estimated prevalence of modern slavery per 1,000 population"] = (slavery_2016["ESTIMATED NUMBER IN MODERN SLAVERY"]/slavery_2016["POPULATION"])*1000
slavery_2016 = slavery_2016[["Country", "Estimated prevalence of modern slavery per 1,000 population"]]

In [6]:
slavery_2016.columns = ["Country", "2016"]
slavery_2018.columns = ["Country", "2018"]
slavery_2023.columns = ["Country", "2023"]

In [7]:
# Find common entries in the common column  
# NOTE: I HAVE GIVEN MORE IMPORTANCE TO COUNTRIES IN THE SLAVERY DATASET. (Since slavery data is the main comparing data here)
common_entries = set(slavery_2016['Country']).intersection(slavery_2018['Country']).intersection(slavery_2023['Country'])

In [8]:
# Filter the DataFrames to keep only rows with common entries
slavery_2016_filtered = slavery_2016[slavery_2016['Country'].isin(common_entries)]
slavery_2018_filtered = slavery_2018[slavery_2018['Country'].isin(common_entries)]
slavery_2023_filtered = slavery_2023[slavery_2023['Country'].isin(common_entries)]

In [9]:
# Merge the filtered DataFrames on the common column
temp1 = pd.merge(slavery_2016_filtered, slavery_2018_filtered, on='Country', how='inner')
slavery = pd.merge(temp1, slavery_2023_filtered, on='Country', how='inner')

In [10]:
slavery.head()

Unnamed: 0,Country,2016,2018,2023
0,Afghanistan,11.30138,22.2,13.0
1,Albania,2.959394,6.9,11.8
2,Algeria,6.259611,2.7,1.9
3,Angola,6.382384,7.2,4.1
4,Armenia,4.671968,5.3,8.9


In [11]:
slavery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  145 non-null    object 
 1   2016     145 non-null    float64
 2   2018     145 non-null    float64
 3   2023     139 non-null    float64
dtypes: float64(3), object(1)
memory usage: 4.7+ KB


In [12]:
slavery[slavery.isnull().any(axis=1)]

Unnamed: 0,Country,2016,2018,2023
20,Cape Verde,4.606526,4.1,
44,Iceland,1.208459,2.1,
64,Luxembourg,0.177936,1.5,
72,Montenegro,4.019293,5.9,
122,Barbados,2.112676,2.7,
142,Suriname,4.604052,2.3,


### Other parameters data preprocessing

In [13]:
corruption = pd.read_csv("../Datasets/corruption.csv")
democracy = pd.read_csv("../Datasets/democracy.csv")
gdppercapita = pd.read_csv("../Datasets/gdppercapita.csv")
lfpr = pd.read_csv("../Datasets/lfpr.csv")
migration = pd.read_csv("../Datasets/migration.csv")

In [14]:
lfpr = lfpr[lfpr['Type_LFPR'].isin(['Total'])]  # removing male, female entries
lfpr = lfpr[["Country", "Year", "LFPR"]]

In [15]:
lfpr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6079 entries, 2 to 18172
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  6079 non-null   object 
 1   Year     6079 non-null   int64  
 2   LFPR     6079 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 190.0+ KB


In [16]:
years = [2016, 2018, 2023]

In [17]:
corruption = corruption[corruption['Year'].isin(years)]
democracy = democracy[democracy['Year'].isin(years)]
gdppercapita = gdppercapita[gdppercapita['Year'].isin(years)]
lfpr = lfpr[lfpr['Year'].isin(years)]
migration = migration[migration['Year'].isin(years)]

In [18]:
# Find common entries in the common column
# common_entries = set(corruption['Country']).intersection(democracy['Country']).intersection(gdppercapita['Country']).intersection(lfpr['Country']).intersection(migration['Country'])

In [19]:
# Filter the DataFrames to keep only rows with common countries
corruption_filtered = corruption[corruption['Country'].isin(common_entries)]
democracy_filtered = democracy[democracy['Country'].isin(common_entries)]
gdppercapita_filtered = gdppercapita[gdppercapita['Country'].isin(common_entries)]
lfpr_filtered = lfpr[lfpr['Country'].isin(common_entries)]
migration_filtered = migration[migration['Country'].isin(common_entries)]

In [20]:
# Create a complete DataFrame with all combinations of Country and Year
all_combinations = pd.DataFrame([(country, year) for country in common_entries for year in years], columns=['Country', 'Year'])

In [21]:
all_combinations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  435 non-null    object
 1   Year     435 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.9+ KB


In [22]:
# Merge each DataFrame with the complete combinations to fill missing rows with NaN
corruption_complete = pd.merge(all_combinations, corruption_filtered, on=['Country', 'Year'], how='left')
democracy_complete = pd.merge(all_combinations, democracy_filtered, on=['Country', 'Year'], how='left')
gdppercapita_complete = pd.merge(all_combinations, gdppercapita_filtered, on=['Country', 'Year'], how='left')
lfpr_complete = pd.merge(all_combinations, lfpr_filtered, on=['Country', 'Year'], how='left')
migration_complete = pd.merge(all_combinations, migration_filtered, on=['Country', 'Year'], how='left')

In [23]:
# Merge the filtered DataFrames on the common column
temp2 = pd.merge(corruption_complete, democracy_complete, on=['Country', 'Year'], how='inner')
temp3 = pd.merge(temp2, gdppercapita_complete, on=['Country', 'Year'], how='inner')
temp4 = pd.merge(temp3, lfpr_complete, on=['Country', 'Year'], how='inner')
comparing_data = pd.merge(temp4, migration_complete, on=['Country', 'Year'], how='inner')

In [24]:
comparing_data.head()

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
0,Pakistan,2016,32.0,4.33,4746.718,53.389,-2290411.0
1,Pakistan,2018,33.0,4.17,5113.434,52.689,-1309609.0
2,Pakistan,2023,29.0,3.25,,,-165988.0
3,Haiti,2016,20.0,4.02,3165.2957,67.149,-33902.0
4,Haiti,2018,20.0,4.91,3209.4297,67.254,-35007.0


In [25]:
comparing_data.head()

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
0,Pakistan,2016,32.0,4.33,4746.718,53.389,-2290411.0
1,Pakistan,2018,33.0,4.17,5113.434,52.689,-1309609.0
2,Pakistan,2023,29.0,3.25,,,-165988.0
3,Haiti,2016,20.0,4.02,3165.2957,67.149,-33902.0
4,Haiti,2018,20.0,4.91,3209.4297,67.254,-35007.0


In [26]:
comparing_data[comparing_data.isnull().any(axis=1)]

Unnamed: 0,Country,Year,Corruption,Democracy score,GDP per capita,LFPR,Migration
2,Pakistan,2023,29.0,3.25,,,-165988.0
5,Haiti,2023,17.0,2.81,,,-31811.0
6,Egypt,2016,34.0,3.31,10765.829,,
7,Egypt,2018,35.0,3.36,11363.318,,
8,Egypt,2023,35.0,2.93,,,
...,...,...,...,...,...,...,...
422,Cuba,2023,42.0,2.65,,,-6000.0
425,Malawi,2023,34.0,5.85,,,-6000.0
428,Australia,2023,75.0,8.66,,,139991.0
431,Netherlands,2023,79.0,9.00,,,29998.0


### lfpr and slavery preprocess

In [27]:
lfpr = pd.read_csv("../Datasets/lfpr.csv")

In [28]:
lfpr = lfpr[lfpr['Type_LFPR'].isin(['Total'])]  # removing male, female entries
lfpr = lfpr[["Country", "Year", "LFPR"]]

In [29]:
lfpr = lfpr[lfpr['Year'].isin([2016, 2018])]

In [30]:
common_lfpr_slavery = set(slavery['Country']).intersection(lfpr['Country'])

In [31]:
# Filter the DataFrames to keep only rows with common entries
slavery_filtered = slavery[slavery['Country'].isin(common_lfpr_slavery)]
lfpr_filtered = lfpr[lfpr['Country'].isin(common_lfpr_slavery)]

In [32]:
# Pivot the DataFrame
lfpr_pivot = lfpr_filtered.pivot(index='Country', columns='Year', values='LFPR').reset_index()

# Rename the columns for clarity
lfpr_pivot.columns.name = None  # Remove the columns' name
lfpr_pivot.columns = ['Country', 2016, 2018]

In [33]:
slavery_reduced = slavery_filtered[["Country", "2016", "2018"]]
slavery_reduced.columns = ["Country", 2016, 2018]

In [34]:
slavery_reduced = slavery_reduced.sort_values(by='Country')
lfpr_pivot = lfpr_pivot.sort_values(by='Country')

In [35]:
slavery_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 138 entries, 0 to 120
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 4.3+ KB


In [36]:
lfpr_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  138 non-null    object 
 1   2016     138 non-null    float64
 2   2018     138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.4+ KB


In [37]:
# Standardize Data
from sklearn.preprocessing import StandardScaler

In [38]:
value_columns = [2016, 2018]
scaler = StandardScaler()

# Z-score normalization
lfpr_pivot.loc[:, value_columns] = scaler.fit_transform(lfpr_pivot[value_columns])
slavery_reduced.loc[:, value_columns] = scaler.fit_transform(slavery_reduced[value_columns])

In [39]:
slavery_reduced.head()

Unnamed: 0,Country,2016,2018
0,Afghanistan,1.425598,1.78606
1,Albania,-0.479608,0.101637
2,Algeria,0.274121,-0.360754
3,Angola,0.302161,0.134665
121,Argentina,-0.232309,-0.514885


In [40]:
lfpr_pivot.head()

Unnamed: 0,Country,2016,2018
0,Afghanistan,-1.692985,-1.829335
1,Albania,0.005041,0.172467
2,Algeria,-2.010203,-1.972802
3,Angola,0.899857,0.811514
4,Argentina,-0.021815,0.024332


### Finding correlation between lfpr and slavery

In [41]:
slavLfpr_2016_corr = slavery_reduced[2016].corr(lfpr_pivot[2016])

In [42]:
slavLfpr_2016_corr

np.float64(0.01981696460095905)

In [43]:
slavLfpr_2018_corr = slavery_reduced[2018].corr(lfpr_pivot[2018])

In [44]:
slavLfpr_2018_corr

np.float64(0.037305512020338306)

Regression model

In [46]:
print(comparing_data.columns)

Index(['Country', 'Year', 'Corruption', 'Democracy score', 'GDP per capita',
       'LFPR', 'Migration'],
      dtype='object')


In [48]:
# Define the independent variables (economic indicators)
independent_vars = ['Corruption', 'Democracy score', 'GDP per capita', 'LFPR', 'Migration']  # Update with actual column names from comparing_data

# Dependent variable
target_vars = ['slavery_filtered']  # Prevalence of Modern Slavery for different years

# Filter data for those years and drop NaNs
comparing_data_filtered = comparing_data.dropna(subset=independent_vars + target_vars)

# Fit the regression models for all three years
for year in target_vars:
    # Create a formula string for the regression model for the given year
    formula = f"{year} ~ " + " + ".join(independent_vars)
    
    # Fit the regression model for the given year
    model = smf.ols(formula=formula, data=comparing_data_filtered).fit()
    
    # Print the summary of the regression model for the given year
    print(f"Regression Results for {year}:")
    print(model.summary())
    print("\n" + "-"*80 + "\n")

KeyError: ['slavery_filtered']

In [None]:
# Regression model 2: LFPR and Modern Slavery with Control Variables
X2 = all_data[['LFPR'] + value_columns[1:]]
y2 = all_data['prevalence_per_1000']
X2 = sm.add_constant(X2)
model2 = sm.OLS(y2, X2).fit()
print("LFPR and Modern Slavery with Control Variables:")
print(model2.summary())
