# Import Modules
This section imports the required modules and prepare the raw content of the file in the desired
data type for performance

In [1]:
import numpy as np
import pandas as pd
from zipfile import ZipFile
from pandas.api.types import CategoricalDtype

### Read file

In [2]:
# Read csv file ziped
zip_path = './data/human-trafficking-victims-dataset-ctdc.zip'
data_file = None
raw_data = None
with ZipFile(zip_path) as zip_file:
    for commpresed in zip_file.filelist:
        if commpresed.filename.endswith('csv'):
            with zip_file.open(commpresed) as data_file:
                raw_data = pd.read_csv(
                    data_file,
                    sep=';',
                    header=0,
                    low_memory=False,
                    usecols=range(1,64),)

### Data type convertion

In [3]:
# Setting values of categorical attribute
# for performnce at processing time
age_range = [
    '0--8',
    '9--17',
    '18--20',
    '21--23',
    '24--26',
    '27--29',
    '30--38',
    '39--47',
    '48+'
]
gender_values = [
    'Male',
    'Female',
    'Transgender/NonConforming',
]
age_cate = [
    'Adult',
    'Minor',
]
data_source = [
    'Case management',
    'Hotline'
]
# Categorical pandas objects
age_cat_range = CategoricalDtype(categories=age_range, ordered=True)
gender_cat = CategoricalDtype(categories=gender_values, ordered=False)
age_cat = CategoricalDtype(categories=age_cate, ordered=True)
data_cat_source = CategoricalDtype(categories=data_source, ordered=False)

In [4]:
# Convert all values read as float due to NaN
raw_data = raw_data.apply(lambda serie: serie.astype('Int32', errors='ignore'))
# Converto to categorical types
raw_data['Datasource'] = raw_data['Datasource'].astype(data_cat_source)
raw_data['gender'] = raw_data['gender'].astype(gender_cat)
raw_data['ageBroad'] = raw_data['ageBroad'].astype(age_cat_range)

columns_age_cat = ['majorityStatus', 'majorityStatusAtExploit', 'majorityEntry']
raw_data[columns_age_cat] = raw_data[columns_age_cat].astype(age_cat)

# Descriptive analysis



### Data presentation

In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97750 entries, 0 to 97749
Data columns (total 63 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   yearOfRegistration                      95739 non-null  Int32   
 1   Datasource                              52607 non-null  category
 2   gender                                  89612 non-null  category
 3   ageBroad                                50967 non-null  category
 4   majorityStatus                          71246 non-null  category
 5   majorityStatusAtExploit                 21286 non-null  category
 6   majorityEntry                           7252 non-null   category
 7   citizenship                             53290 non-null  object  
 8   meansOfControlDebtBondage               8235 non-null   Int32   
 9   meansOfControlTakesEarnings             11438 non-null  Int32   
 10  meansOfControlRestrictsFinancialAccess  4331 n

In [6]:
print(f'The data-set has a total of records and atributes {raw_data.shape}\n')
print('Count total of data types in data-set')
unique_types = raw_data.dtypes.value_counts()
print(unique_types.to_string())

The data-set has a total of records and atributes (97750, 63)

Count total of data types in data-set
Int32       50
object       7
category     3
category     1
category     1
category     1


In [9]:
# NaN values per record in data-set
nan_record = raw_data.isna().sum(axis=1).describe()
print('Misising "NaN" attributes per record')
print(nan_record.to_string(header=True))

Misising "NaN" attributes per record
count    97750.000000
mean        36.462967
std         12.095918
min          2.000000
25%         28.000000
50%         35.000000
75%         51.000000
max         57.000000


In [28]:
# NaN values per attribute in data-set
nan_attribute = raw_data.isna().sum()
ratio_nan = nan_attribute / raw_data.shape[0]
print('Ratio of missing values in attributes', ratio_nan.to_string(), sep='\n')

Ratio of missing values in attributes
yearOfRegistration                        0.020573
Datasource                                0.461821
gender                                    0.083253
ageBroad                                  0.478598
majorityStatus                            0.271141
majorityStatusAtExploit                   0.782240
majorityEntry                             0.925811
citizenship                               0.454834
meansOfControlDebtBondage                 0.915754
meansOfControlTakesEarnings               0.882987
meansOfControlRestrictsFinancialAccess    0.955693
meansOfControlThreats                     0.841156
meansOfControlPsychologicalAbuse          0.849248
meansOfControlPhysicalAbuse               0.877923
meansOfControlSexualAbuse                 0.907795
meansOfControlFalsePromises               0.916890
meansOfControlPsychoactiveSubstances      0.887754
meansOfControlRestrictsMovement           0.855621
meansOfControlRestrictsMedicalCare        0.

In [26]:
duplicates = raw_data.duplicated().sum()
ration_dup = duplicates / raw_data.shape[0]
print(f'Total of elements duplicateds: {duplicates}\nRatio of all data-set: {ration_dup:.2%}')

Total of elements duplicateds: 77084
Ratio of all data-set: 78.86%


### Descriptive stats data type

There are two type of attributes in the data set, _nominal_ and _numeric_.
The _numeric_ attributes are mostly _ordinal_ binary used to indicat if the attribute is present in a record, by the other side, _nominal_ attributes where converted to a __Categorical__ data type of the package of _pandas_

In [11]:
# Numeric attributes description
raw_data.describe(exclude=['category', 'O'])

Unnamed: 0,yearOfRegistration,meansOfControlDebtBondage,meansOfControlTakesEarnings,meansOfControlRestrictsFinancialAccess,meansOfControlThreats,meansOfControlPsychologicalAbuse,meansOfControlPhysicalAbuse,meansOfControlSexualAbuse,meansOfControlFalsePromises,meansOfControlPsychoactiveSubstances,...,typeOfSexProstitution,typeOfSexPornography,typeOfSexRemoteInteractiveServices,typeOfSexPrivateSexualServices,isAbduction,recruiterRelationIntimatePartner,recruiterRelationFriend,recruiterRelationFamily,recruiterRelationOther,recruiterRelationUnknown
count,95739.0,8235.0,11438.0,4331.0,15527.0,14736.0,11933.0,9013.0,8124.0,10972.0,...,39662.0,30732.0,30732.0,29167.0,30422.0,57963.0,57963.0,57963.0,57963.0,97514.0
mean,2015.961186,0.493625,0.773999,0.133456,0.815161,0.835573,0.71097,0.463553,0.76034,0.553409,...,0.335006,0.060718,0.00384,0.016697,0.011965,0.062178,0.054069,0.05714,0.173663,0.797732
std,3.776725,0.49999,0.418258,0.340107,0.38818,0.370675,0.453331,0.498697,0.426902,0.497162,...,0.471998,0.238817,0.061847,0.128136,0.10873,0.24148,0.226156,0.232112,0.378822,0.401693
min,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2015.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2017.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2018.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Nominal attributes
raw_data.describe(include=['category'])

Unnamed: 0,Datasource,gender,ageBroad,majorityStatus,majorityStatusAtExploit,majorityEntry
count,52607,89612,50967,71246,21286,7252
unique,1,3,9,2,2,2
top,Hotline,Female,9--17,Adult,Minor,Adult
freq,52607,68083,12507,51596,14253,5820


In [32]:
raw_data.median(numeric_only=True)

yearOfRegistration                        2017.0
meansOfControlDebtBondage                    0.0
meansOfControlTakesEarnings                  1.0
meansOfControlRestrictsFinancialAccess       0.0
meansOfControlThreats                        1.0
meansOfControlPsychologicalAbuse             1.0
meansOfControlPhysicalAbuse                  1.0
meansOfControlSexualAbuse                    0.0
meansOfControlFalsePromises                  1.0
meansOfControlPsychoactiveSubstances         1.0
meansOfControlRestrictsMovement              1.0
meansOfControlRestrictsMedicalCare           0.0
meansOfControlExcessiveWorkingHours          1.0
meansOfControlUsesChildren                   0.0
meansOfControlThreatOfLawEnforcement         1.0
meansOfControlWithholdsNecessities           1.0
meansOfControlWithholdsDocuments             1.0
meansOfControlOther                          1.0
meansOfControlNotSpecified                   1.0
isForcedLabour                               0.0
isSexualExploit     