# Import Modules
This section imports the required modules and prepare the raw content of the file in the desired
data type for performance

In [None]:
import numpy as np
import pandas as pd
from zipfile import ZipFile
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt

### Read file

In [None]:
# Read csv file ziped
zip_path = './data/human-trafficking-victims-dataset-ctdc.zip'
data_file = None
raw_data = None
with ZipFile(zip_path) as zip_file:
    for commpresed in zip_file.filelist:
        if commpresed.filename.endswith('csv'):
            with zip_file.open(commpresed) as data_file:
                raw_data = pd.read_csv(
                    data_file,
                    sep=';',
                    header=0,
                    low_memory=False,
                    usecols=range(1,64),)

### Data type convertion

In [None]:
# Setting values of categorical attribute
# for performnce at processing time
age_range = [
    '0--8',
    '9--17',
    '18--20',
    '21--23',
    '24--26',
    '27--29',
    '30--38',
    '39--47',
    '48+'
]
gender_values = [
    'Male',
    'Female',
    'Transgender/NonConforming',
]
age_cate = [
    'Adult',
    'Minor',
]
data_source = [
    'Case Management',
    'Hotline'
]
# Categorical pandas objects
age_cat_range = CategoricalDtype(categories=age_range, ordered=True)
gender_cat = CategoricalDtype(categories=gender_values, ordered=False)
age_cat = CategoricalDtype(categories=age_cate, ordered=True)
data_cat_source = CategoricalDtype(categories=data_source, ordered=False)

In [None]:
# Convert all values read as float due to NaN
raw_data = raw_data.apply(lambda serie: serie.astype('Int32', errors='ignore'))
# Converto to categorical types
raw_data['Datasource'] = raw_data['Datasource'].astype(data_cat_source)
raw_data['gender'] = raw_data['gender'].astype(gender_cat)
raw_data['ageBroad'] = raw_data['ageBroad'].astype(age_cat_range)

columns_age_cat = ['majorityStatus', 'majorityStatusAtExploit', 'majorityEntry']
raw_data[columns_age_cat] = raw_data[columns_age_cat].astype(age_cat)

# Descriptive analysis



### Data presentation

In [None]:
raw_data.info()

In [None]:
print(f'The data-set has a total of records and atributes {raw_data.shape}\n')
print('Count total of data types in data-set')
unique_types = raw_data.dtypes.value_counts()
print(unique_types.to_string())

In [None]:
# NaN values per record in data-set
nan_record = raw_data.isna().sum(axis=1).describe()
print('Misising "NaN" attributes per record')
print(nan_record.to_string(header=True))

In [None]:
# NaN values per attribute in data-set
nan_attribute = raw_data.isna().sum()
ratio_nan = nan_attribute / raw_data.shape[0]
print('Ratio of missing values in attributes', ratio_nan.to_string(), sep='\n')

In [None]:
duplicates = raw_data.duplicated().sum()
ration_dup = duplicates / raw_data.shape[0]
print(f'Total of elements duplicateds: {duplicates}\nRatio of all data-set: {ration_dup:.2%}')

**Si bien los datos son duplicados, hay que recordar que son datos anonimizados, cada uno es una lectura de una victima de trata de humanos**

### Descriptive stats data type

There are two type of attributes in the data set, _nominal_ and _numeric_.
The _numeric_ attributes are mostly _ordinal_ binary used to indicat if the attribute is present in a record, by the other side, _nominal_ attributes where converted to a __Categorical__ data type of the package of _pandas_

In [None]:
# Numeric attributes description
raw_data.describe(exclude=['category', 'O'])

In [None]:
# Nominal attributes
raw_data.describe(include=['category'])

In [None]:
raw_data.median(numeric_only=True)

In [None]:
raw_data.columns

In [None]:
ax = raw_data['ageBroad'].value_counts().plot.bar()

for i, bar in enumerate(ax.patches):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 300,
            f"{raw_data['ageBroad'].value_counts()[i]}:00",
            horizontalalignment= 'center', verticalalignment='bottom',
            fontsize=10, rotation=90, color='black')

ax.set_xlabel("Edades")
ax.set_ylabel("Frecuencia")
ax.margins(x=0, y=0.3)

plt.show()

In [70]:
raw_data.columns

Index(['yearOfRegistration', 'Datasource', 'gender', 'ageBroad',
       'majorityStatus', 'majorityStatusAtExploit', 'majorityEntry',
       'citizenship', 'meansOfControlDebtBondage',
       'meansOfControlTakesEarnings', 'meansOfControlRestrictsFinancialAccess',
       'meansOfControlThreats', 'meansOfControlPsychologicalAbuse',
       'meansOfControlPhysicalAbuse', 'meansOfControlSexualAbuse',
       'meansOfControlFalsePromises', 'meansOfControlPsychoactiveSubstances',
       'meansOfControlRestrictsMovement', 'meansOfControlRestrictsMedicalCare',
       'meansOfControlExcessiveWorkingHours', 'meansOfControlUsesChildren',
       'meansOfControlThreatOfLawEnforcement',
       'meansOfControlWithholdsNecessities',
       'meansOfControlWithholdsDocuments', 'meansOfControlOther',
       'meansOfControlNotSpecified', 'meansOfControlConcatenated',
       'isForcedLabour', 'isSexualExploit', 'isOtherExploit', 'isSexAndLabour',
       'isForcedMarriage', 'isForcedMilitary', 'isOrganRemova

In [81]:
raw_data_woCat = raw_data.drop(axis=1,columns=['meansOfControlConcatenated',
                                               'typeOfExploitConcatenated',
                                               'typeOfLabourConcatenated',
                                               'typeOfSexConcatenated',
                                               'RecruiterRelationship'])

categorical_columns = ['ageBroad','Datasource','gender','majorityStatus','majorityStatusAtExploit','majorityEntry','citizenship',
                       'CountryOfExploitation']
data_encode = pd.get_dummies(raw_data_woCat, columns = categorical_columns)

correlation_matrix = data_encode.corr()

print(correlation_matrix)

                                        yearOfRegistration  \
yearOfRegistration                                1.000000   
meansOfControlDebtBondage                        -0.024488   
meansOfControlTakesEarnings                      -0.023563   
meansOfControlRestrictsFinancialAccess            0.182551   
meansOfControlThreats                             0.037558   
...                                                    ...   
CountryOfExploitation_US                          0.519121   
CountryOfExploitation_UZ                         -0.065614   
CountryOfExploitation_VN                          0.003556   
CountryOfExploitation_VU                          0.022378   
CountryOfExploitation_ZA                         -0.010293   

                                        meansOfControlDebtBondage  \
yearOfRegistration                                      -0.024488   
meansOfControlDebtBondage                                1.000000   
meansOfControlTakesEarnings                     

In [None]:
raw_data_woCat['ageBroadID'] = raw_data_woCat['ageBroad'].cat.codes

In [117]:

corr_matrix = raw_data_woCat[['meansOfControlDebtBondage',
                            'meansOfControlTakesEarnings', 'meansOfControlRestrictsFinancialAccess',
                            'meansOfControlThreats', 'meansOfControlPsychologicalAbuse']]\
                            .corrwith(raw_data_woCat['ageBroadID'].astype('Int32'))

AttributeError: 'float' object has no attribute 'shape'

In [116]:
raw_data_woCat['ageBroadID'].astype('Int32').info()

<class 'pandas.core.series.Series'>
RangeIndex: 97750 entries, 0 to 97749
Series name: ageBroadID
Non-Null Count  Dtype
--------------  -----
97750 non-null  Int32
dtypes: Int32(1)
memory usage: 477.4 KB


In [110]:
raw_data_woCat[['meansOfControlDebtBondage','meansOfControlTakesEarnings', 'meansOfControlRestrictsFinancialAccess',
                            'meansOfControlThreats', 'meansOfControlPsychologicalAbuse']].dtypes
                            

meansOfControlDebtBondage                 Int32
meansOfControlTakesEarnings               Int32
meansOfControlRestrictsFinancialAccess    Int32
meansOfControlThreats                     Int32
meansOfControlPsychologicalAbuse          Int32
dtype: object

In [118]:
df1 = raw_data_woCat[['meansOfControlDebtBondage','meansOfControlTakesEarnings', 'meansOfControlRestrictsFinancialAccess',
                            'meansOfControlThreats', 'meansOfControlPsychologicalAbuse']]
df2 = raw_data_woCat['ageBroadID']
df1.corrwith(df2)

AttributeError: 'float' object has no attribute 'shape'

## PENDIENTE 
- CODIGOS PARA CATEGORICAS
- CODIGOS ISO PARA PAISES