# Import Modules
This section imports the required modules and prepares the raw content of the file in the desired
data types for performance.

In [None]:
import re
from zipfile import ZipFile

import pycountry
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from pandas.api.types import CategoricalDtype

# Default Plotly template, used by every figure that does not pass its own `template`
pio.templates.default = 'plotly_dark'


### Read file

In [None]:
# Read the CSV stored inside the zipped CTDC dataset into a DataFrame
zip_path = './data/human-trafficking-victims-dataset-ctdc.zip'
data_file = None
raw_data = None
with ZipFile(zip_path) as zip_file:
    for compressed in zip_file.infolist():
        # Only members ending in 'csv' are parsed; if the archive holds
        # several of them, the last one read is the one kept in raw_data
        if compressed.filename.endswith('csv'):
            with zip_file.open(compressed) as data_file:
                raw_data = pd.read_csv(
                    data_file,
                    sep=';',
                    header=0,
                    low_memory=False,
                    # Skip the first column and keep the following 63
                    usecols=range(1, 64),
                )

# Fail early with a clear message instead of an obscure error in a later cell
if raw_data is None:
    raise FileNotFoundError(f'No CSV file was found inside "{zip_path}"')


### Data type conversion
Conversion of the attributes that were read from the raw data as plain strings into pandas Categorical objects,<br>
which simplifies the operations on those categorical variables.

In [None]:
# Allowed values of each categorical attribute. Declaring them up front lets
# pandas store those columns as categories instead of plain strings.

# Age brackets, listed from the youngest to the oldest
age_range = ['0--8', '9--17', '18--20', '21--23', '24--26', '27--29', '30--38', '39--47', '48+']
# Gender of the individual
gender_values = ['Male', 'Female', 'Transgender/NonConforming']
# Age status (majority) of the individual
age_cate = ['Adult', 'Minor']
# Kind of source that recorded the data
data_source = ['Case Management', 'Hotline']

# pandas dtypes built from the lists above; only the age-based ones are ordered
age_cat_range = CategoricalDtype(age_range, ordered=True)
gender_cat = CategoricalDtype(gender_values, ordered=False)
age_cat = CategoricalDtype(age_cate, ordered=True)
data_cat_source = CategoricalDtype(data_source, ordered=False)


In [None]:
# Columns holding NaN were read as float: cast every column to the nullable
# Int32 dtype. errors='ignore' is passed so that a column which cannot be cast
# is returned as it was.
def _to_nullable_int(series):
    return series.astype('Int32', errors='ignore')


raw_data = raw_data.apply(_to_nullable_int)

# The three majority columns share the same categorical dtype
columns_majority = ['majorityStatus', 'majorityStatusAtExploit', 'majorityEntry']

# Categorical dtype to apply to each nominal column
category_dtypes = {
    'Datasource': data_cat_source,
    'gender': gender_cat,
    'ageBroad': age_cat_range,
    **dict.fromkeys(columns_majority, age_cat),
}
for column_name, category_dtype in category_dtypes.items():
    raw_data[column_name] = raw_data[column_name].astype(category_dtype)


### Miscellaneous

In [None]:
# Columns stored with a pandas categorical dtype
categorical_columns = [
    'ageBroad',
    'Datasource',
    'gender',
    'majorityStatus',
    'majorityStatusAtExploit',
    'majorityEntry',
]

# Columns holding a country
country_columns = ['citizenship', 'CountryOfExploitation']

# Columns whose value is a concatenation of several categories
concatenated_columns = [
    'meansOfControlConcatenated',
    'typeOfExploitConcatenated',
    'typeOfLabourConcatenated',
    'typeOfSexConcatenated',
    'RecruiterRelationship',
]

# Function to convert country to any ISO country format
def country_conversion(country, code='numeric'):
    """Return one attribute of the pycountry entry that matches *country*.

    Args:
        country: Value handed to ``pycountry.countries.lookup``.
        code: Name of the attribute to read from the matched entry
            (the notebook uses ``'numeric'``, ``'alpha_3'`` and ``'name'``).

    Returns:
        The requested attribute of the matched country. When the lookup
        fails, ``-1`` if ``code == 'numeric'`` and ``'Unknown'`` otherwise.

    Raises:
        AttributeError: If the matched country has no attribute named *code*.
    """
    try:
        match = pycountry.countries.lookup(country)
    except LookupError:
        # Missing data ('NaN') or an unrecognized country cannot be converted
        return -1 if code == 'numeric' else 'Unknown'

    try:
        return getattr(match, code)
    except AttributeError as att:
        # getattr raises AttributeError (not ArithmeticError) for a wrong code
        raise AttributeError(
            f'The Country object from "pycountry" does not have Attribute {code}') from att


### Integrity

In [None]:
# Count every combination of values of the three majority columns, NaN included
majority_counts = raw_data[columns_majority].value_counts(dropna=False)
majority_permutation = majority_counts.reset_index()
# Index of records can be complete 6, 5, 2, 8, 3, 15, 11, 16, 9, 17
# Index of incongruent records combination 19, 18, 21
majority_permutation


In [None]:
# Drop the records whose majority columns are incongruent with each other
# (attributed to the anonymization of the data). Total of records to drop: 12
incongruent_queries = [
    # majorityStatus = Adult, majorityStatusAtExploit = Minor, majorityEntry = Adult
    'majorityStatus == @age_cate[0] & majorityStatusAtExploit == @age_cate[1] & majorityEntry == @age_cate[0]',
    # majorityStatus = Minor, majorityStatusAtExploit = NaN, majorityEntry = Adult
    'majorityStatus == @age_cate[1] & majorityStatusAtExploit.isna() & majorityEntry == @age_cate[0]',
    # majorityStatus = NaN, majorityStatusAtExploit = Minor, majorityEntry = Adult
    'majorityStatus.isna() & majorityStatusAtExploit == @age_cate[1] & majorityEntry == @age_cate[0]',
]

# Accumulate the labels matched by every query, then drop them all at once
drop_index = pd.Index([])
for condition in incongruent_queries:
    matched = raw_data.query(condition).index
    drop_index = drop_index.join(matched, how='outer')
raw_data.drop(drop_index, inplace=True)


In [None]:
# Completing missing data due to anonymization
# Total of records to fill 26778
# Each entry is (index of the matching records, value to write, column(s) to fill).
# All indexes are computed before any value is written, so one rule never
# changes the records matched by another.
indexes = [
    # majorityStatus = Adult, majorityStatusAtExploit = Minor, majorityEntry = NaN
    (raw_data.query('majorityStatus == @age_cate[0] & majorityStatusAtExploit == @age_cate[1] & majorityEntry.isna()').index, 'Minor', 'majorityEntry'),
    # majorityStatus = Adult, majorityStatusAtExploit = NaN, majorityEntry = Adult
    (raw_data.query('majorityStatus == @age_cate[0] & majorityStatusAtExploit.isna() & majorityEntry == @age_cate[0]').index, 'Adult', 'majorityStatusAtExploit'),
    # majorityStatus = Minor, majorityStatusAtExploit = Minor, majorityEntry = NaN
    (raw_data.query('majorityStatus == @age_cate[1] & majorityStatusAtExploit == @age_cate[1] & majorityEntry.isna()').index, 'Minor', 'majorityEntry'),
    # majorityStatus = Minor, majorityStatusAtExploit = NaN, majorityEntry = Minor
    (raw_data.query('majorityStatus == @age_cate[1] & majorityStatusAtExploit.isna() & majorityEntry == @age_cate[1]').index, 'Minor', 'majorityStatusAtExploit'),
    # majorityStatus = Minor, majorityStatusAtExploit = NaN, majorityEntry = NaN
    (raw_data.query('majorityStatus == @age_cate[1] & majorityStatusAtExploit.isna() & majorityEntry.isna()').index, 'Minor', ['majorityStatusAtExploit', 'majorityEntry']),

    # majorityStatus = NaN, majorityStatusAtExploit = Adult, majorityEntry = Adult
    (raw_data.query('majorityStatus.isna() & majorityStatusAtExploit == @age_cate[0] & majorityEntry == @age_cate[0]').index, 'Adult', 'majorityStatus'),
    # majorityStatus = NaN, majorityStatusAtExploit = Adult, majorityEntry = NaN
    (raw_data.query('majorityStatus.isna() & majorityStatusAtExploit == @age_cate[0] & majorityEntry.isna()').index, 'Adult', ['majorityStatus', 'majorityEntry']),
    # majorityStatus = NaN, majorityStatusAtExploit = Minor, majorityEntry = Minor
    (raw_data.query('majorityStatus.isna() & majorityStatusAtExploit == @age_cate[1] & majorityEntry == @age_cate[1]').index, 'Minor', 'majorityStatus'),
    # majorityStatus = NaN, majorityStatusAtExploit = Minor, majorityEntry = NaN
    (raw_data.query('majorityStatus.isna() & majorityStatusAtExploit == @age_cate[1] & majorityEntry.isna()').index, 'Minor', ['majorityStatus', 'majorityEntry']),
    # majorityStatus = NaN, majorityStatusAtExploit = NaN, majorityEntry = Adult
    # The two missing columns are filled; majorityEntry already holds 'Adult'
    (raw_data.query('majorityStatus.isna() & majorityStatusAtExploit.isna() & majorityEntry == @age_cate[0]').index, 'Adult', ['majorityStatus', 'majorityStatusAtExploit']),
]

for index, value, columns in indexes:
    raw_data.loc[index, columns] = value


# Exploratory analysis
Statistical exploratory analysis of the raw dataset. The objective of this first step is to get a basic overview of its behavior,<br>
trends, relationships, and missing and duplicated values. In addition, graphical presentations are used for a better<br>
understanding of the attributes of the dataset.

In [None]:
# Preview the first rows of the dataset
raw_data.head()


In [None]:
# Print the summary of the dataset (columns, dtypes and non-null counts)
raw_data.info()


In [None]:
# Size of the dataset and number of columns per data type
unique_types = raw_data.dtypes.value_counts()
print(
    f'The data-set has a total of records and attributes {raw_data.shape}\n',
    'Count total of data types in data-set',
    unique_types.to_string(),
    sep='\n',
)


In [None]:
# Descriptive statistics of the number of missing attributes per record
missing_per_record = raw_data.isna().sum(axis=1)
nan_record = missing_per_record.describe()
print('Missing "NaN" attributes per record', nan_record.to_string(header=True), sep='\n')


In [None]:
# Share of missing values per attribute; object-typed columns are left out
non_object_columns = raw_data.select_dtypes(exclude=['O'])
nan_attribute = non_object_columns.isna().sum()
ratio_nan = nan_attribute / len(raw_data)
print('Ratio of missing values in attributes')
print(ratio_nan.to_string())


**Given the nature of the dataset, the possible duplicated records are due to the anonymization preprocessing performed by the _CTDC_**<br>
and they are not going to be removed from the dataset, because of the information that those records may still provide.

In [None]:
# Number of fully duplicated records and their share of the dataset
duplicates = raw_data.duplicated().sum()
ration_dup = duplicates / len(raw_data)
print(
    f'Total of elements duplicated: {duplicates}',
    f'Ratio of all dataset: {ration_dup:.2%}',
    sep='\n',
)


### Data Types

There are two types of attributes in the dataset, _nominal_ and _numeric_.

The _numeric_ attributes are mostly binary _ordinal_ values used to represent whether the attribute is present in a record; on the other hand,<br> the _nominal_ attributes were converted to the __Categorical__ data type of the _pandas_ package.

In [None]:
# Numeric attributes description: summary statistics of every column that is
# neither categorical nor object-typed
raw_data.describe(exclude=['category', 'O'])


In [None]:
# Nominal attributes: summary of the columns stored with the categorical dtype
raw_data.describe(include=['category'])


### Plots
Graphical description of the attributes and the relations between them. The plots used to describe the data are _bar_, _heatmap_, _box_ and _pie_.<br>
The objective is to observe potential trends, frequencies and relationships between the attributes of the dataset.

In [None]:
# Heat map of the citizenship of the individuals
citizenship_column = country_columns[0]
# Translate every known citizenship to its ISO alpha-3 code
map_raw = raw_data[citizenship_column].dropna().apply(country_conversion, code='alpha_3')
# Records per country. value_counts already returns them in descending order,
# so no extra sort is needed.
# NOTE(review): the plot expects reset_index to keep the original column name
# for the country column - confirm against the installed pandas version.
map_raw = map_raw.value_counts().reset_index(name='records')
fig = px.choropleth(
    map_raw,
    locations=citizenship_column,
    color='records',
    # Show the country name instead of the alpha-3 code in the tooltip
    hover_name=map_raw[citizenship_column].apply(country_conversion, code='name'),
    hover_data={
        citizenship_column: False,
    },
    color_continuous_scale="burgyl",
    projection='natural earth',
    title='Heatmap<br>Citizenship',
    template='plotly_white'
)
fig.update_layout(
    height=600,
    width=1000,)
fig.show()


In [None]:
# Heat map of the country where exploitations occur
exploitation_column = country_columns[1]
# Translate every known country to its ISO alpha-3 code
map_raw = raw_data[exploitation_column].dropna().apply(country_conversion, code='alpha_3')
# Records per country. value_counts already returns them in descending order,
# so no extra sort is needed.
# NOTE(review): the plot expects reset_index to keep the original column name
# for the country column - confirm against the installed pandas version.
map_raw = map_raw.value_counts().reset_index(name='records')

fig = px.choropleth(
    map_raw,
    locations=exploitation_column,
    color='records',
    # Show the country name instead of the alpha-3 code in the tooltip
    hover_name=map_raw[exploitation_column].apply(country_conversion, code='name'),
    hover_data={
        exploitation_column: False,
    },
    color_continuous_scale='burgyl',
    projection='natural earth',
    title='Heatmap<br>Country of explotation',
    template='plotly_white'
)
fig.update_layout(
    height=600,
    width=1000,)
fig.show()


In [None]:
# Gender of the individuals grouped by the registration year.
# The year stays as the index; gender and the record count become columns.
gender_frequency = raw_data[['yearOfRegistration', 'gender']].groupby(
    ['yearOfRegistration', 'gender']).size().reset_index(level=1, name='count')
# Area plot per year, one area per gender
gender_area = px.area(
    gender_frequency,
    x=gender_frequency.index,
    y='count',
    color='gender',
    # The italic tag is now properly closed
    title='Records registered per Year<br>Group by <i>gender</i>',
    hover_name=gender_frequency.index,
    hover_data={
        'gender': False
    },
    labels={'yearOfRegistration': 'Year', 'count': 'Total/Year'},
    category_orders={'gender': ['Transgender/NonConforming', 'Male', 'Female']},)

# Tooltip: the year in bold followed by the number of records
gender_area.update_traces(
    hovertemplate='<b>%{hovertext}</b><br><br>Total records: %{y}')
gender_area.show()


In [None]:
# Count the records per gender, keeping the missing values as their own group
gender_count = raw_data['gender'].value_counts(dropna=False).reset_index(name='count')
# Pie plot
gender_pie = px.pie(
    gender_count,
    names='gender',
    values='count',
    # The italic tag is now properly closed
    title='Gender total frequency<br><i>with missing values</i>',
    width=800,
    hover_name='gender',
    hover_data={'gender': False, 'count': False})

# Slices show percentage and absolute value; the tooltip shows label and percentage
gender_pie.update_traces(
    textinfo='percent+value',
    hoverinfo='label+percent')
gender_pie.show()


In [None]:
# Age broad grouped by the gender of the record; dropna=False keeps the
# records whose gender or age bracket is missing
broad_gender = (
    raw_data[['gender', 'ageBroad']]
    .groupby(['gender', 'ageBroad'], dropna=False)
    .size()
    .reset_index(name='count')
)
# Taking into account the missing values in raw data: as strings they read
# 'nan', which is then replaced by an explicit label
broad_gender[['gender', 'ageBroad']] = broad_gender[['gender', 'ageBroad']].astype('str')
# Assign the result back instead of calling replace(inplace=True) on a column
# selection: that chained in-place form is not guaranteed to update the frame
broad_gender['gender'] = broad_gender['gender'].replace('nan', 'Gender Missing')
broad_gender['ageBroad'] = broad_gender['ageBroad'].replace('nan', 'Missing')

# Bar plot
broad_gender_bar = px.bar(
    broad_gender,
    x='ageBroad',
    y='count',
    color='gender',
    title='Distribution of <b>age broad</b><br>Group by <i>gender</i>',
    text='count',
    text_auto='.3s',
    hover_name='gender'
)
# Tooltip: the gender in bold followed by the number of records
broad_gender_bar.update_traces(
    hovertemplate='<b>%{hovertext}</b><br><br>Total of records: %{y}<extra></extra>'
)
broad_gender_bar.show()


In [None]:
# Frequency of every value in each of the three majority columns, reshaped to
# long format (one row per pair of labels plus its count) for plotting
majority_counts = raw_data[columns_majority].apply(pd.Series.value_counts)
majority = majority_counts.T.unstack().reset_index(name='count')
majority = majority.rename(columns={'level_0': 'majority', 'level_1': 'status'})

# Grouped bar plot of the counts
bar_options = {
    'x': 'majority',
    'y': 'count',
    'color': 'status',
    'barmode': 'group',
    'hover_name': 'status',
    'text': 'count',
    'text_auto': '.2s',
}
majority_bar = px.bar(majority, **bar_options)

# Rotated labels drawn outside the bars; tooltip with the integer count
trace_options = {
    'textangle': -45,
    'textposition': 'outside',
    'cliponaxis': False,
    'hovertemplate': '<b>%{hovertext}</b><br><br>Total of records: %{y:d}<extra></extra>',
}
majority_bar.update_traces(**trace_options)
majority_bar.show()


In [None]:
# Number of ';'-separated sub-categories per record, for every column whose
# name ends in 'Concatenated' or 'Relationship'; missing values count as 0
def _count_subcategories(value):
    return len(value.split(';'))


count_totals = raw_data.filter(regex='(Concatenated|Relationship)$')
# Series.map per column replaces the deprecated DataFrame.applymap and skips
# missing values in the same way through na_action='ignore'
count_totals = (
    count_totals
    .apply(lambda column: column.map(_count_subcategories, na_action='ignore'))
    .fillna(0)
    .astype('int32')
)
# Swap a trailing 'Concatenated' for 'Count', or append 'Count' when absent
count_totals = count_totals.rename(
    columns=lambda x: re.sub(r'(Concatenated)?$', 'Count', x, count=1))


In [None]:
# Box plot of the sub-category counts, with the mean drawn as a line
box_options = {
    'fontsize': 8,
    'grid': False,
    'figsize': (10, 6),
    'showcaps': True,
    'flierprops': {'marker': '.', 'markersize': 3},
    'showmeans': True,
    'meanline': True,
}
axis = count_totals.boxplot(**box_options)
# Lower limit of the y axis set to -1 (presumably to keep boxes at 0 visible)
axis.set_ylim(-1)
title = axis.set_title('Total count of subcategories')


In [None]:
# Kernel density estimate of the sub-category counts, one curve per column
axis = count_totals.plot.kde()


## Preprocessing

### Dimensionality reduction

In [None]:
# Keep the qualitative variables of the dataset together with the variables
# that are used for indexing
dataset_columns = [
    'yearOfRegistration',
    'Datasource',
    'gender',
    'ageBroad',
    'majorityStatus',
    'majorityStatusAtExploit',
    'majorityEntry',
    'citizenship',
    'meansOfControlConcatenated',
    'typeOfExploitConcatenated',
    'typeOfLabourConcatenated',
    'typeOfSexConcatenated',
    'isAbduction',
    'RecruiterRelationship',
    'CountryOfExploitation',
    'recruiterRelationIntimatePartner',
    'recruiterRelationFriend',
    'recruiterRelationFamily',
    'recruiterRelationOther',
    'recruiterRelationUnknown',
]

# New dataset restricted to the selected columns
new_data_set = raw_data.loc[:, dataset_columns]


In [None]:
# Concatenated columns inspected for missing data
columns_to_check = [
    'meansOfControlConcatenated',
    'typeOfExploitConcatenated',
    'typeOfLabourConcatenated',
    'typeOfSexConcatenated',
]

# A value of 0 in these columns is treated as missing
new_data_set[columns_to_check] = new_data_set[columns_to_check].replace(0, np.nan)
# Keep only the records where all of the columns above are missing
all_missing = new_data_set[columns_to_check].isna().all(axis=1)
filtered_data = new_data_set[all_missing]


In [None]:
# Percentage of records of new_data_set that are in filtered_data
filtered_data.shape[0] / new_data_set.shape[0] * 100


In [None]:
# Number of records to remove: 90% of the records in filtered_data
num_records_to_delete = int(len(filtered_data) * 0.9)

# Randomly pick the records to remove; the fixed seed keeps the pick reproducible
filtered_data = filtered_data.sample(n=num_records_to_delete, random_state=42)

# Remove the picked records from new_data_set. The explicit copy makes the
# result an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
new_data_set = new_data_set[~new_data_set.index.isin(filtered_data.index)].copy()

# Impute the remaining missing values with the mode (most common value)
imputer = SimpleImputer(strategy='most_frequent')

# Apply the imputation only to the columns of interest
new_data_set[columns_to_check] = imputer.fit_transform(new_data_set[columns_to_check])

# Check the result: missing values left per column
print(new_data_set.isnull().sum())

# Visual check of the imputation on a random sample of 10 records
sample_data = new_data_set.sample(10)
print(sample_data[columns_to_check])


In [None]:
# Fields that need one-hot encoding
fields_to_encode = [
    'meansOfControlConcatenated',
    'typeOfExploitConcatenated',
    'typeOfLabourConcatenated',
    'typeOfSexConcatenated',
]

# One indicator frame per field: the value is split on ';' and every resulting
# column is prefixed with the field name so that the column names stay unique
encoded_frames = [
    new_data_set[field].str.get_dummies(sep=';').add_prefix(f"{field}_")
    for field in fields_to_encode
]

# Append the indicator columns to the dataset, in the order of the fields
new_data_set = pd.concat([new_data_set, *encoded_frames], axis=1)

# Remove the original columns, which are now one-hot encoded
new_data_set = new_data_set.drop(fields_to_encode, axis=1)


In [None]:
# Number of records and columns of the dataset after the one-hot encoding
new_data_set.shape
