## Puertos de USA

In [1]:
import pandas as pd
import numpy as np

In [2]:
# CBP nationwide drug-seizure exports to load; each file name carries the
# fiscal-year window it covers.
csv_files = [
    '../sources/CBP_drug_seizures/nationwide-drugs-{}.csv'.format(fiscal_window)
    for fiscal_window in ('fy19-fy22', 'fy20-fy23', 'fy21-fy24', 'fy22-fy25-dec')
]

# Module-level accumulator that later cells append loaded seizure dataframes to.
drug_seizures_dataframes = list()


### Puerto de Los Angeles y Long Beach
Alternativas: Seattle/Tacoma, Oakland/San Francisco, Savannah, Miami/Fort Lauderdale, Houston/Galveston, NY/NJ

Viendo un poco los puertos con más tráfico de barcos (y de contenedores), tiene sentido trabajar con los puertos mencionados


Importar datos de Long Beach:

In [None]:
# Load the Port of Long Beach TEU archive: the first sheet row is the header and
# only the first 76 data rows are read.
teu_data_long_beach = pd.read_excel(
    '../sources/Stats - TEU Archive Since 1995 - NEW.xls',
    header=0,
    nrows=76,
)

# Re-express 'Date' (parsed as abbreviated month + year, e.g. 'Jan 2024') as 'YYYY-MM' text.
long_beach_dates = pd.to_datetime(teu_data_long_beach['Date'], format='%b %Y')
teu_data_long_beach['Date'] = long_beach_dates.dt.strftime('%Y-%m')

# Discard the row labelled 1 and renumber the remaining rows from 0.
teu_data_long_beach = teu_data_long_beach.drop(index=1).reset_index(drop=True)

Importar datos del Puerto de Los Ángeles:

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Column names applied to every scraped Port of Los Angeles table, in page order.
headers = [
    'Date',
    'Loaded Imports',
    'Empty Imports',
    'Total Imports',
    'Loaded Exports',
    'Empty Exports',
    'Total Exports',
    'Total TEUs',
    'Prior Year Change',
]

In [None]:
# Port of Los Angeles historical TEU pages, one per calendar year (2024 back to 2018).
urls = [
    'https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2024',
    'https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2023',
    'https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2022',
    'https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2021',
    'https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2020',
    'https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2019',
    'https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2018'
]

# One dataframe per scraped page.
dataframes = []

for url in urls:
    # Fail fast on network trouble: without a timeout a stalled connection hangs the
    # notebook, and without raise_for_status() an HTTP error page would be parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # The statistics table lives inside <div class="table-container">.
    table_container = soup.find('div', class_='table-container')
    if table_container is None:
        raise ValueError(f"No 'table-container' div found at {url}")

    rows = table_container.find_all('tr')

    # The first <tr> is skipped; column names come from the predefined `headers`
    # list rather than from the page.
    data = []
    for row in rows[1:]:
        cols = row.find_all('td')
        data.append([col.text.strip() for col in cols])

    df = pd.DataFrame(data, columns=headers)
    df.drop(columns=['Prior Year Change'], inplace=True)
    # Tag every row label with the text after the last '-' in the URL (the year),
    # giving e.g. 'January_2024'.
    df['Date'] = df['Date'] + '_' + url.split('-')[-1]

    dataframes.append(df)

# Stack all yearly tables into a single dataframe.
teu_data_los_angeles = pd.concat(dataframes, ignore_index=True)

In [None]:
# Remove the yearly summary rows ('Total Calendar Year' / 'Total Fiscal Year').
is_summary_row = teu_data_los_angeles['Date'].str.contains('Total Calendar Year|Total Fiscal Year')

# After renumbering, every 13th row (12, 25, ...) is dropped as well.
# NOTE(review): presumably one leftover non-month row per yearly page — confirm against the scraped tables.
indices_to_drop = [12, 25, 38, 51, 64, 77, 90]
teu_data_los_angeles = (
    teu_data_los_angeles.loc[~is_summary_row]
    .reset_index(drop=True)
    .drop(index=indices_to_drop)
    .reset_index(drop=True)
)

# Labels look like '<full month name>_<year>'; store them as 'YYYY-MM' text.
los_angeles_dates = pd.to_datetime(teu_data_los_angeles['Date'], format='%B_%Y')
teu_data_los_angeles['Date'] = los_angeles_dates.dt.strftime('%Y-%m')

In [None]:
# Scraped TEU columns arrive as text and must become floats.
columns_to_convert = ['Loaded Imports', 'Empty Imports', 'Total Imports', 'Loaded Exports', 'Empty Exports', 'Total Exports', 'Total TEUs']

for col in columns_to_convert:
    as_text = teu_data_los_angeles[col].astype(str)

    # Some source cells contain a stray extra dot (e.g. '889.,748.15'). Keep only the
    # LAST dot as the decimal separator and delete the others.
    # The previous implementation swapped dots for underscores and then computed the
    # replace count on the *original* string (which has no underscores), i.e. -1, so
    # every dot was removed and the value was inflated by a factor of 100.
    as_text = as_text.apply(
        lambda x: x.replace('.', '', x.count('.') - 1) if x.count('.') > 1 else x
    )

    # Strip thousands separators and convert to float.
    teu_data_los_angeles[col] = as_text.str.replace(',', '').astype(float)

# Keep only the 75 most recent months: newest first, renumber, drop everything from row 75 on.
teu_data_los_angeles.sort_values(by='Date', inplace=True, ascending=False)
teu_data_los_angeles.reset_index(drop=True, inplace=True)
teu_data_los_angeles.drop(list(range(75, len(teu_data_los_angeles))), inplace=True)

In [None]:
# Order both port tables chronologically and renumber their rows.
teu_data_los_angeles.sort_values(by='Date', inplace=True)
teu_data_long_beach.sort_values(by='Date', inplace=True)

teu_data_los_angeles.reset_index(drop=True, inplace=True)
teu_data_long_beach.reset_index(drop=True, inplace=True)

# Pair the two ports by month instead of by row position: positional addition silently
# sums different months if either table has a missing, extra or shifted row.
# validate='one_to_one' raises if a month appears twice in either table.
long_beach_columns = ['Date', 'Loaded Inbound', 'Empty Inbound', 'Loaded Outbound', 'Empty Outbound', 'Total']
paired_ports = teu_data_los_angeles.merge(
    teu_data_long_beach[long_beach_columns],
    on='Date',
    how='inner',
    validate='one_to_one',
)

# Los Angeles + Long Beach totals per month (Inbound = imports, Outbound = exports).
# Long Beach import/export totals are rebuilt here as loaded + empty.
combined_teu_data = pd.DataFrame({
    'Date': paired_ports['Date'],
    'Loaded Imports': paired_ports['Loaded Imports'] + paired_ports['Loaded Inbound'],
    'Empty Imports': paired_ports['Empty Imports'] + paired_ports['Empty Inbound'],
    'Total Imports': paired_ports['Total Imports'] + paired_ports['Loaded Inbound'] + paired_ports['Empty Inbound'],
    'Loaded Exports': paired_ports['Loaded Exports'] + paired_ports['Loaded Outbound'],
    'Empty Exports': paired_ports['Empty Exports'] + paired_ports['Empty Outbound'],
    'Total Exports': paired_ports['Total Exports'] + paired_ports['Loaded Outbound'] + paired_ports['Empty Outbound'],
    'Total TEUs': paired_ports['Total TEUs'] + paired_ports['Total']
})

- Revisar si los datos de TEU son en unidades o en miles de unidades u otro
- Inbound = Importaciones, Outbound = Exportaciones
- Asegurarse de que se tienen 75 registros en ambos dataframes y sumarlos
- Más adelante se podrían incluir las coordenadas (en caso de incluir otros hubs portuarios)

Incluir datos de hub_portuario + drug_seizures

In [None]:
# Load every CBP export, keeping only the Los Angeles field office rows.
# A cell-local list is used instead of appending to the shared
# `drug_seizures_dataframes`, so re-running this cell does not keep growing that list.
los_angeles_frames = []
for file in csv_files:
    df = pd.read_csv(file)
    los_angeles_frames.append(df[df['Area of Responsibility'] == 'LOS ANGELES FIELD OFFICE'])

drug_seizures_combined = pd.concat(los_angeles_frames, ignore_index=True)

# Normalise the fiscal-year label BEFORE de-duplicating: the source files overlap, and
# the same record can be labelled e.g. '2024 (FYTD)' in one file and 2024 in another
# (or be read as int in one file and str in another), which would otherwise survive
# drop_duplicates() and be counted twice when aggregated.
drug_seizures_combined['FY'] = (
    drug_seizures_combined['FY'].astype(str).str.replace(' (FYTD)', '', regex=False)
)

# Remove rows repeated across the overlapping files and renumber.
drug_seizures_combined = drug_seizures_combined.drop_duplicates().reset_index(drop=True)

In [None]:
# Discard the descriptive columns that are not used downstream.
unused_seizure_columns = ['FY', 'Month (abbv)', 'Component', 'Region', 'Land Filter', 'Area of Responsibility']
drug_seizures_combined = drug_seizures_combined.drop(columns=unused_seizure_columns)

# Store 'Date' as 'YYYY-MM' text.
seizure_dates = pd.to_datetime(drug_seizures_combined['Date'])
drug_seizures_combined['Date'] = seizure_dates.dt.strftime('%Y-%m')

In [None]:
# Narrow view with only the drug type and the two seizure measures.
subdataframe = drug_seizures_combined[
    ['Drug Type', 'Count of Event', 'Sum Qty (lbs)']
]

In [None]:
# One row per month, one column per (measure, drug type) pair, summing repeated entries.
drug_seizures_wide = drug_seizures_combined.pivot_table(
    index='Date',
    columns='Drug Type',
    values=['Count of Event', 'Sum Qty (lbs)'],
    aggfunc='sum',
)

# Flatten the two-level column index into '<measure>_<drug type>' names.
drug_seizures_wide.columns = [
    '_'.join(level_pair).strip() for level_pair in drug_seizures_wide.columns.values
]
drug_seizures_wide = drug_seizures_wide.reset_index()

# Total number of seizure events across all drug types for each month.
event_count_columns = drug_seizures_wide.filter(like='Count of Event_')
drug_seizures_wide['Sum_of_Counts'] = event_count_columns.sum(axis=1)

In [None]:
def adjust_fiscal_year(date_str):
    """Shift October-December dates back by one year.

    Parameters
    ----------
    date_str : str
        Date in 'YYYY-MM' format.

    Returns
    -------
    str
        The same month in 'YYYY-MM' format, with the year reduced by one when the
        month is 10, 11 or 12; unchanged otherwise.
    """
    parsed = pd.to_datetime(date_str, format='%Y-%m')
    if parsed.month >= 10:
        parsed = parsed - pd.DateOffset(years=1)
    return parsed.strftime('%Y-%m')


# Apply the adjustment to the 'Date' column of drug_seizures_wide, then order chronologically.
drug_seizures_wide['Date'] = drug_seizures_wide['Date'].apply(lambda value: adjust_fiscal_year(value))
drug_seizures_wide = drug_seizures_wide.sort_values(by='Date', ascending=True)
drug_seizures_wide = drug_seizures_wide.reset_index(drop=True)

Combinar datos de cargamentos en los puertos del Área de Los Ángeles con las incautaciones de drogas:

In [None]:
# Keep only the months present in both the container data and the seizure data.
GLA_data = combined_teu_data.merge(drug_seizures_wide, how='inner', on='Date')

Exportar el dataframe creado

In [None]:
# Persist the merged Greater Los Angeles table as comma-separated text without the index.
GLA_data.to_csv(
    '../sources/GreaterLosAngeles_data.csv',
    sep=',',
    index=False,
)

In [None]:
# Sanity check: |(Total Imports + Total Exports) - Total TEUs| should be close to zero.
teu_discrepancy = (
    (GLA_data['Total Imports'] + GLA_data['Total Exports']) - GLA_data['Total TEUs']
).abs()

# Show the five months with the largest discrepancy. The table is built on a temporary
# frame (GLA_data itself is never modified) and is the cell's last expression, so it is
# actually displayed; previously the result was computed mid-cell and discarded.
(
    GLA_data[['Date']]
    .assign(Computed_Value=teu_discrepancy)
    .sort_values(by='Computed_Value', ascending=False)
    .head(5)
)

### Importar datos Gran Los Angeles

In [3]:
# Reload the Greater Los Angeles table previously written to disk.
GLA_data = pd.read_csv(
    '../sources/GreaterLosAngeles_data.csv',
    sep=',',
)

Recuperar el dataframe original, añadir columna 'Los Angeles' y sus coordenadas (es posible considerar incluir la población)

In [4]:
# Port hub reference table; the 'Unnamed: 0' column is discarded.
us_port_hubs = pd.read_csv('../sources/ais_noaa_gov/us_port_hubs.csv').drop(columns=['Unnamed: 0'])

# Independent copy of the Greater Los Angeles data.
df_port_drugs = GLA_data.copy()

In [5]:
# Attach the coordinates stored in row 3 of us_port_hubs to every Greater Los Angeles row.
for target_column, hub_column in (('latitude', 'coord_0'), ('longitude', 'coord_1')):
    GLA_data[target_column] = us_port_hubs.loc[3, hub_column]

### Visualizacion de los datos
- Limpieza de datos.
- Transformación, Normalización de los datos. Integración.
- Missing Values.
- Outliers & Noise Identification.

#### Representacion Grafica de variables y comparacion entre variables:

En GLA_data existía un error de formato en la observación 25, por lo que, tras arreglarlo más abajo, es necesario volver a ejecutar las celdas de transformación:

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Parse 'Date' as datetime so it is picked up neither as float64 nor as object below.
GLA_data['Date'] = pd.to_datetime(GLA_data['Date'])

# Only float64 columns are scaled and only object columns are one-hot encoded;
# columns of any other dtype are not passed to either transformer.
numeric_features = GLA_data.select_dtypes(include=['float64']).columns
categorical_features = GLA_data.select_dtypes(include=['object']).columns

# Numeric branch: missing values become 0, then each column is standardised.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: missing values become 'missing', then one-hot encoding.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Every column except 'Date' is handed to the pipeline.
features_to_transform = [col for col in GLA_data.columns if col != 'Date']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

GLA_data_transformed = pipeline.fit_transform(GLA_data[features_to_transform])

# Rebuild the column labels: numeric names first, then the one-hot names (if any).
transformed_columns = list(numeric_features)
if len(categorical_features) > 0:
    fitted_onehot = (
        pipeline.named_steps['preprocessor']
        .named_transformers_['cat']
        .named_steps['onehot']
    )
    transformed_columns += list(fitted_onehot.get_feature_names_out(categorical_features))

GLA_data_transformed_df = pd.DataFrame(GLA_data_transformed, columns=transformed_columns)

# Put the untouched 'Date' values back as the first column.
GLA_data_transformed_df.insert(0, 'Date', GLA_data['Date'].values)


Se ha corroborado que (Sum Imports + Sum Exports) = TEUs Total se cumple en la mayoría de los casos.
Falla para el índice 25 (noviembre-2020), que es el dato tratado. Puesto que el dataframe resultante es la suma de los datos ofrecidos tanto por el Puerto de Los Ángeles como por el de Long Beach, se va a acudir a las fuentes originales para comprobar si el error se origina allí:
- https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2020:
292,762.25 	423,678.75 	889.,748.15 -> Hay un error de formato en la columna TEUs.
- Para el caso de Long Beach, el formato es correcto.


In [7]:
# Both the origin of the error and the correct value are now known, so 'Total TEUs'
# in row 25 is overwritten with Total Imports + Total Exports of that same row.
row_25 = GLA_data.iloc[25]
GLA_data.loc[25, 'Total TEUs'] = row_25['Total Imports'] + row_25['Total Exports']

In [None]:
# theme_538 is the theme applied below; it was missing from the import list, so the
# cell raised NameError on a fresh kernel.
from plotnine import ggplot, aes, geom_line, labs, theme_minimal, theme_538

# Melt the transformed dataframe to long format: one row per (Date, Variable) pair.
GLA_data_melted = GLA_data_transformed_df.melt(id_vars=['Date'], value_vars=['Loaded Imports', 'Sum_of_Counts', 'Total Exports', 'Total Imports', 'Total TEUs'], var_name='Variable', value_name='Value')

# One line per variable over time.
plot = (ggplot(GLA_data_melted, aes(x='Date', y='Value', color='Variable'))
    + geom_line()
    + labs(title='Time Series of Various Metrics',
           x='Date',
           y='Value')
    + theme_538())

plot

A primera vista, se observa un más que probable outlier en 'Total TEUs'. Se van a realizar distintas visualizaciones para esta variable: boxplot, diagrama scatter...

In [None]:
# theme_538 is used below but was missing from the import list (NameError on a fresh kernel).
from plotnine import ggplot, aes, geom_boxplot, geom_point, geom_histogram, stat_qq, stat_qq_line, labs, theme_minimal, theme_538

# Boxplot of the transformed 'Total TEUs' column; x is a constant so a single box is drawn.
boxplot = (ggplot(GLA_data_transformed_df, aes(x=1, y='Total TEUs'))
           + geom_boxplot()
           + labs(title='Boxplot of Total TEUs', x='', y='Total TEUs')
           + theme_538())

boxplot

In [None]:

# Import everything this cell uses so it does not depend on an earlier cell having run;
# theme_538 in particular was never imported anywhere (NameError).
from plotnine import ggplot, aes, stat_qq, stat_qq_line, labs, theme_538

# QQ plot of the transformed 'Total TEUs' column with its reference line.
qq_plot = (ggplot(GLA_data_transformed_df, aes(sample='Total TEUs'))
           + stat_qq()
           + stat_qq_line()
           + labs(title='QQ Plot of Total TEUs', x='Theoretical Quantiles', y='Sample Quantiles')
           + theme_538())

qq_plot

In [None]:
# Import everything this cell uses so it does not depend on an earlier cell having run;
# theme_538 in particular was never imported anywhere (NameError).
from plotnine import ggplot, aes, geom_histogram, labs, theme_538

# 30-bin histogram of the transformed 'Total TEUs' column.
histogram = (ggplot(GLA_data_transformed_df, aes(x='Total TEUs'))
             + geom_histogram(bins=30, fill='blue', color='black')
             + labs(title='Histogram of Total TEUs', x='Total TEUs', y='Frequency')
             + theme_538())

histogram

Queda claro que ese valor es un outlier. ¿Pero qué hacer? ¿Cómo se ha llegado hasta él? No hay muchos datos y quizá eliminar toda esa fila de datos no fuera necesario. Así que voy a ver si originalmente existía tal dato y cómo se llegó a él.

In [None]:
# Locate the row holding the largest 'Total TEUs' and report its label and month.
total_teus = GLA_data['Total TEUs']
max_teus_index = total_teus.idxmax()
max_teus_date = GLA_data['Date'].loc[max_teus_index]

print(f"Index: {max_teus_index}, Date: {max_teus_date}")

Tras arreglar ese dato, se va a proceder a tratar los outliers en el resto de variables:

In [None]:
# Persist the transformed table as comma-separated text without the index.
GLA_data_transformed_df.to_csv(
    '../sources/GreaterLosAngeles_data_transformed.csv',
    sep=',',
    index=False,
)

### Puertos de Seattle y Tacoma:

In [8]:
# Read the 'TEUs' sheet, using the third spreadsheet row as the header.
nwsa_teu_data = pd.read_excel('../sources/NWSA Historical Cargo Stats.xls', sheet_name='TEUs', header=2)

# Drop every 8th row from 11 to 91, plus everything from row 99 to the end.
# NOTE(review): presumably separator/footer rows of the sheet — confirm against the workbook.
rows_to_drop = [11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91] + list(range(99, len(nwsa_teu_data)))
nwsa_teu_data.drop(rows_to_drop, inplace=True)

# Forward-fill the first column so every row carries its group label.
# Assigning the result back replaces `df[col].fillna(method='ffill', inplace=True)`,
# which operates on an intermediate object (pandas warns it will stop working in 3.0)
# and relies on the deprecated `method=` argument.
nwsa_teu_data['Unnamed: 0'] = nwsa_teu_data['Unnamed: 0'].ffill()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  nwsa_teu_data['Unnamed: 0'].fillna(method='ffill', inplace=True)
  nwsa_teu_data['Unnamed: 0'].fillna(method='ffill', inplace=True)


In [9]:
# Wide (one column per year, 2017-2024) to long: one row per (label, sub-label, year).
nwsa_teu_data_long = nwsa_teu_data.melt(
    id_vars=['Unnamed: 0', 'Unnamed: 1'],
    value_vars=list(range(2017, 2025)),
    var_name='Year',
    value_name='TEUs',
)
nwsa_teu_data_long = nwsa_teu_data_long.rename(
    columns={'Unnamed: 0': 'Category', 'Unnamed: 1': 'Subcategory'}
)

# Discard the rows whose Category is "Total" and renumber.
rows_to_remove = nwsa_teu_data_long.loc[nwsa_teu_data_long["Category"] == "Total"]
nwsa_teu_data_long = nwsa_teu_data_long.drop(rows_to_remove.index).reset_index(drop=True)

# Category is parsed as an abbreviated month name: combine it with Year into 'YYYY-MM' text.
year_month = nwsa_teu_data_long['Year'].astype(str) + '-' + nwsa_teu_data_long['Category']
nwsa_teu_data_long['Date'] = pd.to_datetime(year_month, format='%Y-%b').dt.strftime('%Y-%m')

# The source columns are no longer needed; rows without a Subcategory are dropped too.
nwsa_teu_data_long = (
    nwsa_teu_data_long
    .drop(columns=['Category', 'Year'])
    .dropna(subset=['Subcategory'])
)

# Back to wide: one row per month, one column per Subcategory.
nwsa_teu_data_pivot = (
    nwsa_teu_data_long
    .pivot(index='Date', columns='Subcategory', values='TEUs')
    .reset_index()
)


In [10]:
# Every TEU component except 'International Exports Empty'.
columns_to_sum = ['Domestic - Alaska', 'Domestic - Hawaii', 'International Exports Full', 'International Imports Empty', 'International Imports Full']

# Temporary helper column holding the sum of those components per month.
nwsa_teu_data_pivot['Sum_Other_Columns'] = nwsa_teu_data_pivot[columns_to_sum].sum(axis=1)


def _fill_missing_empty_exports(row):
    """Return the row's 'International Exports Empty', derived as
    'Total TEUs' minus the other components when the value is missing."""
    current_value = row['International Exports Empty']
    if pd.isna(current_value):
        return row['Total TEUs'] - row['Sum_Other_Columns']
    return current_value


nwsa_teu_data_pivot['International Exports Empty'] = nwsa_teu_data_pivot.apply(
    _fill_missing_empty_exports, axis=1
)

# The helper column has served its purpose.
nwsa_teu_data_pivot = nwsa_teu_data_pivot.drop(columns=['Sum_Other_Columns'])

Incluir datos de incautacion para Seattle:

In [11]:
# Read every CBP export into a cell-local list. The shared `drug_seizures_dataframes`
# list is deliberately not extended: earlier cells have already appended to it, so
# appending the same files again on each run only concatenated redundant copies.
seattle_source_frames = [pd.read_csv(file) for file in csv_files]
drug_seizures_combined = pd.concat(seattle_source_frames, ignore_index=True)

# Keep only the Seattle field office.
drug_seizures_combined = drug_seizures_combined.loc[
    drug_seizures_combined['Area of Responsibility'] == 'SEATTLE FIELD OFFICE'
].copy()

# Normalise the fiscal-year label BEFORE de-duplicating: the source files overlap, and
# the same record can be labelled e.g. '2024 (FYTD)' in one file and 2024 in another
# (or be read as int in one file and str in another), which would otherwise survive
# drop_duplicates() and be counted twice when aggregated.
drug_seizures_combined['FY'] = (
    drug_seizures_combined['FY'].astype(str).str.replace(' (FYTD)', '', regex=False)
)
drug_seizures_combined = drug_seizures_combined.drop_duplicates().reset_index(drop=True)

# Build '<FY>-<Mon>' and capitalise a trailing three-letter month (e.g. 'OCT' -> 'Oct').
drug_seizures_combined['Date'] = drug_seizures_combined['FY'] + '-' + drug_seizures_combined['Month (abbv)'].astype(str)
drug_seizures_combined['Date'] = drug_seizures_combined['Date'].str.replace(r'-(\w{3})$', lambda x: '-' + x.group(1).capitalize(), regex=True)

# Parse and store as 'YYYY-MM' text. The year part is still the fiscal-year label here.
drug_seizures_combined['Date'] = pd.to_datetime(drug_seizures_combined['Date'], format='%Y-%b').dt.strftime('%Y-%m')

# Discard the columns that are no longer needed.
drug_seizures_combined = drug_seizures_combined.drop(
    columns=['FY', 'Month (abbv)', 'Component', 'Region', 'Land Filter', 'Area of Responsibility']
)

La sustitución de 'Area of Responsibility' por las coordenadas de latitud y longitud es una suerte de tratamiento de una variable categórica

In [None]:
# Row 25 of us_port_hubs: Seattle/Tacoma, WA. Attach its coordinates to every seizure row.
for target_column, hub_column in (('latitude', 'coord_0'), ('longitude', 'coord_1')):
    drug_seizures_combined[target_column] = us_port_hubs.loc[25, hub_column]

Unnamed: 0,Drug Type,Count of Event,Sum Qty (lbs),Date,latitude,longitude
0,Cocaine,1,0.003527,2019-04,47.449355,-122.428779
1,Ecstasy,2,22.822253,2019-04,47.449355,-122.428779
2,Lsd,1,0.017152,2019-04,47.449355,-122.428779
3,Marijuana,63,2.769778,2019-04,47.449355,-122.428779
4,Methamphetamine,2,0.03814,2019-04,47.449355,-122.428779


In [13]:
# One row per (month, coordinates), one column per (measure, drug type) pair,
# summing repeated entries.
drug_seizures_wide = drug_seizures_combined.pivot_table(
    index=['Date', 'latitude', 'longitude'],
    columns='Drug Type',
    values=['Count of Event', 'Sum Qty (lbs)'],
    aggfunc='sum',
)

# Flatten the two-level column index into '<measure>_<drug type>' names.
drug_seizures_wide.columns = [
    '_'.join(level_pair).strip() for level_pair in drug_seizures_wide.columns.values
]
drug_seizures_wide = drug_seizures_wide.reset_index()

# Total number of seizure events across all drug types for each row.
event_count_columns = drug_seizures_wide.filter(like='Count of Event_')
drug_seizures_wide['Sum_of_Counts'] = event_count_columns.sum(axis=1)


def adjust_fiscal_year(date_str):
    """Shift October-December dates back by one year.

    Takes a 'YYYY-MM' string and returns a 'YYYY-MM' string whose year is reduced
    by one when the month is 10, 11 or 12; other months are returned unchanged.
    """
    parsed = pd.to_datetime(date_str, format='%Y-%m')
    if parsed.month >= 10:
        parsed = parsed - pd.DateOffset(years=1)
    return parsed.strftime('%Y-%m')


# Apply the adjustment to the 'Date' column of drug_seizures_wide, then order chronologically.
drug_seizures_wide['Date'] = drug_seizures_wide['Date'].apply(lambda value: adjust_fiscal_year(value))
drug_seizures_wide = drug_seizures_wide.sort_values(by='Date', ascending=True)
drug_seizures_wide = drug_seizures_wide.reset_index(drop=True)

Modificar dataframe de contenedores de Seattle:

['Date', 'Loaded Imports', 'Empty Imports', 'Total Imports',
       'Loaded Exports', 'Empty Exports', 'Total Exports', 'Total TEUs']

In [None]:
# Align the NWSA column names with the ones used for the other ports.
nwsa_column_names = {
    'International Imports Full': 'Loaded Imports',
    'International Imports Empty': 'Empty Imports',
    'International Exports Full': 'Loaded Exports',
    'International Exports Empty': 'Empty Exports',
}
nwsa_teu_data_pivot = nwsa_teu_data_pivot.rename(columns=nwsa_column_names)

# Totals per direction = loaded + empty.
nwsa_teu_data_pivot['Total Imports'] = nwsa_teu_data_pivot['Loaded Imports'].add(nwsa_teu_data_pivot['Empty Imports'])
nwsa_teu_data_pivot['Total Exports'] = nwsa_teu_data_pivot['Loaded Exports'].add(nwsa_teu_data_pivot['Empty Exports'])

Subcategory,Date,Domestic - Alaska,Domestic - Hawaii,Empty Exports,Loaded Exports,Empty Imports,Loaded Imports,Total TEUs,Total Imports,Total Exports
0,2017-01,42223.0,9030.0,34268.0,77069.5,10304.7,128237.0,301174.2,138541.7,111337.5
1,2017-02,38348.55,7815.0,36038.0,71243.0,7576.5,102697.0,263718.05,110273.5,107281.0
2,2017-03,49332.95,11691.0,39149.0,99603.0,14679.0,120018.0,334472.95,134697.0,138752.0
3,2017-04,46951.0,8828.0,31488.0,77558.0,7126.0,110821.0,282772.0,117947.0,109046.0
4,2017-05,54911.53,9846.0,47445.0,81190.0,14351.0,115837.0,323580.53,130188.0,128635.0


In [15]:
# Keep the months present in both tables, then discard the two domestic columns.
NSWA_data = (
    nwsa_teu_data_pivot
    .merge(drug_seizures_wide, how='inner', on='Date')
    .drop(columns=['Domestic - Alaska', 'Domestic - Hawaii'])
)

Acoplar el dataframe generado para Seattle con el generado para 'Gran Los Angeles'

In [16]:
# Stack the Seattle/Tacoma rows on top of the Greater Los Angeles rows.
combined_data = pd.concat([NSWA_data, GLA_data], ignore_index=True)

# Bring every 'Date' to the same 'YYYY-MM' text representation.
combined_dates = pd.to_datetime(combined_data['Date'], format='%Y-%m')
combined_data['Date'] = combined_dates.dt.strftime('%Y-%m')

# Chronological order with a fresh 0..n-1 index.
combined_data = combined_data.sort_values(by='Date', ascending=True)
combined_data = combined_data.reset_index(drop=True)

In [17]:
# Show the rows for December 2024.
is_december_2024 = combined_data['Date'] == '2024-12'
combined_data.loc[is_december_2024]

Unnamed: 0,Date,Empty Exports,Loaded Exports,Empty Imports,Loaded Imports,Total TEUs,Total Imports,Total Exports,latitude,longitude,...,Sum Qty (lbs)_Ecstasy,Sum Qty (lbs)_Fentanyl,Sum Qty (lbs)_Heroin,Sum Qty (lbs)_Ketamine,Sum Qty (lbs)_Khat (Catha Edulis),Sum Qty (lbs)_Lsd,Sum Qty (lbs)_Marijuana,Sum Qty (lbs)_Methamphetamine,Sum Qty (lbs)_Other Drugs**,Sum_of_Counts
148,2024-12,54572.75,65396.0,8174.0,118356.25,304747.69,126530.25,119968.75,47.449355,-122.428779,...,0.063934,,,0.077382,,0.009039,1275.361355,0.009678,2.842376,103.0
149,2024-12,719025.5,209138.75,13622.0,932564.5,1874349.75,946186.5,928164.25,33.804338,-118.379055,...,16.437887,,,15.19955,54.079393,0.019401,898.238874,48.105306,604.207032,215.0


Incluir puerto de San Diego:

Aparentemente los datos en San Diego son muy pequeños para el tráfico de contenedores.
Fuente: https://www.portofsandiego.org/maritime/cargo-and-trade

### Puerto de Oakland en la SFBA:

Nota: The Port of Oakland handles 99% of the containerized goods moving through Northern California
Fuente: https://vitalsigns.mtc.ca.gov/indicators/seaport-activity

In [18]:
# Load the Port of Oakland TEU workbook (first row is the header).
teu_data_oakland = pd.read_excel('../sources/TEUs_PortOakland_2025.xls', header=0)

# Rows labelled 0-26 are parsed from 'Year' plus a full month name in 'Month';
# from row 27 onwards 'Year' alone is parsed with the '%Y-%m' format.
leading_rows = teu_data_oakland.loc[:26]
leading_dates = pd.to_datetime(
    leading_rows['Year'].astype(str) + '-' + leading_rows['Month'], format='%Y-%B'
)
trailing_dates = pd.to_datetime(teu_data_oakland.loc[27:, 'Year'], format='%Y-%m')
teu_data_oakland['Date'] = pd.concat([
    leading_dates.dt.strftime('%Y-%m'),
    trailing_dates.dt.strftime('%Y-%m'),
]).reset_index(drop=True)

# Drop the columns that are not carried forward and align names with the other ports.
oakland_column_names = {
    'Grand\nTotal': 'Total TEUs',
    'Import\nFull': 'Loaded Imports',
    'Import\nEmpty': 'Empty Imports',
    'Export \nFull': 'Loaded Exports',
    'Export\nEmpty': 'Empty Exports',
}
teu_data_oakland = (
    teu_data_oakland
    .drop(columns=['Year', 'Month', 'Total \nFull', 'Total\nEmpty'])
    .rename(columns=oakland_column_names)
)

# Totals per direction = loaded + empty.
teu_data_oakland['Total Imports'] = teu_data_oakland['Loaded Imports'].add(teu_data_oakland['Empty Imports'])
teu_data_oakland['Total Exports'] = teu_data_oakland['Loaded Exports'].add(teu_data_oakland['Empty Exports'])

Generar set de datos de incautacion (incluir coordenadas)

In [19]:
# Read every CBP export into a cell-local list. The shared `drug_seizures_dataframes`
# list is deliberately not extended: earlier cells have already appended to it, so
# appending the same files again on each run only concatenated redundant copies.
san_francisco_source_frames = [pd.read_csv(file) for file in csv_files]
drug_seizures_combined = pd.concat(san_francisco_source_frames, ignore_index=True)

# Keep only the San Francisco field office.
drug_seizures_combined = drug_seizures_combined.loc[
    drug_seizures_combined['Area of Responsibility'] == 'SAN FRANCISCO FIELD OFFICE'
].copy()

# Normalise the fiscal-year label BEFORE de-duplicating: the source files overlap, and
# the same record can be labelled e.g. '2024 (FYTD)' in one file and 2024 in another
# (or be read as int in one file and str in another), which would otherwise survive
# drop_duplicates() and be counted twice when aggregated.
drug_seizures_combined['FY'] = (
    drug_seizures_combined['FY'].astype(str).str.replace(' (FYTD)', '', regex=False)
)
drug_seizures_combined = drug_seizures_combined.drop_duplicates().reset_index(drop=True)

# Build '<FY>-<Mon>' and capitalise a trailing three-letter month (e.g. 'OCT' -> 'Oct').
drug_seizures_combined['Date'] = drug_seizures_combined['FY'] + '-' + drug_seizures_combined['Month (abbv)'].astype(str)
drug_seizures_combined['Date'] = drug_seizures_combined['Date'].str.replace(r'-(\w{3})$', lambda x: '-' + x.group(1).capitalize(), regex=True)

# Parse and store as 'YYYY-MM' text. The year part is still the fiscal-year label here.
drug_seizures_combined['Date'] = pd.to_datetime(drug_seizures_combined['Date'], format='%Y-%b').dt.strftime('%Y-%m')

# Discard the columns that are no longer needed.
drug_seizures_combined = drug_seizures_combined.drop(
    columns=['FY', 'Month (abbv)', 'Component', 'Region', 'Land Filter', 'Area of Responsibility']
)

In [20]:
# Row 13 of us_port_hubs: SFBA, CA. Attach its coordinates to every seizure row.
for target_column, hub_column in (('latitude', 'coord_0'), ('longitude', 'coord_1')):
    drug_seizures_combined[target_column] = us_port_hubs.loc[13, hub_column]

In [21]:
# One row per (month, coordinates), one column per (measure, drug type) pair,
# summing repeated entries.
drug_seizures_wide = drug_seizures_combined.pivot_table(
    index=['Date', 'latitude', 'longitude'],
    columns='Drug Type',
    values=['Count of Event', 'Sum Qty (lbs)'],
    aggfunc='sum',
)

# Flatten the two-level column index into '<measure>_<drug type>' names.
drug_seizures_wide.columns = [
    '_'.join(level_pair).strip() for level_pair in drug_seizures_wide.columns.values
]
drug_seizures_wide = drug_seizures_wide.reset_index()

# Total number of seizure events across all drug types for each row.
event_count_columns = drug_seizures_wide.filter(like='Count of Event_')
drug_seizures_wide['Sum_of_Counts'] = event_count_columns.sum(axis=1)


def adjust_fiscal_year(date_str):
    """Shift October-December dates back by one year.

    Takes a 'YYYY-MM' string and returns a 'YYYY-MM' string whose year is reduced
    by one when the month is 10, 11 or 12; other months are returned unchanged.
    """
    parsed = pd.to_datetime(date_str, format='%Y-%m')
    if parsed.month >= 10:
        parsed = parsed - pd.DateOffset(years=1)
    return parsed.strftime('%Y-%m')


# Apply the adjustment to the 'Date' column of drug_seizures_wide, then order chronologically.
drug_seizures_wide['Date'] = drug_seizures_wide['Date'].apply(lambda value: adjust_fiscal_year(value))
drug_seizures_wide = drug_seizures_wide.sort_values(by='Date', ascending=True)
drug_seizures_wide = drug_seizures_wide.reset_index(drop=True)

Join de los datos de contenedores con incautaciones:

In [22]:
# Keep only the months present in both the Oakland container data and the seizure data.
SFBA_data = teu_data_oakland.merge(drug_seizures_wide, how='inner', on='Date')

Acoplar datos de 'San Francisco Bay Area'

In [23]:
# Append the San Francisco Bay Area rows to the ports collected so far, then restore
# chronological order with a fresh 0..n-1 index.
combined_data = (
    pd.concat([combined_data, SFBA_data], ignore_index=True)
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)

### Puerto de Newark/New York:
Fuente: https://www.panynj.gov/port/en/our-port/facts-and-figures.html

In [24]:
# Read the Excel file and create a dataframe
teu_data_newark = pd.read_excel('../sources/TEUs_PortNewark_2025.xls', header=0, thousands=',')

In [25]:
# The two unnamed leading columns are combined as '<year>-<full month name>' and
# stored as 'YYYY-MM' text.
newark_year_month = teu_data_newark['Unnamed: 0'].astype(str) + '-' + teu_data_newark['Unnamed: 1']
teu_data_newark['Date'] = pd.to_datetime(newark_year_month, format='%Y-%B').dt.strftime('%Y-%m')

# Drop the source columns and align names with the other ports.
newark_column_names = {
    'Loads Import TEUs': 'Loaded Imports',
    'Empties Import TEUs': 'Empty Imports',
    'Loads Export TEUs': 'Loaded Exports',
    'Empties Export TEUs': 'Empty Exports',
}
teu_data_newark = (
    teu_data_newark
    .drop(columns=['Unnamed: 0', 'Unnamed: 1'])
    .rename(columns=newark_column_names)
)

# Totals per direction = loaded + empty.
teu_data_newark['Total Imports'] = teu_data_newark['Loaded Imports'].add(teu_data_newark['Empty Imports'])
teu_data_newark['Total Exports'] = teu_data_newark['Loaded Exports'].add(teu_data_newark['Empty Exports'])

Generar dataset de incautacion de drogas + coordenadas

In [26]:
# Read every CBP export into a cell-local list. The shared `drug_seizures_dataframes`
# list is deliberately not extended: earlier cells have already appended to it, so
# appending the same files again on each run only concatenated redundant copies.
new_york_source_frames = [pd.read_csv(file) for file in csv_files]
drug_seizures_combined = pd.concat(new_york_source_frames, ignore_index=True)

# Keep only the New York field office.
drug_seizures_combined = drug_seizures_combined.loc[
    drug_seizures_combined['Area of Responsibility'] == 'NEW YORK FIELD OFFICE'
].copy()

# Normalise the fiscal-year label BEFORE de-duplicating: the source files overlap, and
# the same record can be labelled e.g. '2024 (FYTD)' in one file and 2024 in another
# (or be read as int in one file and str in another), which would otherwise survive
# drop_duplicates() and be counted twice when aggregated.
drug_seizures_combined['FY'] = (
    drug_seizures_combined['FY'].astype(str).str.replace(' (FYTD)', '', regex=False)
)
drug_seizures_combined = drug_seizures_combined.drop_duplicates().reset_index(drop=True)

# Build '<FY>-<Mon>' and capitalise a trailing three-letter month (e.g. 'OCT' -> 'Oct').
drug_seizures_combined['Date'] = drug_seizures_combined['FY'] + '-' + drug_seizures_combined['Month (abbv)'].astype(str)
drug_seizures_combined['Date'] = drug_seizures_combined['Date'].str.replace(r'-(\w{3})$', lambda x: '-' + x.group(1).capitalize(), regex=True)

# Parse and store as 'YYYY-MM' text. The year part is still the fiscal-year label here.
drug_seizures_combined['Date'] = pd.to_datetime(drug_seizures_combined['Date'], format='%Y-%b').dt.strftime('%Y-%m')

# Discard the columns that are no longer needed.
drug_seizures_combined = drug_seizures_combined.drop(
    columns=['FY', 'Month (abbv)', 'Component', 'Region', 'Land Filter', 'Area of Responsibility']
)

In [27]:
# Row 6 of us_port_hubs: NJ/NY. Attach its coordinates to every seizure row.
for target_column, hub_column in (('latitude', 'coord_0'), ('longitude', 'coord_1')):
    drug_seizures_combined[target_column] = us_port_hubs.loc[6, hub_column]

In [28]:
# subdataframe = drug_seizures_combined[['Drug Type', 'Count of Event', 'Sum Qty (lbs)']]
drug_seizures_wide = pd.pivot_table(drug_seizures_combined, index=['Date','latitude','longitude'], columns='Drug Type', values=['Count of Event', 'Sum Qty (lbs)'], aggfunc='sum')
drug_seizures_wide.columns = ['_'.join(col).strip() for col in drug_seizures_wide.columns.values]
drug_seizures_wide.reset_index(inplace=True)
drug_seizures_wide['Sum_of_Counts'] = drug_seizures_wide.filter(like='Count of Event_').sum(axis=1)
# October, November and December are moved back one year; every other month is
# left untouched (presumably because the year in these stamps is a fiscal-year
# label whose first quarter falls in the previous calendar year).
def adjust_fiscal_year(date_str):
    """Return `date_str` with its year reduced by one when the month is 10, 11 or 12.

    Parameters
    ----------
    date_str : str
        Month stamp in '%Y-%m' format.

    Returns
    -------
    str
        The (possibly shifted) month stamp, again formatted as '%Y-%m'.
    """
    stamp = pd.to_datetime(date_str, format='%Y-%m')
    if stamp.month < 10:
        return stamp.strftime('%Y-%m')
    return (stamp - pd.DateOffset(years=1)).strftime('%Y-%m')

# Shift the Oct-Dec stamps of drug_seizures_wide['Date'], then order the rows
# chronologically with a fresh 0..n-1 index
drug_seizures_wide['Date'] = drug_seizures_wide['Date'].map(adjust_fiscal_year)
drug_seizures_wide = (
    drug_seizures_wide
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)

Hacer join entre contenedores e incautaciones en Newark:

In [29]:
# Keep only the months present in both the Newark TEU table and the seizure table
NJNY_data = teu_data_newark.merge(drug_seizures_wide, how='inner', on='Date')

Acoplar los datos de Newark al resto del dataframe:

In [30]:
# Append the NJ/NY rows to the multi-port frame, order everything by 'Date'
# (ascending) and rebuild a clean 0..n-1 index.
# NOTE(review): re-running this cell appends NJNY_data again.
combined_data = (
    pd.concat([combined_data, NJNY_data], ignore_index=True)
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)

### Puerto de Houston:
- En primer lugar se aplicarán los datos del puerto de Houston.
- Otros: Beaumont, Texas City, Port Freeport, Port Arthur, Galveston

In [31]:
# Load the 'Port_Houston' sheet of the container-volume workbook; its first row
# supplies the column names
houston_teu_data = pd.read_excel(
    '../sources/Container-Volume-TEU-stats-in-depth-January-2025.xls',
    header=0,
    sheet_name='Port_Houston',
)

In [32]:
# Drop the loaded/empty sub-totals and rename the overall total to 'Total TEUs'
houston_teu_data = (
    houston_teu_data
    .drop(columns=['Loaded Total', 'Empty Total'])
    .rename(columns={'Loaded and Empty\nTotal': 'Total TEUs'})
)
# Per-direction totals = loaded + empty
houston_teu_data['Total Imports'] = houston_teu_data['Loaded Imports'].add(houston_teu_data['Empty Imports'])
houston_teu_data['Total Exports'] = houston_teu_data['Loaded Exports'].add(houston_teu_data['Empty Exports'])
# Normalise 'Date' to '%Y-%m' strings.
# NOTE(review): the separator inside the format string is not the ASCII '-'
# (it looks like U+2010) - confirm it matches the 'Date' values in the sheet.
houston_teu_data['Date'] = (
    pd.to_datetime(houston_teu_data['Date'], format='%b‐%y')
    .dt.strftime('%Y-%m')
)

Generar dataset de incautación de drogas + coordenadas

In [33]:
# Read every CBP extract into a FRESH list. Appending to the shared accumulator
# made it grow with another copy of the same files each time a port section
# (or this cell) was run; rebuilding it keeps the cell idempotent.
drug_seizures_dataframes = [pd.read_csv(file) for file in csv_files]

# Concatenate all extracts into a single dataframe
drug_seizures_combined = pd.concat(drug_seizures_dataframes, ignore_index=True)

# Normalise the fiscal-year label BEFORE de-duplicating. The extracts overlap in
# time, so the same row may appear as e.g. '2022 (FYTD)' in one file and '2022'
# in another; stripping the suffix afterwards would let both copies survive
# drop_duplicates() and be summed twice by the pivot in the next cells.
drug_seizures_combined['FY'] = (
    drug_seizures_combined['FY']
    .astype(str)
    .str.replace(' (FYTD)', '', regex=False)
)

# Keep one copy of each row, then only the Houston field office
drug_seizures_combined = (
    drug_seizures_combined
    .drop_duplicates()
    .loc[lambda frame: frame['Area of Responsibility'] == 'HOUSTON FIELD OFFICE']
    .reset_index(drop=True)
)

# Build '<FY>-<Mon>' (month abbreviation capitalised, e.g. 'OCT' -> 'Oct') and
# convert it to '%Y-%m'. The year is still the fiscal-year label at this point.
drug_seizures_combined['Date'] = pd.to_datetime(
    drug_seizures_combined['FY'] + '-' + drug_seizures_combined['Month (abbv)'].astype(str).str.capitalize(),
    format='%Y-%b',
).dt.strftime('%Y-%m')

# Drop the columns that are no longer needed once 'Date' exists and the filter is applied
drug_seizures_combined = drug_seizures_combined.drop(
    columns=['FY', 'Month (abbv)', 'Component', 'Region', 'Land Filter', 'Area of Responsibility']
)

In [34]:
# Row 2 of us_port_hubs is Houston/Galveston, TX: copy its coordinates onto every seizure row
for coord_column, hub_column in (('latitude', 'coord_0'), ('longitude', 'coord_1')):
    drug_seizures_combined[coord_column] = us_port_hubs.loc[2, hub_column]

In [35]:
# Long -> wide: one row per (Date, latitude, longitude) and one summed column
# per (metric, drug type) pair.
drug_seizures_wide = drug_seizures_combined.pivot_table(
    index=['Date', 'latitude', 'longitude'],
    columns='Drug Type',
    values=['Count of Event', 'Sum Qty (lbs)'],
    aggfunc='sum',
)
# Flatten the two-level column index into '<metric>_<drug type>' names
drug_seizures_wide.columns = ['_'.join(pair).strip() for pair in drug_seizures_wide.columns.to_flat_index()]
drug_seizures_wide = drug_seizures_wide.reset_index()
# Row-wise total of every 'Count of Event_*' column
drug_seizures_wide['Sum_of_Counts'] = drug_seizures_wide.loc[
    :, drug_seizures_wide.columns.str.contains('Count of Event_', regex=False)
].sum(axis=1)
# October, November and December are moved back one year; every other month is
# left untouched (presumably because the year in these stamps is a fiscal-year
# label whose first quarter falls in the previous calendar year).
# NOTE(review): this helper is defined more than once in the notebook.
def adjust_fiscal_year(date_str):
    """Return `date_str` with its year reduced by one when the month is 10, 11 or 12.

    Parameters
    ----------
    date_str : str
        Month stamp in '%Y-%m' format.

    Returns
    -------
    str
        The (possibly shifted) month stamp, again formatted as '%Y-%m'.
    """
    stamp = pd.to_datetime(date_str, format='%Y-%m')
    if stamp.month < 10:
        return stamp.strftime('%Y-%m')
    return (stamp - pd.DateOffset(years=1)).strftime('%Y-%m')

# Shift the Oct-Dec stamps of drug_seizures_wide['Date'], then order the rows
# chronologically with a fresh 0..n-1 index
drug_seizures_wide['Date'] = drug_seizures_wide['Date'].map(adjust_fiscal_year)
drug_seizures_wide = (
    drug_seizures_wide
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)

Hacer join entre contenedores e incautaciones en Houston:

In [36]:
# Keep only the months present in both the Houston TEU table and the seizure table
Houston_data = houston_teu_data.merge(drug_seizures_wide, how='inner', on='Date')

Acoplar los datos de Houston al resto del dataframe:

In [37]:
# Append the Houston rows to the multi-port frame, order everything by 'Date'
# (ascending) and rebuild a clean 0..n-1 index.
# NOTE(review): re-running this cell appends Houston_data again.
combined_data = (
    pd.concat([combined_data, Houston_data], ignore_index=True)
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)

### Puerto de Miami/Fort Lauderdale:


Para construir el dataset:
- https://www.porteverglades.net/
- https://assets.simpleviewinc.com/simpleview/image/upload/v1/clients/porteverglades/January_Monthly_Loaded_TEUs_02_13_2025_9e6f70c1-032a-4bbc-8f69-2d027dcfa394.pdf
- https://assets.simpleviewinc.com/simpleview/image/upload/v1/clients/porteverglades/FY2023_September_TEUS_Loaded_by_Month_7065b619-cddd-4b97-a02c-8c247c743ac0.pdf
- https://assets.simpleviewinc.com/simpleview/image/upload/v1/clients/porteverglades/August_TEUS_Loaded_by_Month_Fiscal_2020_Calander_2020_9439e3f7-7154-4030-a9b8-9ec592a57fde.pdf

- https://assets.simpleviewinc.com/simpleview/image/upload/v1/clients/porteverglades/Preliminary_Waterborne_Commerce_Chart_2024__1a6b16e6-94b2-46ac-89e7-d7080779f346.pdf


Datos de Port Everglades, FL (Puerto de Fort Lauderdale)

In [38]:
# Load the Port Everglades TEU workbook; its first row supplies the column names
teu_data_everglades = pd.read_excel(
    '../sources/TEUs_PortEverglades_2025.xls',
    header=0,
)

In [None]:
# Show every row of teu_data_everglades that has at least one missing value
teu_data_everglades.loc[teu_data_everglades.isna().any(axis='columns')]

Se observa que faltan los últimos meses del año fiscal 2021. Sin embargo, se tienen los datos totales para cada año fiscal. Estos missing values se van a tratar de inferir a partir del contexto.

In [39]:
# Read columns A:G of the waterborne-commerce chart, taking the second sheet row as header
file_path = '../sources/Preliminary_Waterborne_Commerce_Chart_2024__1a6b16e6-94b2-46ac-89e7-d7080779f346.xls'
df = pd.read_excel(file_path, usecols='A:G', header=1)

# Positions 20-21 of the frame (presumably spreadsheet rows 23-24, given the
# header offset), transposed so that each original column becomes a row
annual_block = df.iloc[20:22].transpose()

# The first transposed row holds the measure names: promote it to the header,
# drop it from the data, and reverse the order of the remaining rows
annual_block.columns = annual_block.iloc[0]
extracted_rows = annual_block[1:][::-1]

Para asegurarse de que los datos anuales son, prácticamente, coincidentes con la suma de los mensuales, se va a proceder a su cálculo

In [40]:
# Sum loaded exports/imports over consecutive blocks of 12 rows
# (index // 12 assumes the default 0..n-1 integer index)
grouped_sums = (
    teu_data_everglades[['Loaded Exports', 'Loaded Imports']]
    .groupby(teu_data_everglades.index // 12)
    .sum()
)
grouped_sums['TEUs Loaded'] = grouped_sums['Loaded Exports'].add(grouped_sums['Loaded Imports'])

In [41]:
# Put the first six 12-row sums next to the 'TEUs Loaded' figures from
# extracted_rows (paired by position), then compute their difference
comparison_df = pd.DataFrame({
    'Grouped TEUs Loaded': grouped_sums['TEUs Loaded'].to_numpy()[:6],
    'Extracted TEUs Loaded': extracted_rows['TEUs Loaded'].astype(float).to_numpy(),
})
comparison_df['Difference'] = comparison_df['Grouped TEUs Loaded'].sub(comparison_df['Extracted TEUs Loaded'])

Comparar los datos mensuales con los datos anuales:

Para el año 2021, faltan 3 meses (NAs) pero se tienen los datos de 'Total Loaded' y 'Total TEUs'

In [42]:
# Write rows 24-35 of teu_data_everglades to disk and read them back as an
# independent frame with a fresh 0..11 index.
# NOTE(review): the file name says '2023_FY' while the variable says 2021 - confirm.
fy_rows = teu_data_everglades.iloc[24:36]
fy_rows.to_csv('../sources/TEUs_PortEverglades_2023_FY.csv', sep=',', index=False)
everglades_2021 = pd.read_csv('../sources/TEUs_PortEverglades_2023_FY.csv', sep=',')

In [43]:
# Share of the reported loaded TEUs that are imports (NaNs skipped); exports take the remainder
loaded_imports_known = everglades_2021['Loaded Imports'].sum(skipna=True)
loaded_exports_known = everglades_2021['Loaded Exports'].sum(skipna=True)
prop_imports = loaded_imports_known / (loaded_imports_known + loaded_exports_known)
prop_exports = 1 - prop_imports

# Loaded TEUs to distribute over the rows with no value.
# NOTE(review): hard-coded figure; its derivation is not shown in this notebook.
missing_total = 186169

# Split the total by the import/export share and spread each part evenly over
# the rows where that column is missing
loaded_imports_gaps = everglades_2021['Loaded Imports'].isna()
loaded_exports_gaps = everglades_2021['Loaded Exports'].isna()
impute_value_imports = round((missing_total * prop_imports) / loaded_imports_gaps.sum())
impute_value_exports = round((missing_total * prop_exports) / loaded_exports_gaps.sum())

# Fill the gaps
everglades_2021.loc[loaded_imports_gaps, 'Loaded Imports'] = impute_value_imports
everglades_2021.loc[loaded_exports_gaps, 'Loaded Exports'] = impute_value_exports

Imputar ahora el resto de valores:

In [44]:
# Empty TEUs per row of extracted_rows = total minus loaded
extracted_rows['TEUs Empty'] = extracted_rows['TEUs Total'].sub(extracted_rows['TEUs Loaded'])

# Empty TEUs already reported in everglades_2021 (missing values skipped)
emptys_2021 = everglades_2021['Empty Imports'].sum() + everglades_2021['Empty Exports'].sum()

# Cell output: empty TEUs of the 2021 row not covered by the reported values
extracted_rows.loc[2021, 'TEUs Empty'] - emptys_2021

76191.0

In [45]:
# Share of the reported empty TEUs that are imports (NaNs skipped); exports take the remainder
empty_imports_known = everglades_2021['Empty Imports'].sum(skipna=True)
empty_exports_known = everglades_2021['Empty Exports'].sum(skipna=True)
prop_imports = empty_imports_known / (empty_imports_known + empty_exports_known)
prop_exports = 1 - prop_imports

# Empty TEUs still unaccounted for: the 2021 figure in extracted_rows minus what is already reported
missing_total = extracted_rows.loc[2021, 'TEUs Empty'] - emptys_2021

# Split the gap by the import/export share and spread each part evenly over
# the rows where that column is missing
empty_imports_gaps = everglades_2021['Empty Imports'].isna()
empty_exports_gaps = everglades_2021['Empty Exports'].isna()
impute_value_imports = round((missing_total * prop_imports) / empty_imports_gaps.sum())
impute_value_exports = round((missing_total * prop_exports) / empty_exports_gaps.sum())

# Fill the gaps
everglades_2021.loc[empty_imports_gaps, 'Empty Imports'] = impute_value_imports
everglades_2021.loc[empty_exports_gaps, 'Empty Exports'] = impute_value_exports

# Rebuild the derived totals for rows 9 to 11. Order matters: 'Total TEUs' is
# computed from the two totals written just before it.
for target, first_term, second_term in (
    ('Total Imports', 'Empty Imports', 'Loaded Imports'),
    ('Total Exports', 'Empty Exports', 'Loaded Exports'),
    ('Total TEUs', 'Total Imports', 'Total Exports'),
):
    everglades_2021.loc[9:11, target] = everglades_2021.loc[9:11, first_term] + everglades_2021.loc[9:11, second_term]

# Write the completed rows back into teu_data_everglades: everglades_2021 was cut
# from rows 24-35, so its rows 9-11 correspond to rows 33-35 there
teu_data_everglades[33:36] = everglades_2021[9:12]

Una vez cubiertos los missing values toca revisar que las columnas coinciden con lo esperado:

In [46]:
# The first column came in without a header ('Unnamed: 0'); call it 'Date',
# which is the key used for the merge with the seizure data
teu_data_everglades = teu_data_everglades.rename(columns={'Unnamed: 0': 'Date'})

Generar dataset de incautación de drogas + coordenadas para el puerto de Fort Lauderdale/Everglades:

In [47]:
# Read every CBP extract into a FRESH list. Appending to the shared accumulator
# made it grow with another copy of the same files each time a port section
# (or this cell) was run; rebuilding it keeps the cell idempotent.
drug_seizures_dataframes = [pd.read_csv(file) for file in csv_files]

# Concatenate all extracts into a single dataframe
drug_seizures_combined = pd.concat(drug_seizures_dataframes, ignore_index=True)

# Normalise the fiscal-year label BEFORE de-duplicating. The extracts overlap in
# time, so the same row may appear as e.g. '2022 (FYTD)' in one file and '2022'
# in another; stripping the suffix afterwards would let both copies survive
# drop_duplicates() and be summed twice by the pivot in the next cells.
drug_seizures_combined['FY'] = (
    drug_seizures_combined['FY']
    .astype(str)
    .str.replace(' (FYTD)', '', regex=False)
)

# Keep one copy of each row, then only the Miami field office
drug_seizures_combined = (
    drug_seizures_combined
    .drop_duplicates()
    .loc[lambda frame: frame['Area of Responsibility'] == 'MIAMI FIELD OFFICE']
    .reset_index(drop=True)
)

# Build '<FY>-<Mon>' (month abbreviation capitalised, e.g. 'OCT' -> 'Oct') and
# convert it to '%Y-%m'. The year is still the fiscal-year label at this point.
drug_seizures_combined['Date'] = pd.to_datetime(
    drug_seizures_combined['FY'] + '-' + drug_seizures_combined['Month (abbv)'].astype(str).str.capitalize(),
    format='%Y-%b',
).dt.strftime('%Y-%m')

# Drop the columns that are no longer needed once 'Date' exists and the filter is applied
drug_seizures_combined = drug_seizures_combined.drop(
    columns=['FY', 'Month (abbv)', 'Component', 'Region', 'Land Filter', 'Area of Responsibility']
)

In [48]:
# Row 9 of us_port_hubs is Miami-Fort Lauderdale, FL: copy its coordinates onto every seizure row
for coord_column, hub_column in (('latitude', 'coord_0'), ('longitude', 'coord_1')):
    drug_seizures_combined[coord_column] = us_port_hubs.loc[9, hub_column]

In [49]:
# Long -> wide: one row per (Date, latitude, longitude) and one summed column
# per (metric, drug type) pair.
drug_seizures_wide = drug_seizures_combined.pivot_table(
    index=['Date', 'latitude', 'longitude'],
    columns='Drug Type',
    values=['Count of Event', 'Sum Qty (lbs)'],
    aggfunc='sum',
)
# Flatten the two-level column index into '<metric>_<drug type>' names
drug_seizures_wide.columns = ['_'.join(pair).strip() for pair in drug_seizures_wide.columns.to_flat_index()]
drug_seizures_wide = drug_seizures_wide.reset_index()
# Row-wise total of every 'Count of Event_*' column
drug_seizures_wide['Sum_of_Counts'] = drug_seizures_wide.loc[
    :, drug_seizures_wide.columns.str.contains('Count of Event_', regex=False)
].sum(axis=1)
# October, November and December are moved back one year; every other month is
# left untouched (presumably because the year in these stamps is a fiscal-year
# label whose first quarter falls in the previous calendar year).
# NOTE(review): this helper is defined more than once in the notebook.
def adjust_fiscal_year(date_str):
    """Return `date_str` with its year reduced by one when the month is 10, 11 or 12.

    Parameters
    ----------
    date_str : str
        Month stamp in '%Y-%m' format.

    Returns
    -------
    str
        The (possibly shifted) month stamp, again formatted as '%Y-%m'.
    """
    stamp = pd.to_datetime(date_str, format='%Y-%m')
    if stamp.month < 10:
        return stamp.strftime('%Y-%m')
    return (stamp - pd.DateOffset(years=1)).strftime('%Y-%m')

# Shift the Oct-Dec stamps of drug_seizures_wide['Date'], then order the rows
# chronologically with a fresh 0..n-1 index
drug_seizures_wide['Date'] = drug_seizures_wide['Date'].map(adjust_fiscal_year)
drug_seizures_wide = (
    drug_seizures_wide
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)

Hacer join entre contenedores e incautaciones en Miami/Fort Lauderdale (Port Everglades):

In [50]:
# Keep only the months present in both the Port Everglades TEU table and the seizure table
Miami_data = teu_data_everglades.merge(drug_seizures_wide, how='inner', on='Date')

In [51]:
# Append the Miami rows to the multi-port frame, order everything by 'Date'
# (ascending) and rebuild a clean 0..n-1 index.
# NOTE(review): re-running this cell appends Miami_data again.
combined_data = (
    pd.concat([combined_data, Miami_data], ignore_index=True)
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)

### Resumen y otros puertos posibles
- San Diego
- Nueva Orleans
- Tampa *
- Savannah **
- Boston *
- Chicago
- Corpus Christi 

In [53]:
# Persist the multi-port dataset as a comma-separated file, without the row index
combined_data.to_csv(
    '../sources/six_ports_USA_drugs.csv',
    sep=',',
    index=False,
)

¿Incluir la variación de población en Áreas Metropolitanas?

### Visualizaciones:
- Comparar series temporales
- Comparar volumen según posición geográfica
- Una serie temporal por el target
- Comparar tipos de target
- Análisis de variables por puerto

In [55]:
# List the column names of the final multi-port dataset
combined_data.columns

Index(['Date', 'Empty Exports', 'Loaded Exports', 'Empty Imports',
       'Loaded Imports', 'Total TEUs', 'Total Imports', 'Total Exports',
       'latitude', 'longitude', 'Count of Event_Cocaine',
       'Count of Event_Ecstasy', 'Count of Event_Fentanyl',
       'Count of Event_Heroin', 'Count of Event_Ketamine',
       'Count of Event_Khat (Catha Edulis)', 'Count of Event_Lsd',
       'Count of Event_Marijuana', 'Count of Event_Methamphetamine',
       'Count of Event_Other Drugs**', 'Sum Qty (lbs)_Cocaine',
       'Sum Qty (lbs)_Ecstasy', 'Sum Qty (lbs)_Fentanyl',
       'Sum Qty (lbs)_Heroin', 'Sum Qty (lbs)_Ketamine',
       'Sum Qty (lbs)_Khat (Catha Edulis)', 'Sum Qty (lbs)_Lsd',
       'Sum Qty (lbs)_Marijuana', 'Sum Qty (lbs)_Methamphetamine',
       'Sum Qty (lbs)_Other Drugs**', 'Sum_of_Counts'],
      dtype='object')