# ***Dependencias***





In [1]:
import sys
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the project directory path as the laboratory_2 folder
project_dir = '/content/drive/MyDrive/clasificacion_2_niveles/laboratory_2'

# Add the *parent* directory of laboratory_2 to sys.path
# This might still be necessary if other parts of your project rely on imports
# from the higher level directory.
parent_project_dir = '/content/drive/MyDrive/clasificacion_2_niveles'
if parent_project_dir not in sys.path:
    sys.path.append(parent_project_dir)
    print(f"Added {parent_project_dir} to sys.path")


# Change the current directory to the new project directory
# This is crucial for relative file paths within your laboratory_2 folder
if os.path.exists(project_dir):
    os.chdir(project_dir)
    print(f"Changed current directory to: {os.getcwd()}")
else:
    print(f"Project directory not found at: {project_dir}. Cannot change directory.")


# Add the project directory (laboratory_2) to sys.path
# This allows importing modules directly from within laboratory_2
if project_dir not in sys.path:
    sys.path.append(project_dir)
    print(f"Added {project_dir} to sys.path")


# Optional: Verify the updated sys.path and current working directory
print("\nUpdated sys.path:")
for p in sys.path:
    print(p)

print("\nCurrent working directory:")
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Added /content/drive/MyDrive/clasificacion_2_niveles to sys.path
Changed current directory to: /content/drive/MyDrive/clasificacion_2_niveles/laboratory_2
Added /content/drive/MyDrive/clasificacion_2_niveles/laboratory_2 to sys.path

Updated sys.path:
/content
/env/python
/usr/lib/python311.zip
/usr/lib/python3.11
/usr/lib/python3.11/lib-dynload

/usr/local/lib/python3.11/dist-packages
/usr/lib/python3/dist-packages
/usr/local/lib/python3.11/dist-packages/IPython/extensions
/usr/local/lib/python3.11/dist-packages/setuptools/_vendor
/root/.ipython
/content/drive/MyDrive/clasificacion_2_niveles
/content/drive/MyDrive/clasificacion_2_niveles/laboratory_2

Current working directory:
/content/drive/MyDrive/clasificacion_2_niveles/laboratory_2


In [2]:
import torch
import torch.nn as nn
from transformers import BertModel

class HierarchicalBertClassifierWithConstraint(nn.Module):
    def __init__(self, bert_model, num_labels_per_level, hierarchy_map_encoded, class_weights): # Accept class_weights
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(bert_model.config.hidden_dropout_prob)

        # Define classification heads for each level
        self.classifier_categoria = nn.Linear(bert_model.config.hidden_size, num_labels_per_level['nombre_categoria_encoded'])
        self.classifier_familia = nn.Linear(
            bert_model.config.hidden_size + num_labels_per_level['nombre_categoria_encoded'],
            num_labels_per_level['familia_encoded']
        )
        # Removed classifier_linea and classifier_subfamilia

        self.hierarchy_map_encoded = hierarchy_map_encoded
        self.class_weights = class_weights # Store class_weights

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True
        )

        pooled_output = outputs.hidden_states[-1][:, 0, :]
        pooled_output = self.dropout(pooled_output)

        # --- Hierarchical Classification ---

        logits_categoria = self.classifier_categoria(pooled_output)

        input_to_familia = torch.cat((pooled_output, logits_categoria.detach()), dim=-1)
        logits_familia = self.classifier_familia(input_to_familia)

        # Removed logits_linea and logits_subfamilia calculations


        logits = {
            'nombre_categoria_encoded': logits_categoria,
            'familia_encoded': logits_familia,
            # Removed linea_encoded and subfamilia_encoded from logits
        }

        loss = None
        if labels is not None:
            # Use class weights for CrossEntropyLoss for each level
            loss_fct_categoria = nn.CrossEntropyLoss(weight=self.class_weights['nombre_categoria_encoded'])
            loss_fct_familia = nn.CrossEntropyLoss(weight=self.class_weights['familia_encoded'])
            # Removed loss_fct_linea and loss_fct_subfamilia

            loss_categoria = loss_fct_categoria(logits_categoria, labels[:, 0])
            loss_familia = loss_fct_familia(logits_familia, labels[:, 1])
            # Removed loss_linea and loss_subfamilia calculations

            # --- Calculate Hierarchical Penalty ---
            hierarchical_penalty = self.calculate_hierarchical_penalty(logits, labels)

            # Total loss = sum of individual weighted losses + hierarchical penalty
            loss = loss_categoria + loss_familia + hierarchical_penalty # Updated loss calculation

        return {
            'logits': logits,
            'loss': loss
        }

    def calculate_hierarchical_penalty(self, logits, labels):
        penalty = 0
        batch_size = labels.size(0)
        device = labels.device # Get device from labels

        # Penalty for inconsistent familia prediction given the true categoria
        # Get predicted *indices* for penalty calculation
        predicted_categorias_indices = torch.argmax(logits['nombre_categoria_encoded'], dim=1)
        true_familias_indices = labels[:, 1] # True indices

        for i in range(batch_size):
            true_categoria_encoded = labels[i, 0].item()
            predicted_familia_encoded = torch.argmax(logits['familia_encoded'][i]).item() # Predicted index

            # Check if the predicted familia is a valid child of the true categoria
            # Access the hierarchy map using encoded indices
            valid_familias_for_categoria = self.hierarchy_map_encoded.get('nombre_categoria_encoded', {}).get(str(true_categoria_encoded), []) # Ensure key is a string
            if predicted_familia_encoded not in valid_familias_for_categoria:
                penalty += 1 # Simple penalty, you can adjust the value

        # Removed penalty calculations for linea and subfamilia

        # Convert penalty to a tensor on the correct device
        return torch.tensor(penalty, dtype=torch.float, device=device) # You might want to scale this penalty

In [3]:
import torch
import pickle
from transformers import BertModel, BertConfig, BertTokenizer

import pandas as pd
import numpy as np
from time import time
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import warnings
import json

nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize_text(text):
  return word_tokenize(text)

import warnings

from tools_2 import get_data, load_dictionaries, build_import_data, preprocess_inference_data, analyze_coverage,filter_predictions_apply


# Define paths to the saved model and encoders
model_load_path = os.path.join(project_dir, './modelos/BERT_CLASIFICADOR_ARTICULOS_NIVELES_2_V1_retrain_pipeline.pth')
encoders_load_path = os.path.join(project_dir, './diccionarios/codificador_retrain_V2.pkl')

# Load the codificador object (encoders)
with open(encoders_load_path, 'rb') as f:
    codificador = pickle.load(f)

# Get the number of unique labels per level from the loaded codificador
num_labels_per_level = {}
for col, encoder in codificador.label_encoders.items():
    num_labels_per_level[f'{col}_encoded'] = len(encoder.classes_)


class_weights_load_path = './diccionarios/class_weights_retrain_V2.pth'

try:
    # Load the class weights from the .pth file.
    class_weights = torch.load(class_weights_load_path)
    # Print a confirmation message.
    print(f"Class weights loaded successfully from {class_weights_load_path}")
except FileNotFoundError:
    # Handle the case where the specified file does not exist.
    print(f"Error: The class weights file was not found at {class_weights_load_path}.")
    print("Please verify the file path in Google Drive.")
except Exception as e:
    # Handle any other unexpected errors during loading.
    print(f"An unexpected error occurred while loading class weights: {e}")



hierarchy_map_save_path = './diccionarios/hierarchy_map_encoded_retrain_V2.json'

try:
    # Open the JSON file for reading.
    with open(hierarchy_map_save_path, 'r') as f:
        # Load the data from the JSON file into the hierarchy_map_encoded dictionary.
        hierarchy_map_encoded = json.load(f)
    # Print a confirmation message.
    print(f"Hierarchy map loaded successfully from {hierarchy_map_save_path}")


except FileNotFoundError:
    # Handle the case where the specified file does not exist.
    print(f"Error: The file was not found at {hierarchy_map_save_path}.")
    print("Please verify the file path in Google Drive.")
except json.JSONDecodeError:
    # Handle the case where the file content is not valid JSON.
    print(f"Error: Could not decode JSON from {hierarchy_map_save_path}.")
    print("Please ensure the file contains valid JSON data.")
except Exception as e:
    # Handle any other unexpected errors during loading.
    print(f"An unexpected error occurred: {e}")

# Suppress all warnings
warnings.filterwarnings("ignore")


"""
Dependencies required to run this notebook:
- google-colab
- pandas
- numpy
- torch
- transformers
- scikit-learn
- tqdm
- nltk
- lab_utils (custom/local module, ensure it's available in your project directory)

To install the main dependencies, run:
%pip install pandas numpy torch transformers scikit-learn tqdm nltk

For Google Colab, the 'google.colab' package is pre-installed.
For 'lab_utils', make sure the Python file or package is present in the specified Google Drive directory.
"""

Class weights loaded successfully from ./diccionarios/class_weights_retrain_V2.pth
Hierarchy map loaded successfully from ./diccionarios/hierarchy_map_encoded_retrain_V2.json


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


"\nDependencies required to run this notebook:\n- google-colab\n- pandas\n- numpy\n- torch\n- transformers\n- scikit-learn\n- tqdm\n- nltk\n- lab_utils (custom/local module, ensure it's available in your project directory)\n\nTo install the main dependencies, run:\n%pip install pandas numpy torch transformers scikit-learn tqdm nltk\n\nFor Google Colab, the 'google.colab' package is pre-installed.\nFor 'lab_utils', make sure the Python file or package is present in the specified Google Drive directory.\n"

#### Instancias del MODELO AFINADO


In [4]:
class ArticulosPredictionDataset(Dataset):
    """
    PyTorch Dataset for prediction/inference.
    Tokenizes input corpus using the provided tokenizer.
    """
    def __init__(self, corpus_series, tokenizer, max_len):
        self.corpus = corpus_series.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        corpus = self.corpus[idx]
        encoding = self.tokenizer(
            corpus,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        return item

In [5]:
def predict_with_constrained_decoding(
    df,
    model,
    tokenizer,
    codificador,
    device,
    hierarchy_map_encoded,
    batch_size=580,
    max_len=500 # Assuming you have a max_len for tokenization
):
    """
    Makes hierarchical predictions on input data with constraints based on the
    provided hierarchy map.

    Args:
        df (pd.DataFrame): DataFrame containing the input data, must have a 'corpus' column.
        model (torch.nn.Module): The trained PyTorch model.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used during training.
        codificador (DataProcessor): The DataProcessor object used for label encoding/decoding.
        device (torch.device): The device to run inference on (cuda or cpu).
        hierarchy_map_encoded (dict): The dictionary mapping parent encoded labels to valid child encoded labels.
        batch_size (int): Batch size for prediction.
        max_len (int): Maximum token length for input sequences.

    Returns:
        pd.DataFrame: DataFrame with original data plus columns for predicted labels and
                      probabilities for each level. Returns None if 'corpus' column is missing.
    """
    # Check for 'corpus' column
    if 'corpus' not in df.columns:
        print("Missing required column: 'corpus'. Please add it before prediction.")
        return None

    # Prepare dataset and dataloader
    # ArticulosPredictionDataset should take a Series (corpus), tokenizer, and max_len
    try:
        test_dataset = ArticulosPredictionDataset(df['corpus'], tokenizer, max_len=max_len)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    except NameError:
        print("Error: ArticulosPredictionDataset class not found. Please ensure it's defined or imported.")
        return None


    model.eval()

    # Lists to store predictions (encoded) and probabilities
    predictions_categoria_encoded = []
    predictions_familia_encoded = []


    probabilities_categoria = []
    probabilities_familia = []



    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Generating Constrained Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are NOT needed for pure inference

            # Get logits from the model
            # Note: The model's forward should handle the case where labels are None
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']


            # --- Constrained Prediction Logic (Batch-wise) ---

            # 1. Predict Category (no constraint at the top level)
            logits_categoria = logits['nombre_categoria_encoded']
            preds_categoria = torch.argmax(logits_categoria, dim=1)
            # Calculate probability of the predicted category
            probs_categoria_batch = torch.softmax(logits_categoria, dim=1)
            probs_categoria_predicted = torch.gather(probs_categoria_batch, 1, preds_categoria.unsqueeze(1)).squeeze(1)


            # 2. Predict Familia (constrained by predicted categoria)
            logits_familia = logits['familia_encoded']
            preds_familia = torch.zeros_like(preds_categoria)
            probs_familia_predicted_batch = torch.zeros_like(preds_categoria, dtype=torch.float)

            for i in range(preds_categoria.size(0)): # Iterate through each sample in the batch
                predicted_categoria_encoded = preds_categoria[i].item()

                # Get valid families for the predicted category
                valid_familias_encoded = hierarchy_map_encoded.get('nombre_categoria_encoded', {}).get(str(predicted_categoria_encoded), []) # Ensure key is a string

                if not valid_familias_encoded:
                    # If no valid children, predict based on unconstrained logits (fallback)
                     preds_familia[i] = torch.argmax(logits_familia[i]).item()
                     probs_familia_predicted_batch[i] = torch.softmax(logits_familia[i], dim=-1)[preds_familia[i]].item()
                else:
                    # Create a mask for valid familia logits for this sample
                    mask = torch.full(logits_familia[i].shape, -float('inf'), device=device)
                    valid_indices = torch.tensor(valid_familias_encoded, device=device)
                    mask[valid_indices] = 0 # Set valid logits mask to 0

                    # Apply the mask and find the argmax among valid options
                    constrained_logits_familia = logits_familia[i] + mask
                    preds_familia[i] = torch.argmax(constrained_logits_familia).item()

                    # Calculate the probability of the predicted valid class within the valid subset
                    valid_logits_subset = logits_familia[i][valid_indices]
                    probs_familia_subset = torch.softmax(valid_logits_subset, dim=-1)
                    # Find the index of the predicted familia within the valid_indices list
                    index_in_valid = (valid_indices == preds_familia[i]).nonzero(as_tuple=True)[0].item()
                    probs_familia_predicted_batch[i] = probs_familia_subset[index_in_valid].item()


            # # 3. Predict Linea (constrained by predicted familia)



            # Append predictions (encoded) and probabilities
            predictions_categoria_encoded.extend(preds_categoria.cpu().numpy())
            predictions_familia_encoded.extend(preds_familia.cpu().numpy())


            probabilities_categoria.extend(probs_categoria_predicted.cpu().numpy())
            probabilities_familia.extend(probs_familia_predicted_batch.cpu().numpy()) # Use the calculated constrained probability

    # Create a DataFrame of encoded predictions
    encoded_predictions_df = pd.DataFrame({
        'nombre_categoria_encoded': predictions_categoria_encoded,
        'familia_encoded': predictions_familia_encoded,

    })

    # Decode the predictions using the codificador

    decoded_predictions_df = codificador.decode_labels(encoded_predictions_df)

    # Combine original data with predictions and probabilities

    results_df = df.copy()

    results_df['Predicted_nombre_categoria'] = decoded_predictions_df['nombre_categoria'].values
    results_df['Prob_nombre_categoria'] = probabilities_categoria
    results_df['descripcion_full'] = df_testing['descripcion_full']

    results_df['Predicted_familia'] = decoded_predictions_df['familia'].values
    results_df['Prob_familia'] = probabilities_familia



    return results_df

In [6]:
# Load the pre-trained BERT model

bert_model = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

# Initialize the custom multi-output classifier model

model = HierarchicalBertClassifierWithConstraint(bert_model, num_labels_per_level, hierarchy_map_encoded, class_weights)

# Load the saved model state dictionary, mapping tensors to CPU if CUDA is not available

model.load_state_dict(torch.load(model_load_path, map_location=torch.device('cpu')))

# Set the model to evaluation mode

model.eval()

# Define the device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the tokenizer

tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# ***Fin de las Dependencias***

# **DATOS**

#### carga de datos

In [7]:
ruta1 = './datos/dicaranceles.xlsx'
#ruta2 = './datos/datatesteo.xlsx'
#ruta3 = './datos/test_data_v1.csv'
ruta3 = './datos/data_importacion_2024.csv'

#df_testing = get_data(ruta2)  # esta ruta sirve para hacer inferencia de cualquier otra muestra

df_testing = get_data(ruta3)

dic_aranceles = get_data(ruta1)

df_testing = df_testing.rename(columns={'Índice': 'id_key'})

In [8]:
df_testing.shape

(7362802, 5)

In [9]:
def get_descripcion_simple(text):
    text = str(text)
    words = text.split()
    if len(words) == 0:
        return ""

    # If starts with "REF", skip it and the next word (the code)
    if len(words) > 2 and words[0].upper() == "REF":
        words = words[2:]

    connecting_words = { 'para', 'de', 'con', 'en', 'por', 'al', 'del' }
    result = []
    i = 0
    max_words = 3  # Cambia esto si quieres otro número de palabras

    while i < len(words) and len(result) < max_words:
        word = words[i]
        # Siempre incluir la primera palabra
        if i == 0:
            result.append(word)
            i += 1
            continue
        # Si es una preposición importante, incluirla y la siguiente palabra
        if word.lower() in connecting_words and i+1 < len(words):
            result.append(word)
            result.append(words[i+1])
            i += 2
            break
        # Para otras palabras, incluir hasta alcanzar el límite
        if len(result) < max_words:
            result.append(word)
            i += 1
    return ' '.join(result)

# Aplicar al DataFrame
df_testing['descripcion_simple'] = df_testing['descripcion'].apply(get_descripcion_simple)

# Mostrar resultados
print(df_testing[['descripcion', 'descripcion_simple']].tail())

                                            descripcion  \
7362797                           FAJAS PLASTICAS NA NA   
7362798       TENIS DE NINA TALLA 37 COLOR AZULES NA NA   
7362799  BLUSA DE DAMA TALLA XS AZUL ORIGEN INDIA NA NA   
7362800                     BISUTERIA EMBLEMATICA NA NA   
7362801                          VIDRIO TEMPERADO NA NA   

               descripcion_simple  
7362797        FAJAS PLASTICAS NA  
7362798             TENIS DE NINA  
7362799             BLUSA DE DAMA  
7362800  BISUTERIA EMBLEMATICA NA  
7362801       VIDRIO TEMPERADO NA  


In [10]:
df_testing['descripcion_full'] = df_testing['descripcion']
df_testing['descripcion'] = df_testing['descripcion_simple']

#### Transformacion de los datos


In [11]:
"""
se debe cambiar solo las columnas descripcion y cod_arancelario

"""
#import_data = build_import_data(df_testing,"DESCRIPCIÓN DEL DESPACHO","POSICIÓN ARANCELARIA",dic_aranceles) # este es de una muestra


import_data = build_import_data(df_testing,"descripcion","posicion_arancelaria",dic_aranceles)

In [12]:

# Define file paths to load from Google Drive
stopwords_load_path = './diccionarios/stopwords_spanish.json'
replacement_dict_load_path = './diccionarios/replacement_dict.json'


# Load stopwords_spanish
with open(stopwords_load_path, 'r') as f:
    stopwords_spanish = json.load(f)

# Load replacement_dict
with open(replacement_dict_load_path, 'r') as f:
    replacement_dict = json.load(f)

import_data = preprocess_inference_data(import_data, stopwords_spanish, replacement_dict)


# **MODELO**

# Predicciones

In [13]:
prediction_table_inference_data = predict_with_constrained_decoding(
    df=import_data,  # or any DataFrame with 'corpus'
    model=model,
    tokenizer=tokenizer,
    codificador=codificador,
    device=device,
    hierarchy_map_encoded=hierarchy_map_encoded # Pass the loaded hierarchy map
)

Generating Constrained Predictions: 100%|██████████| 12695/12695 [13:20:45<00:00,  3.78s/it]


#### Evaluacion de Cobertura de las predicciones


In [14]:
# Calculate the average probability across the four levels

prediction_table_inference_data['average_prob'] = prediction_table_inference_data[[
    'Prob_nombre_categoria', 'Prob_familia' # Corrected column names
]].mean(axis=1)

# Filter the DataFrame to include only rows where average_prob is above 0.86

cobertura_df = prediction_table_inference_data[
    prediction_table_inference_data['average_prob'] >= 0.86
]

# Count the number of rows in the filtered DataFrame

cobertura = len(cobertura_df)


# Calculate the total number of data points

total_data_points = len(import_data)

# Calculate the percentage of data points with average probability above 0.86

cobertura_percentage = (cobertura / total_data_points) * 100


print(f"The coverage percentage is: {cobertura_percentage:.2f}%")
print(f"The number of data points with an average probability above 0.86 (Cobertura) is: {cobertura}")

The coverage percentage is: 38.07%
The number of data points with an average probability above 0.86 (Cobertura) is: 2803301


# ***FILTRO MODULO***

In [15]:
ruta3 = './diccionarios/data_train_class_list.csv'
ruta4 = './diccionarios/keywords_filter.csv'


data_novex_list = get_data(ruta3)

keywords_filter = get_data(ruta4)

keywords_filter = pd.concat([keywords_filter, pd.DataFrame(['calzado seguridad', 'zapatos seguridad'], columns=['keyword'])], ignore_index=True)

result_predictions = prediction_table_inference_data

if 'keyword' in keywords_filter.columns:
    keywords_filter['keyword'] = keywords_filter['keyword'].astype(str)

result_predictions_filtered = result_predictions[result_predictions['average_prob'] >= 0.86] # filter by average_prob


result_predictions_filtered_updated, data_filtered = filter_predictions_apply(
    result_predictions_filtered,
    data_novex_list,
    keywords_filter,
    prediction_column='Predicted_familia', # Use your actual column names
    novex_detail_column='subfamilia',
    data_quality_column='data_quality',
    keywords_column='keyword',
    description_column='descripcion'
)

# You can then use result_predictions_filtered_updated and data_filtered
print("Filtered result_predictions_filtered:")
print(result_predictions_filtered_updated.head())

print("\nDropped rows (data_filtered):")
print(data_filtered.head())


Filtered result_predictions_filtered:
         id       fecha cod_arancelario  \
3  23301706  2024-01-16      4202990090   
5  23725393  2024-01-16      9608600000   
6  22740548  2024-01-16      4202990090   
7  23223189  2024-01-16      8536101099   
9  22724002  2024-01-16      8509809000   

                             descripcion   id_key  \
3    MIA000040584392 MOCHILA PARA COMIDA  1224866   
5              MIA000040582816 CARTUCHOS  1648863   
6              MIA000040588403 MALETIN -   663122   
7                MIA000040596385 FUSIBLE  1145955   
9  MIA000040598803 EXPRIMIDORA DE FRUTAS   646644   

                      descripcion_simple  \
3    MIA000040584392 MOCHILA PARA COMIDA   
5              MIA000040582816 CARTUCHOS   
6              MIA000040588403 MALETIN -   
7                MIA000040596385 FUSIBLE   
9  MIA000040598803 EXPRIMIDORA DE FRUTAS   

                                    descripcion_full  \
3                MIA000040584392 MOCHILA PARA COMIDA   
5      

# **REGLAS Y FILTROS**

reglas de clasificacion

In [16]:
unique_values = result_predictions_filtered_updated['descripcion_simple'].unique()


In [17]:
import_data.shape

(7362802, 9)

In [18]:
result_predictions_filtered_updated.shape

(2101963, 14)

In [19]:
unique_values.shape

(367614,)

In [20]:
import re

# Get unique values from simple_description
unique_values = result_predictions_filtered_updated['descripcion_simple'].unique()

# Remove any empty or NaN values
unique_values_clean = [str(v) for v in unique_values if pd.notna(v) and str(v).strip() != ""]

# Build a regex pattern that matches any of the keywords (escaped)
pattern = '|'.join([re.escape(v) for v in unique_values_clean if v])

# Use str.contains to find rows that match any keyword
mask = import_data['descripcion_full'].str.contains(pattern, case=False, na=False)

# Get the total number of coincidences
total_coincidences = mask.sum()
print(total_coincidences)

# HACER QUE COINCIDAN

KeyboardInterrupt: 

In [21]:
# and the description contains "guante" or "guantes", change Predicted_familia to 'GUANTES'.
guante_keywords = ["guante", "guantes"]

condition_guantes_rule = (result_predictions_filtered_updated['Predicted_nombre_categoria'].str.upper() == 'SEGURIDAD') & \
                         (result_predictions_filtered_updated['Predicted_familia'].str.upper() == 'EPP') & \
                         (result_predictions_filtered_updated['descripcion_2'].str.contains('|'.join(guante_keywords), case=False, na=False))

result_predictions_filtered_updated.loc[condition_guantes_rule, 'Predicted_familia'] = 'GUANTES'


In [22]:
import re


# Convertir columnas relevantes a strings
result_predictions_filtered_updated['Predicted_nombre_categoria'] = result_predictions_filtered_updated['Predicted_nombre_categoria'].astype(str)
result_predictions_filtered_updated['Predicted_familia'] = result_predictions_filtered_updated['Predicted_familia'].astype(str)
result_predictions_filtered_updated['descripcion_2'] = result_predictions_filtered_updated['descripcion_2'].astype(str)

# Definir parámetros de filtrado
target_phrases = {
    'calzado': [
        'LOS DEMAS CALZADOS',
        'CALZADO CON SUELA DE CAUCHO'
    ],
    'ropa_epp': [
    'CAMISA PUNTO',  # Nueva versión simplificada
    'BLUSA CAMISERA',
    'T-SHIRT CAMISETA',
    'SUETER JERSEY PULLOVER',
    'SOSTEN BRASSIER CORPIÑO',
    'CONJUNTO ABRIGO ENTRENAMIENTO DEPORTE'
    ],
    'guantes_ropa': [
        'CAMISAS, BLUSAS Y BLUSAS CAMISERAS, PARA MUJERES O NIÑAS',
        'CAMISAS PARA HOMBRES O NIÑOS',
        'CAMISAS, BLUSAS Y BLUSAS CAMISERAS, DE PUNTO, PARA MUJERES O NIÑAS',
        'CAMISAS, BLUSAS'
    ]
}

keywords_calzado = ['SEGURIDAD', 'INDUSTRIAL', 'EPP']

# Función mejorada para verificar coincidencias parciales
def check_phrases(text, phrases):
    text_lower = text.lower()
    for phrase in phrases:
        # Convertimos la frase objetivo en palabras clave
        keywords = [word for word in phrase.lower().replace(',', '').split()
                  if word not in {'y', 'de', 'las', 'los', 'para'}]

        # Verificamos si todas las palabras clave aparecen en el texto (en cualquier orden)
        if all(keyword in text_lower for keyword in keywords if len(keyword) > 3):  # Ignoramos palabras cortas
            return True
    return False


# Función para verificar palabras clave
def check_keywords(text, keywords):
    text_upper = text.upper()
    return any(keyword in text_upper for keyword in keywords)

# Crear máscaras de filtrado
masks = {
    'calzado': (
        (result_predictions_filtered_updated['Predicted_nombre_categoria'].str.upper() == 'SEGURIDAD') &
        (result_predictions_filtered_updated['Predicted_familia'].str.upper() == 'EPP') &
        result_predictions_filtered_updated['descripcion_2'].apply(lambda x: check_phrases(x, target_phrases['calzado'])) &
        ~result_predictions_filtered_updated['descripcion_2'].apply(lambda x: check_keywords(x, keywords_calzado))
    ),

    'ropa_epp': (
        (result_predictions_filtered_updated['Predicted_nombre_categoria'].str.upper() == 'SEGURIDAD') &
        (result_predictions_filtered_updated['Predicted_familia'].str.upper() == 'EPP') &
        result_predictions_filtered_updated['descripcion_2'].apply(lambda x: check_phrases(x, target_phrases['ropa_epp']))
    ),

    'guantes_ropa': (
        (result_predictions_filtered_updated['Predicted_nombre_categoria'].str.upper() == 'SEGURIDAD') &
        (result_predictions_filtered_updated['Predicted_familia'].str.upper() == 'GUANTES') &
        result_predictions_filtered_updated['descripcion_2'].apply(lambda x: check_phrases(x, target_phrases['guantes_ropa']))
    )
}

# Combinar todas las máscaras
total_mask = masks['calzado'] | masks['ropa_epp'] | masks['guantes_ropa']

# Filtrar el DataFrame
result_predictions_filtered_updated = result_predictions_filtered_updated[~total_mask].copy()

# ===========================================
# FILTER: GAFAS NO SEGURIDAD
# ===========================================
# Si 'descripcion_full' contiene "gafas" pero NO contiene ninguna de las palabras
# "seguridad", "protectoras" o "proteccion", eliminar ese registro.

mask_gafas = result_predictions_filtered_updated['descripcion_full'].str.lower().str.contains('gafas')
mask_gafas_no_seguridad = (
    mask_gafas &
    ~(
        result_predictions_filtered_updated['descripcion_full'].str.lower().str.contains('seguridad') |
        result_predictions_filtered_updated['descripcion_full'].str.lower().str.contains('protectoras') |
        result_predictions_filtered_updated['descripcion_full'].str.lower().str.contains('proteccion')
    )
)
# Elimina las filas que cumplen la condición
result_predictions_filtered_updated = result_predictions_filtered_updated[~mask_gafas_no_seguridad].copy()


# ===========================================
# RULES: TRANSFORMACIONES ESPECIALES
# ===========================================

# 1. Si 'descripcion_simple' contiene la palabra 'bisuteria', cambiar Predicted_familia y Predicted_nombre_categoria a 'OTROS'
mask_bisuteria = result_predictions_filtered_updated['descripcion_simple'].str.lower().str.contains('bisuteria')
result_predictions_filtered_updated.loc[mask_bisuteria, 'Predicted_familia'] = 'OTROS'
result_predictions_filtered_updated.loc[mask_bisuteria, 'Predicted_nombre_categoria'] = 'OTROS'

# 2. Si 'descripcion' contiene la frase 'filtro para aire', cambiar Predicted_familia y Predicted_nombre_categoria a 'OTROS'
mask_filtro_para_aire = result_predictions_filtered_updated['descripcion'].str.lower().str.contains(r'filtro\s+para\s+aire')
result_predictions_filtered_updated.loc[mask_filtro_para_aire, 'Predicted_familia'] = 'OTROS'
result_predictions_filtered_updated.loc[mask_filtro_para_aire, 'Predicted_nombre_categoria'] = 'OTROS'

# 3. Si 'descripcion' contiene la frase 'filtro de aire', 'filtro aire' o 'filtro paire' (pero NO "filtro para aire"),
#    cambiar Predicted_nombre_categoria a 'AUTOMOTRIZ' y Predicted_familia a 'REPUESTOS'
mask_filtro_automotriz = (
    result_predictions_filtered_updated['descripcion'].str.lower().str.contains(r'filtro\s+de\s+aire|filtro\s+aire|filtro\s+paire')
    & ~mask_filtro_para_aire
)
result_predictions_filtered_updated.loc[mask_filtro_automotriz, 'Predicted_nombre_categoria'] = 'AUTOMOTRIZ'
result_predictions_filtered_updated.loc[mask_filtro_automotriz, 'Predicted_familia'] = 'REPUESTOS'

# ===========================================
# FILTER: FILTRO SIN AGUA SOLO EN COCINAS
# ===========================================

# Mascara: nombre_categoria es COCINAS
mask_cocinas = result_predictions_filtered_updated['Predicted_nombre_categoria'].str.upper() == 'COCINAS'

# Mascara: descripcion contiene 'filtro' o 'filtros' (palabra completa, singular o plural)
mask_filtro = result_predictions_filtered_updated['descripcion'].str.lower().str.contains(r'\bfiltros?\b', na=False)

# Mascara: descripcion contiene 'agua'
mask_agua = result_predictions_filtered_updated['descripcion'].str.lower().str.contains('agua', na=False)

# Mascara final: Es COCINAS, contiene 'filtro' o 'filtros', y NO contiene 'agua'
mask_drop = mask_cocinas & mask_filtro & ~mask_agua

# Eliminar esas filas
result_predictions_filtered_updated = result_predictions_filtered_updated[~mask_drop].copy()

# ===========================================
# FILTER: DROP filas con cod_arancelario 6201300000 y Predicted_nombre_categoria SEGURIDAD
# ===========================================

mask_drop_arancel_seguridad = (
    (result_predictions_filtered_updated['cod_arancelario'] == 6201300000) &
    (result_predictions_filtered_updated['Predicted_nombre_categoria'] == 'SEGURIDAD')
)
result_predictions_filtered_updated = result_predictions_filtered_updated[~mask_drop_arancel_seguridad].copy()

# ===========================================
# FILTRO: Prendas en SEGURIDAD sin "seguridad" en descripcion_full
# ===========================================

# Define las palabras clave de prendas
prendas = ['abrigo', 'sueter', 'camisa', 'chaqueta']

# Mascara: descripcion contiene alguna prenda
mask_prenda = result_predictions_filtered_updated['descripcion'].str.lower().str.contains('|'.join(prendas), na=False)

# Mascara: Predicted_nombre_categoria es SEGURIDAD
mask_seguridad = result_predictions_filtered_updated['Predicted_nombre_categoria'].str.upper() == 'SEGURIDAD'

# Mascara: descripcion_full contiene "seguridad"
mask_desc_full_seguridad = result_predictions_filtered_updated['descripcion_full'].str.lower().str.contains('seguridad', na=False)

# Condición final: Es prenda, categoría SEGURIDAD, y NO dice "seguridad" en descripcion_full
mask_drop = mask_prenda & mask_seguridad & ~mask_desc_full_seguridad

# Elimina esas filas
result_predictions_filtered_updated = result_predictions_filtered_updated[~mask_drop].copy()

# ===========================================
# FILTRO: Drop filas donde cod_arancelario empieza con 640 y Predicted_nombre_categoria es MUEBLES
# ===========================================

mask_drop_640_muebles = (
    result_predictions_filtered_updated['cod_arancelario'].astype(str).str.startswith('640') &
    (result_predictions_filtered_updated['Predicted_nombre_categoria'].str.upper() == 'MUEBLES')
)
result_predictions_filtered_updated = result_predictions_filtered_updated[~mask_drop_640_muebles].copy()


# ===========================================
# FILTRO: Drop filas donde cod_arancelario empieza con 420 y Predicted_familia es CAJAS
# ===========================================

mask_drop_420_cajas = (
    result_predictions_filtered_updated['cod_arancelario'].astype(str).str.startswith('420') &
    (result_predictions_filtered_updated['Predicted_familia'].str.upper() == 'CAJAS')
)
result_predictions_filtered_updated = result_predictions_filtered_updated[~mask_drop_420_cajas].copy()


# Mostrar resultados
print("\nDataFrame después de aplicar los filtros:")
print(f"Registros eliminados: {total_mask.sum()}")
print(f"Registros restantes: {len(result_predictions_filtered_updated)}")
print(result_predictions_filtered_updated.head())


DataFrame después de aplicar los filtros:
Registros eliminados: 274725
Registros restantes: 1739394
          id       fecha cod_arancelario  \
5   23725393  2024-01-16      9608600000   
7   23223189  2024-01-16      8536101099   
9   22724002  2024-01-16      8509809000   
10  23366749  2024-01-16      9506320000   
11  24167725  2024-01-16      8504400050   

                              descripcion   id_key  \
5               MIA000040582816 CARTUCHOS  1648863   
7                 MIA000040596385 FUSIBLE  1145955   
9   MIA000040598803 EXPRIMIDORA DE FRUTAS   646644   
10    MIA000040598032 PAQUETES DE PAJAROS  1290014   
11    MIA000040593334 CARGADOR ELECTRICOS  2091325   

                       descripcion_simple  \
5               MIA000040582816 CARTUCHOS   
7                 MIA000040596385 FUSIBLE   
9   MIA000040598803 EXPRIMIDORA DE FRUTAS   
10    MIA000040598032 PAQUETES DE PAJAROS   
11    MIA000040593334 CARGADOR ELECTRICOS   

                                     d

# ***CREACION DE TABLA EN REPOSITORIO***

In [23]:
import os
import pandas as pd
from datetime import datetime


# Define la ruta de la carpeta 'tablas' dentro de tu directorio de proyecto
tablas_dir = os.path.join(project_dir, 'tablas')

# Asegúrate de que la carpeta 'tablas' exista, si no, créala
if not os.path.exists(tablas_dir):
    os.makedirs(tablas_dir)
    print(f"Carpeta '{tablas_dir}' creada.")

# Define el nombre base del archivo
nombre_base_archivo = '2024prediction_table_inference_data'
extension = '.csv' # Puedes cambiar a '.xlsx' si prefieres Excel

# Inicializa el número de versión
numero_version = 1
nombre_archivo = f"{nombre_base_archivo}{extension}"
ruta_completa_archivo = os.path.join(tablas_dir, nombre_archivo)

# Verifica si el archivo ya existe y genera un nuevo nombre si es necesario
while os.path.exists(ruta_completa_archivo):
    numero_version += 1
    nombre_archivo = f"{nombre_base_archivo}_prediccion_{numero_version}{extension}"
    ruta_completa_archivo = os.path.join(tablas_dir, nombre_archivo)


# Add the current date as a new column
result_predictions_filtered_updated['inference_date'] = datetime.now().strftime('%Y-%m-%d')



# Guarda el DataFrame en el archivo con el nombre determinado
try:
    result_predictions_filtered_updated.to_csv(ruta_completa_archivo, index=False) # Usa to_excel si la extensión es .xlsx
    print(f"Tabla guardada exitosamente en: {ruta_completa_archivo}")
except Exception as e:
    print(f"Error al guardar la tabla: {e}")

Tabla guardada exitosamente en: /content/drive/MyDrive/clasificacion_2_niveles/laboratory_2/tablas/2024prediction_table_inference_data.csv


In [24]:
# prompt: calculate cobertura again in result_predictions_filtered_updated, here is more easy becouse is teht otal in the current table divide by the total of prediction_table_inference_data

# Calculate Cobertura for the filtered predictions
cobertura_filtered = len(result_predictions_filtered_updated)
total_predictions = len(prediction_table_inference_data)

# Avoid division by zero
if total_predictions > 0:
    cobertura_filtered_percentage = (cobertura_filtered / total_predictions) * 100
else:
    cobertura_filtered_percentage = 0

print(f"\nCobertura after filtering by keywords and novex list:")
print(f"Number of data points remaining after filtering (Cobertura): {cobertura_filtered}")
print(f"Percentage of original predictions remaining: {cobertura_filtered_percentage:.2f}%")


Cobertura after filtering by keywords and novex list:
Number of data points remaining after filtering (Cobertura): 1739394
Percentage of original predictions remaining: 23.62%


In [25]:
result_predictions_filtered_updated

Unnamed: 0,id,fecha,cod_arancelario,descripcion,id_key,descripcion_simple,descripcion_full,descripcion_2,corpus,Predicted_nombre_categoria,Prob_nombre_categoria,Predicted_familia,Prob_familia,average_prob,inference_date
5,23725393,2024-01-16,9608600000,MIA000040582816 CARTUCHOS,1648863,MIA000040582816 CARTUCHOS,MIA000040582816 CARTUCHOS,BOLIGRAFOS; ROTULADORES Y MARCADORES CON PUNTA...,mia000040582816 cartuchos boligrafos; rotulado...,HERRAMIENTAS MANUALES,0.957529,ALBAÑILERIA,0.862612,0.910070,2025-07-21
7,23223189,2024-01-16,8536101099,MIA000040596385 FUSIBLE,1145955,MIA000040596385 FUSIBLE,MIA000040596385 FUSIBLE,"APARATOS PARA CORTE, SECCIONAMIENTO, PROTECCIO...","mia000040596385 fusible aparatos corte, seccio...",AUTOMOTRIZ,0.974787,CONSUMIBLES PARA VEHICULOS,0.885810,0.930298,2025-07-21
9,22724002,2024-01-16,8509809000,MIA000040598803 EXPRIMIDORA DE FRUTAS,646644,MIA000040598803 EXPRIMIDORA DE FRUTAS,MIA000040598803 EXPRIMIDORA DE FRUTAS CITRICAS...,APARATOS ELECTROMECANICOS CON MOTOR ELECTRICO ...,mia000040598803 exprimidora frutas aparatos el...,ELECTRODOMÉSTICOS,0.994677,MENORES,0.973240,0.983959,2025-07-21
10,23366749,2024-01-16,9506320000,MIA000040598032 PAQUETES DE PAJAROS,1290014,MIA000040598032 PAQUETES DE PAJAROS,MIA000040598032 PAQUETES DE PAJAROS DE BADMNTON,"ARTICULOS Y MATERIAL PARA CULTURA FISICA, GIMN...",mia000040598032 paquetes pajaros articulos y m...,EXTERIORES,0.843049,DEPORTES,0.963748,0.903398,2025-07-21
11,24167725,2024-01-16,8504400050,MIA000040593334 CARGADOR ELECTRICOS,2091325,MIA000040593334 CARGADOR ELECTRICOS,MIA000040593334 CARGADOR ELECTRICOS PARA COMPU...,"TRANSFORMADORES ELECTRICOS, CONVERTIDORES ELEC...",mia000040593334 cargador electricos transforma...,AUTOMOTRIZ,0.939260,ACCESORIOS AUTOMOTRIZ,0.845508,0.892384,2025-07-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2803293,21243152,2024-08-05,7013490000,PLATOS DE VIDRO,68424053,PLATOS DE VIDRO,PLATOS DE VIDRO NA NA,"ARTICULOS DE VIDRIO PARA SERVICIO DE MESA, COC...","platos vidro articulos vidrio servicio mesa, c...",COCINAS,0.998749,VAJILLAS,0.975031,0.986890,2025-07-21
2803296,20958716,2024-08-05,8525899099,CAMARA DE SEGURIDA,68140463,CAMARA DE SEGURIDA,CAMARA DE SEGURIDA TIPO NA NA,APARATOS EMISORES DE RADIODIFUSIÓN O TELEVISIÓ...,camara segurida aparatos emisores radiodifusió...,SEGURIDAD,0.955957,CAMARAS Y ALARMAS,0.990629,0.973293,2025-07-21
2803297,21086434,2024-08-21,8525899099,CAMARA DE SEGURIDA,68267461,CAMARA DE SEGURIDA,CAMARA DE SEGURIDA TIPO NA NA,APARATOS EMISORES DE RADIODIFUSIÓN O TELEVISIÓ...,camara segurida aparatos emisores radiodifusió...,SEGURIDAD,0.955957,CAMARAS Y ALARMAS,0.990629,0.973293,2025-07-21
2803298,21930487,2024-07-03,8525899099,CAMARA DE SEGURIDA,69110467,CAMARA DE SEGURIDA,CAMARA DE SEGURIDA TIPO NA NA,APARATOS EMISORES DE RADIODIFUSIÓN O TELEVISIÓ...,camara segurida aparatos emisores radiodifusió...,SEGURIDAD,0.955957,CAMARAS Y ALARMAS,0.990629,0.973293,2025-07-21
