# Todo

In [1]:
import pandas as pd
import json

def create_dataframe_from_json_file(file_path):
    records = []
    
    with open(file_path, 'r') as file:
        for line in file:
            try:
                data = json.loads(line.strip())
                
                record = {"uuid": data["uuid"]}
                
                for key, value in data["demographic"].items():
                    if key != "uuid": 
                        record[key] = value
                
                # Agregar campos de historic.products
                for key, value in data["historic"]["products"].items():
                    record[key] = value
                
                # Agregar value
                record["value"] = data["value"]
                
                # Agregar el registro a la lista
                records.append(record)
            except json.JSONDecodeError as e:
                print(f"Error al analizar el JSON: {e}")
                print(f"Línea problemática: {line[:100]}...") 
    
    # Crear el dataframe a partir de los registros
    df = pd.DataFrame(records)
    
    return df

In [2]:
y_test = pd.read_csv('y_test.csv')
y_train = pd.read_csv('y_train.csv')

full = pd.concat([y_test, y_train])
df_salida = create_dataframe_from_json_file("salida.txt")

In [None]:
for _, row in df_salida.iterrows():
    idx = row["uuid"]
    print(full["idx"])
    print(idx)
    break
        


#id_objetivo = "418034"
#df_salida.loc[df_salida['uuid'] == id_objetivo].to_dict(orient='records')[0]

0         104241
1         199676
2         140199
3         132814
4         408697
           ...  
334995    259178
334996    365838
334997    131932
334998    146867
334999    121958
Name: idx, Length: 500000, dtype: int64
418034


In [6]:
#Elemento de df_salida donde uuid es igual a 418034:
id_objetivo = "418034"
predicho = df_salida.loc[df_salida['uuid'] == id_objetivo]
real = full.loc[full['idx'] == int(id_objetivo)]
print(predicho)
print(real)

     uuid  age  man  woman  cat22  u5  u6  cat49  cat27  cat76  ...  cat53  \
0  418034   53    1      0      0   0   0      0      0      0  ...      0   

   cat93  cat82  cat42  cat20  cat96  cat60  cat31  cat85  value  
0      0      0      0      1      0      0      0      0      1  

[1 rows x 115 columns]
           idx  label
333710  418034      1


In [3]:
df_salida.head()

Unnamed: 0,uuid,age,man,woman,cat22,u5,u6,cat49,cat27,cat76,...,cat53,cat93,cat82,cat42,cat20,cat96,cat60,cat31,cat85,value
0,418034,53,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,240197,38,1,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,1
2,31027,22,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,249216,47,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,357882,30,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1


# Solo salida

In [1]:
import pandas
#reads y_test.csv
y_test = pandas.read_csv('y_test.csv')
y_train = pandas.read_csv('y_train.csv')
test = pandas.read_csv('test.csv')

#joins y_test and y_train
full = pandas.concat([y_test, y_train])

In [2]:
full.columns

Index(['idx', 'label'], dtype='object')

In [4]:
#For each value in test, checks if column ["idx"] exists in full
#If it does, compares if the value in test is the same as the value in full
accuracy = 0
total_predictions = 0
for idx, row in test.iterrows():
    if idx in full.index:
        full_value = full.loc[idx, "label"]
        if isinstance(full_value, pandas.core.series.Series):
            full_value = full_value.values[0]
        if full_value == row["label"]:
            accuracy += 1
        total_predictions += 1

accuracy = accuracy / total_predictions
print(len(test), total_predictions)
print(accuracy)

4077 4077
0.5020848663232769


In [None]:
import pandas as pd
from pypmml import Model

def evaluar_modelo_pmml(pmml_file, x_train_file, y_train_file):
    model = Model.fromFile(pmml_file)

    # 2. Cargar los datos de entrenamiento
    X_train = pd.read_csv(x_train_file)
    y_train = pd.read_csv(y_train_file)
    
    expected_features = model.inputNames()
    if not all(col in X_train.columns for col in expected_features):
        missing_cols = set(expected_features) - set(X_train.columns)
        raise ValueError(f"Faltan las siguientes columnas en X_train: {missing_cols}")
    
    X_train = X_train[expected_features]

    X_train = X_train.set_index('idx')
    y_train = y_train.set_index('idx')
            
    predictions = model.predict(X_train)
    results = pd.DataFrame(predictions)

    if results.columns.tolist() == list(range(len(results.columns))):
        if len(results.columns) == 1:
            results.columns = ['predicted_target']
        else:
            results.columns = [f'predicted_output_{i}' for i in range(len(results.columns))]

    if 'idx' in y_train.index.name:  # si hay indices.
        results.index.name = 'idx'
        results = results.merge(y_train, left_index=True, right_index=True, how='left') #Unir con los valores reales.
        
    elif not y_train.empty: # si no hay indices, pero sí y_train
        results['target'] = y_train['target'].values # añade los valores verdaderos

    return results


if __name__ == '__main__':
    # Ejemplo de uso
    pmml_file = 'model.pmml'  # Reemplaza con la ruta a tu archivo PMML
    x_train_file = 'X_train_selected.csv'
    y_train_file = 'test.csv'

    predictions_df = evaluar_modelo_pmml(pmml_file, x_train_file, y_train_file)

    if predictions_df is not None:
        print("Predicciones:")
        print(predictions_df.head())  # Mostrar las primeras filas de las predicciones
        if 'target' in predictions_df.columns and 'predicted_target' in predictions_df.columns:
            from sklearn.metrics import mean_squared_error
            mse = mean_squared_error(predictions_df['target'], predictions_df['predicted_target'])
            print(f"Error cuadrático medio (MSE): {mse}")