Versión realizada con TensorFlow Decision Forests (TF-DF) y Gradient Boosted Trees (GBT).

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Report which version of TensorFlow Decision Forests was imported.
print(f"Found TF-DF {tfdf.__version__}")

# Load the training and test tables from CSV files under data_ml/
# (the paths are relative to the notebook's working directory).
train_df=pd.read_csv("data_ml/train.csv")
test_df=pd.read_csv("data_ml/test.csv")



ModuleNotFoundError: No module named 'tensorflow'

In [3]:
# Format the raw data.
def preprocess(df):
    """Return a copy of ``df`` with a cleaned "Name" and two ticket-derived columns.

    The input frame is not modified. On the copy:

    * "Name" has the characters ``,()[]."'`` stripped from both ends of every
      space-separated token.
    * "Ticket_number" holds the last space-separated token of "Ticket".
    * "Ticket_item" holds the remaining leading tokens of "Ticket" joined
      with "_", or "NONE" when the ticket consists of a single token.

    Args:
        df: DataFrame with string-valued "Name" and "Ticket" columns.

    Returns:
        The preprocessed copy of ``df``.
    """
    strip_chars = ",()[].\"'"

    def _clean_name(raw_name):
        # Strip punctuation token by token so inner characters are kept.
        tokens = raw_name.split(" ")
        return " ".join(token.strip(strip_chars) for token in tokens)

    def _last_ticket_token(raw_ticket):
        return raw_ticket.split(" ")[-1]

    def _ticket_prefix(raw_ticket):
        # Everything before the last token; empty when there is only one token.
        *leading_tokens, _ = raw_ticket.split(" ")
        if not leading_tokens:
            return "NONE"
        return "_".join(leading_tokens)

    result = df.copy()
    result["Name"] = result["Name"].apply(_clean_name)
    result["Ticket_number"] = result["Ticket"].apply(_last_ticket_token)
    result["Ticket_item"] = result["Ticket"].apply(_ticket_prefix)
    return result
    
# Run the same preprocessing on both tables. preprocess() works on a copy,
# so train_df and test_df themselves are left untouched.
preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

In [14]:
# Drop the columns that are irrelevant to the model. "Survived" must also be
# removed: it is the value we want to predict, so it cannot be an input.
input_features = list(preprocessed_train_df.columns)
for excluded_column in ("Ticket", "PassengerId", "Ticket_number", "Survived"):
    # list.remove raises ValueError if an expected column is missing.
    input_features.remove(excluded_column)
print(f"Input features: {input_features}")



Input features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_item']


In [15]:
# The pandas DataFrame has to be converted to a TensorFlow dataset.
def tokenize_names(features, labels=None):
    """Divide the names into tokens. TF-DF can consume text tokens natively.

    Args:
        features: Mapping of feature name to value; its "Name" entry is
            replaced in place by the result of ``tf.strings.split``.
        labels: Optional labels, passed through unchanged.

    Returns:
        The ``(features, labels)`` pair, with ``features`` being the same
        object that was passed in.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# Convert both pandas frames to TensorFlow datasets and tokenize "Name" in each.
# Only the training set is given a label column ("Survived"); the serving set
# is built without one.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_train_df,label="Survived").map(tokenize_names)
serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_test_df).map(tokenize_names)

In [16]:
# Gradient-boosted-trees model restricted to the columns in input_features,
# with a fixed random seed.
model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0, 
    features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],
    exclude_non_specified_features=True, # Only use the columns listed in input_features.
    random_seed=1234,
)
model.fit(train_ds)

# Print the accuracy and loss reported by the model inspector.
# NOTE(review): presumably measured on the validation split TF-DF holds out
# during training rather than on a separate test set — confirm.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

[INFO 24-06-06 17:59:50.0234 CEST kernel.cc:1233] Loading model from path /tmp/tmpxr5d797c/model/ with prefix b551b46dee93458f
[INFO 24-06-06 17:59:50.0248 CEST quick_scorer_extended.cc:911] The binary was compiled without AVX2 support, but your CPU supports it. Enable it for faster model inference.
[INFO 24-06-06 17:59:50.0249 CEST abstract_model.cc:1362] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 24-06-06 17:59:50.0249 CEST kernel.cc:1061] Use fast generic engine


Accuracy: 0.8152173757553101 Loss:0.8657673597335815


2024-06-06 17:59:52.603013: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


In [17]:
model.summary()
# Print a summary of the trained model.

Model: "gradient_boosted_trees_model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (10):
	Age
	Cabin
	Embarked
	Fare
	Name
	Parch
	Pclass
	Sex
	SibSp
	Ticket_item

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.         "Sex"  0.723305 ################
    2.      "Pclass"  0.263031 ##
    3.         "Age"  0.253199 ##
    4.        "Fare"  0.238181 #
    5.        "Name"  0.234637 #
    6. "Ticket_item"  0.198615 
    7.       "SibSp"  0.185129 
    8.    "Embarked"  0.183560 
    9.       "Parch"  0.175168 

Variable Importance: NUM_AS_ROOT:
    1.  "Sex" 21.000000 ################
    2. "Name"  2.000000 

Variable Importance: NUM_NODES:
   

In [26]:
def prediction_to_kaggle_format(model, threshold=0.5):
    """Build a two-column submission table from the model's predictions.

    Reads the module-level ``serving_ds`` (prediction input) and ``test_df``
    (source of the "PassengerId" column).
    NOTE(review): assumes the rows of ``serving_ds`` are in the same order as
    ``test_df`` — confirm.

    Args:
        model: Object exposing ``predict(dataset, verbose=...)`` that returns
            a 2-D array; column 0 is used as the score.
        threshold: Scores greater than or equal to this value become 1,
            all others 0.

    Returns:
        DataFrame with columns "PassengerId" and "Survived".
    """
    raw_predictions = model.predict(serving_ds, verbose=0)
    survival_score = raw_predictions[:, 0]
    passenger_ids = test_df["PassengerId"]
    survived_flag = (survival_score >= threshold).astype(int)
    submission = pd.DataFrame({
        "PassengerId": passenger_ids,
        "Survived": survived_flag,
    })
    return submission

def make_submission(kaggle_predictions, path="data_ml/resultados_tf.csv"):
    """Write the submission table to a CSV file and report where it went.

    Args:
        kaggle_predictions: DataFrame to export; it is written without the
            index column.
        path: Destination CSV path. Defaults to the location the notebook
            originally hard-coded, so existing calls behave as before.
    """
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")
    
# Build the submission table with the trained model and write it to disk.
kaggle_predictions = prediction_to_kaggle_format(model)
make_submission(kaggle_predictions)
# Shell escape: preview the first lines of the exported CSV.
!head data_ml/resultados_tf.csv

# The model is applied to the test data.

Submission exported to data_ml/resultados_tf.csv
PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1
