In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing
import os
# Walk the Kaggle input directory tree and list every data file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename)) # prints the full path of each file
import tensorflow as tf
import tensorflow_decision_forests as tfdf

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


**Load DataSet**

In [None]:
# Read the training split (it carries the "Survived" label) and the serving
# split (the Kaggle test file, scored later for the submission).
train_df, serving_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Preview the first rows of the training data.
train_df.head(5)

In [None]:
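# Prepare the dataset:
# 1. Normalize the name: strip punctuation so that it can later be split into tokens,
#    e.g. "Braund, Mr. Owen Harris" ends up as the tokens ["Braund", "Mr", "Owen", "Harris"].
# 2. Split the ticket into its prefix (if any) and its number,
#    e.g. "STON/O2. 3101282" becomes "STON/O2." and "3101282".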

In [None]:
#function that takes in a dataframe df as input
def preprocess(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    The copy differs from the input in three columns:

    * ``Name``: every space-separated word has the characters ``,()[].`` and
      both kinds of quote stripped from its ends.
    * ``Ticket_number`` (new): the part of ``Ticket`` after its last space.
    * ``Ticket_item`` (new): the part of ``Ticket`` before its last space, with
      inner spaces replaced by ``_``, or ``"NONE"`` when the ticket has no space.

    Args:
        df: DataFrame with string columns ``Name`` and ``Ticket``.

    Returns:
        A new DataFrame with the columns described above.
    """
    strip_chars = ",()[].\"'"

    def clean_name(full_name):
        # Strip punctuation from both ends of each word, keep single-space joins.
        return " ".join(word.strip(strip_chars) for word in full_name.split(" "))

    def ticket_suffix(ticket):
        # Text after the last space (the whole ticket when there is no space).
        return ticket.rpartition(" ")[2]

    def ticket_prefix(ticket):
        # Text before the last space, joined with underscores.
        head, separator, _ = ticket.rpartition(" ")
        if not separator:
            return "NONE"
        return head.replace(" ", "_")

    result = df.copy()
    result["Name"] = result["Name"].map(clean_name)
    result["Ticket_number"] = result["Ticket"].map(ticket_suffix)
    result["Ticket_item"] = result["Ticket"].map(ticket_prefix)
    return result

# Apply the preprocess function defined above to both the training and the
# serving DataFrames.
preprocessed_train_df, preprocessed_serving_df = (
    preprocess(raw_df) for raw_df in (train_df, serving_df)
)

# Preview the preprocessed training data.
preprocessed_train_df.head(5)

In [None]:
# Every column of the preprocessed training frame is a candidate feature,
# except the raw ticket string, the passenger identifier and the label.
# "Ticket_number" stays in the list; dropping it as well is left disabled.
input_features = preprocessed_train_df.columns.drop(
    ["Ticket", "PassengerId", "Survived"]
).tolist()

print(f"Input features: {input_features}")

In [None]:
def tokenize_names(features, labels=None):
    """Replace the "Name" feature with the result of ``tf.strings.split``.

    Written to be passed to a dataset's ``map``: it takes the feature mapping
    (and optionally the labels) and returns them as a pair.

    Args:
        features: Mapping of feature name to value; its "Name" entry is
            overwritten in place with the split tokens.
        labels: Passed through unchanged (``None`` for unlabelled data).

    Returns:
        The ``(features, labels)`` pair.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# tfdf.keras.pd_dataframe_to_tf_dataset converts a pandas DataFrame into a
# TensorFlow dataset. label="Survived" marks the target column for supervised
# learning; the serving data is converted without a label.
# tokenize_names is then mapped over each dataset.
# NOTE(review): the converter presumably yields batches, so the map runs once
# per batch rather than once per row - confirm against the TF-DF docs.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    preprocessed_train_df,
    label="Survived",
).map(tokenize_names)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    preprocessed_serving_df,
).map(tokenize_names)

In [None]:
# Baseline: a GradientBoostedTreesModel with the default hyper-parameters.
#
# Gradient boosted trees are an ensemble of decision trees built one after the
# other, each new tree being fitted to reduce the error left by the previous ones.
model = tfdf.keras.GradientBoostedTreesModel(
    # Train only on the columns listed in input_features ...
    features=[
        tfdf.keras.FeatureUsage(name=feature_name)
        for feature_name in input_features
    ],
    # ... and ignore every other column of the dataset.
    exclude_non_specified_features=True,
    # Fixed random seed for reproducibility.
    random_seed=1234,
    # Reduce the amount of logging.
    verbose=0,
)

# Train the model on the training dataset.
model.fit(train_ds)

# Print the self-evaluation recorded with the trained model.
# NOTE(review): for gradient boosted trees this is presumably computed on an
# internal validation split, not on the full training set - confirm.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

In [None]:
# Second model: same input features, hand-picked hyper-parameters instead of
# the defaults.
model = tfdf.keras.GradientBoostedTreesModel(
    # --- Inputs and bookkeeping -------------------------------------------
    features=[
        tfdf.keras.FeatureUsage(name=feature_name)
        for feature_name in input_features
    ],
    exclude_non_specified_features=True,
    random_seed=1234,
    verbose=0,

    # --- Ensemble size and learning rate ----------------------------------
    # Number of trees in the ensemble.
    num_trees=2000,
    # Learning rate: contribution of each tree to the final prediction.
    # Smaller values train more slowly.
    shrinkage=0.05,

    # --- Split search -----------------------------------------------------
    # Minimum number of examples in a node; lower values allow more granular
    # splits.
    min_examples=1,
    # Select categorical splits from randomized subsets of categories.
    categorical_algorithm="RANDOM",
    # Allow splits on linear combinations of features rather than on a single
    # feature.
    split_axis="SPARSE_OBLIQUE",
    # Min-max normalize the data before the oblique splits are evaluated.
    sparse_oblique_normalization="MIN_MAX",
    # Higher values mean more projections are tried, i.e. more complex splits.
    sparse_oblique_num_projections_exponent=2.0,

    # --- Optional knobs, left disabled ------------------------------------
    # compute_permutation_variable_importance=True,  # slower; permutation-based feature importance
    # hyperparameter_template="benchmark_rank1@v1",  # could override the manual parameters above
    # max_depth=4,  # limit the tree depth to control overfitting
    # num_candidate_attributes_ratio=0.2,  # proportion of features considered for each split
    # validation_ratio=0.0,  # ratio of training data reserved for validation
    # num_trees=1000,  # alternative to the 2000 used above
    # tuner=tuner,
)

# Train the model on the training dataset.
model.fit(train_ds)

# Print the self-evaluation recorded with the trained model.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

In [None]:
# Display a summary of the most recently trained model.
model.summary()

In [None]:
#Predictions
def prediction_format(model, threshold=0.5):
    """Score the serving dataset and build the submission table.

    Reads the module-level ``test_ds`` (dataset to score) and ``serving_df``
    (DataFrame loaded from test.csv, which supplies the passenger ids).

    Args:
        model: Trained model whose ``predict`` returns a 2-D array; column 0
            is used as the survival probability.
        threshold: Cutoff probability; a passenger is predicted to survive
            when the probability is greater than or equal to it.

    Returns:
        DataFrame with the columns "PassengerId" and "Survived" (0 or 1).
    """
    pb_survive = model.predict(test_ds, verbose=0)[:, 0]
    return pd.DataFrame({
        # The test CSV is loaded as `serving_df`; no `test_df` exists in this
        # notebook, so referring to it raised a NameError on a fresh kernel.
        "PassengerId": serving_df["PassengerId"],
        "Survived": (pb_survive >= threshold).astype(int),
    })
    
def make_submission(predictions, path="/kaggle/working/submission.csv"):
    """Write the predictions to a CSV file and report where it was saved.

    Args:
        predictions: DataFrame to export; it is written without its index.
        path: Destination file. Defaults to the Kaggle working directory
            location used for the competition submission.
    """
    predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")

# Build the predictions with the most recently trained `model`, export them,
# then preview the first lines of the exported CSV through a shell command.
predictions = prediction_format(model)
make_submission(predictions)
!head /kaggle/working/submission.csv