In [34]:
import pandas as pd
import numpy as np
import seaborn as sns

# Load seaborn's example Titanic dataset and preview its first five rows.
titanic = sns.load_dataset(name="titanic")
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [12]:
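# Disabled: rows with missing values are kept, and the SimpleImputer steps in
# the preprocessing pipeline below fill them in instead of dropping them.
#titanic = titanic.dropna()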

## Generar un modelo de Regresión Logística

In [35]:
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

**StandardScaler** -> standardizes the features (variables) of a dataset by removing the mean and scaling to unit variance.

**OneHotEncoder** -> converts categorical variables into binary indicator columns. Categorical variables take a finite set of possible values (e.g. "small", "medium", "large"). Each category gets its own column: the "small" column is 1 when the value is "small" and 0 otherwise, and likewise for "medium" and "large".  

**ColumnTransformer** -> applies different preprocessing steps to different columns. It is useful when a dataset mixes feature types (numerical, categorical, text) and each type needs its own preprocessing.

**Pipeline** -> chains a sequence of data preprocessing and model training steps into a single estimator, so the same sequence can be applied to new data in a seamless and efficient manner.



In [46]:
# Choose the model inputs, grouped by the kind of preprocessing they need.
numeric_features = ["age", "fare"]
categorical_features = ["sex", "embarked"]

# Target variable: the `survived` column.
y = titanic["survived"]
# Feature matrix (DataFrame): numeric columns first, then the categorical ones.
X = titanic[[*numeric_features, *categorical_features]]

# Imputers for missing values: column mean for numeric features,
# most frequent value for categorical features.
numeric_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Numeric branch: impute first, then standardize.
numeric_steps = [
    ("imputer", numeric_imputer),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: impute first, then one-hot encode into a dense array.
# Categories not seen during fit are ignored instead of raising an error.
categorical_steps = [
    ("imputer", categorical_imputer),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by a logistic regression with
# default hyperparameters.
classifier = LogisticRegression()
model = Pipeline(steps=[("preprocess", preprocessor), ("model", classifier)])

## Evaluarlo con un K-Fold de K=5

In [47]:
from sklearn.model_selection import cross_val_score

In [48]:
# Evaluate the whole pipeline with 5-fold cross-validation; one accuracy per fold.
# Note: with an integer `cv` and a classifier, scikit-learn builds stratified folds.
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=5)

# Report the per-fold scores, then their average.
for label, value in (
    ('Cross-validation scores:', cv_scores),
    ('Mean cross-validation score:', cv_scores.mean()),
):
    print(label, value)

Cross-validation scores: [0.79329609 0.80337079 0.75842697 0.75280899 0.79213483]
Mean cross-validation score: 0.7800075324838366


## Evaluar la accuracy y el classification report con un train/test

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [50]:
# Rebuild the feature matrix and target, then hold out 20% of the rows for
# testing; the fixed random_state keeps the split repeatable.
X = titanic[[*numeric_features, *categorical_features]]
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the pipeline on the training portion only.
model.fit(X_train, y_train)

# Predict on the held-out rows and compute the metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print('Accuracy:', accuracy)
print('Classification Report:\n', report)

Accuracy: 0.776536312849162
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.81       105
           1       0.74      0.70      0.72        74

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

