In [1]:
import numpy as np 
import pandas as pd
import re

# Data Exploration

Variable Meanings
* survival - Survival - 0 = No, 1 = Yes
* pclass - Ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd
* sex - Sex
* Age - Age in year
* sibsp - # of siblings / spouses aboard the Titanic
* parch - # of parents / children aboard the Titanic
* ticket - Ticket number
* fare - Passenger fare
* cabin - Cabin number 
* embarked - Port of Embarkation - C = Cherbourg, Q = Queenstown, S = Southampton

In [2]:
dataset = pd.read_csv('/kaggle/input/titanic/train.csv')
dataset = dataset.set_index("PassengerId")
dataset.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


Data to clean:

* Normalize names
    * Delete AKA
    * Remove Miss, Mr, etc.
* Normalize Ticket
* Use Mean value for NAN Ages?
* Delete Cabin since has the less Non-Null values

In [4]:
df = dataset.drop(["Cabin"],axis=1)
df["Age"] = df["Age"].fillna(df['Age'].mean())

Get Unique Mr, Ms, etc for deletion

In [5]:
a = []
for i in df["Name"]:
    for data in re.findall(r'\b\w+\.', i):
        a.append(data)
drop = set(a)

In [6]:
import re

def normalizeName(texto):

    def deleteParentesis(texto):
        return re.sub(r'\([^)]*\)', '', texto)

    def deleteComillas(texto):
        return re.sub(r'\"[^\"]*\"', '', texto)

    def deleteComma(texto):
        texto = texto.replace(',','')
        return texto

    def deleteMrMsEtc(texto):
        texto_separado = texto.split(r' ')
        return " ".join(txt for txt in texto_separado if not txt in drop )
    
    texto = deleteParentesis(texto)
    texto = deleteComillas(texto)
    texto = deleteComma(texto)
    texto = deleteMrMsEtc(texto)
    return texto
    

df["Name"] = df["Name"].apply(normalizeName)
df["Name"]

PassengerId
1             Braund Owen Harris
2          Cumings John Bradley 
3                Heikkinen Laina
4        Futrelle Jacques Heath 
5            Allen William Henry
                 ...            
887              Montvila Juozas
888        Graham Margaret Edith
889    Johnston Catherine Helen 
890             Behr Karl Howell
891               Dooley Patrick
Name: Name, Length: 891, dtype: object

In [7]:
def normalizeName(texto):

    def deleteParentesis(texto):
        return re.sub(r'\([^)]*\)', '', texto)

    def deleteComillas(texto):
        return re.sub(r'\"[^\"]*\"', '', texto)

    def deleteComma(texto):
        texto = texto.replace(',','')
        return texto

    def deleteMrMsEtc(texto):
        texto_separado = texto.split(r' ')
        return " ".join(txt for txt in texto_separado if not txt in drop )
    
    texto = deleteParentesis(texto)
    texto = deleteComillas(texto)
    texto = deleteComma(texto)
    texto = deleteMrMsEtc(texto)
    return texto

def TitanicDataClean(df=None):
    
    #Dictionary containing relevant data of dataframe.
    extradata = {}
    if not isinstance(df,pd.DataFrame):
        raise ValueError('Must provide a Pandas dataframe')
        
    extradata['mean_age'] = df['Age'].mean()
    df = df.drop(["Cabin"],axis=1)
    df["Age"] = df["Age"].fillna(df['Age'].median())
    
    
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df['IsAlone'] = (df['FamilySize'] == 0).astype(int)
    # Codificar la variable categórica 'Sex' a valores numéricos
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    
    df["Name"] = df["Name"].apply(normalizeName)
    return df,extradata

TitanicDataClean(dataset)

(             Survived  Pclass                       Name  Sex   Age  SibSp  \
 PassengerId                                                                  
 1                   0       3         Braund Owen Harris    0  22.0      1   
 2                   1       1      Cumings John Bradley     1  38.0      1   
 3                   1       3            Heikkinen Laina    1  26.0      0   
 4                   1       1    Futrelle Jacques Heath     1  35.0      1   
 5                   0       3        Allen William Henry    0  35.0      0   
 ...               ...     ...                        ...  ...   ...    ...   
 887                 0       2            Montvila Juozas    0  27.0      0   
 888                 1       1      Graham Margaret Edith    1  19.0      0   
 889                 0       3  Johnston Catherine Helen     1  28.0      1   
 890                 1       1           Behr Karl Howell    0  26.0      0   
 891                 0       3             Dooley Pa