## Here we are going to perform the Titanic ⛴⚓️ data analysis

In [130]:
import pandas as pd
import numpy as np
import sklearn

import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Data Discription
<img src="data/titanic_data.png" alt="data description" width="500"/>

In [14]:
## Import the data
org_data = pd.read_csv('data/train.csv')

In [68]:
# Make the copy of the dataset
data = org_data.copy()

In [69]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [70]:
# Check weather data having stings 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [71]:
# Check weather data having missing values
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [72]:
# Lets create a function to display dataframe for the Unique Values, Total Values, Missing Values, Data Types
def data_analysis(data):
    '''
    data - Enter your data frame
    
    Calculate the data types,Total Number of Values in each column, Unique values in each column and missing values in each column 
    '''
    unique_values = {}

    for column_name, column_values in data.items():
        unique_values[column_name] = {
            'Data Type' : column_values.dtype,
            'Total Values' : len(column_values),
            'Unique Values' : len(column_values.unique()),
            'Missing Values' : column_values.isna().sum()
        }

    df = pd.DataFrame.from_dict(unique_values, orient='index')
    return df

data_analysis(data)

Unnamed: 0,Data Type,Total Values,Unique Values,Missing Values
PassengerId,int64,891,891,0
Survived,int64,891,2,0
Pclass,int64,891,3,0
Name,object,891,891,0
Sex,object,891,2,0
Age,float64,891,89,177
SibSp,int64,891,7,0
Parch,int64,891,7,0
Ticket,object,891,681,0
Fare,float64,891,248,0


In [73]:
# We know need names because they are all unique and dosen't show any effect on the model
data.drop("Name", axis=1, inplace=True)

In [111]:
# Lets convert Ticket column
import re

# Assuming 'df' is your DataFrame
def encode_ticket(df):
    # Method 1: Extract numeric part and use it as an integer
    def extract_numeric(ticket):
        # Find all numeric parts in the ticket
        numbers = re.findall(r'\d+', ticket)
        if numbers:  # If there are numbers in the ticket
            return int(numbers[-1])  # Return the last numeric part
        else:
            return None  # If no numbers are found, return None
        
    df['Ticket_Numeric'] = df['Ticket'].apply(extract_numeric)
    
    # Method 2: Hash the entire ticket string
    df['Ticket_Hash'] = df['Ticket'].apply(lambda x: hash(x) % 10**8)  # Limit to 8 digits
    
    # Method 3: Frequency encoding
    ticket_freq = df['Ticket'].value_counts(normalize=True)
    df['Ticket_Freq'] = df['Ticket'].map(ticket_freq)
    
    return df

# Apply the encoding
data = encode_ticket(data)
data[:10].T
data['Ticket_Numeric'] = data['Ticket_Numeric'].fillna(0)

In [105]:
def encode_cabin(df):
    # Extract the cabin letter and number
    df['Cabin'] = df['Cabin'].fillna('U0')  # Fill NaN with 'U0'
    df['Cabin_Letter'] = df['Cabin'].astype(str).apply(lambda x: x[0])
    df['Cabin_Number'] = df['Cabin'].astype(str).apply(lambda x: int(re.findall(r'\d+', x)[0]) if re.findall(r'\d+', x) else 0)
    
    # Encode the cabin letter
    letter_mapping = {letter: i for i, letter in enumerate('ABCDEFTGU', 1)}
    df['Cabin_Letter_Encoded'] = df['Cabin_Letter'].map(letter_mapping)
    
    # Combine letter and number
    df['Cabin_Encoded'] = df['Cabin_Letter_Encoded'] * 1000 + df['Cabin_Number']
    
    # Encode unknown (previously NaN) values as 0000
    df.loc[df['Cabin'] == 'U0', 'Cabin_Encoded'] = 0
    
    return df

def encode_embarked(df):
    # Simple mapping for Embarked
    embarked_mapping = {'C': 1, 'Q': 2, 'S': 3}
    df['Embarked_Encoded'] = df['Embarked'].fillna('Unknown').map(embarked_mapping)
    df['Embarked_Encoded'] = df['Embarked_Encoded'].fillna(0)  # Encode unknown (previously NaN) values as 0
    
    return df

data = encode_cabin(data)
data = encode_embarked(data)

In [114]:
data[:10].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
PassengerId,1,2,3,4,5,6,7,8,9,10
Survived,0,1,1,1,0,0,0,0,1,1
Pclass,3,1,3,1,3,3,1,3,3,2
Sex,male,female,female,female,male,male,male,male,female,female
Age,22.0,38.0,26.0,35.0,35.0,,54.0,2.0,27.0,14.0
SibSp,1,1,0,1,0,0,0,3,0,1
Parch,0,0,0,0,0,0,0,1,2,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450,330877,17463,349909,347742,237736
Fare,7.25,71.2833,7.925,53.1,8.05,8.4583,51.8625,21.075,11.1333,30.0708
Cabin,U0,C85,U0,C123,U0,U0,E46,U0,U0,U0


In [113]:
data_analysis(data)

Unnamed: 0,Data Type,Total Values,Unique Values,Missing Values
PassengerId,int64,891,891,0
Survived,int64,891,2,0
Pclass,int64,891,3,0
Sex,category,891,2,0
Age,float64,891,89,177
SibSp,int64,891,7,0
Parch,int64,891,7,0
Ticket,category,891,681,0
Fare,float64,891,248,0
Cabin,object,891,148,0


In [116]:
# So first we change the Sex and Ticket into catogeories
for column_name, column_value in data.items():
    if pd.api.types.is_string_dtype(column_value):
        data[column_name] = column_value.astype('category')
        
        data[column_name] = pd.Categorical(column_value).codes + 1 # Converting them into integers
        
data_analysis(data)

Unnamed: 0,Data Type,Total Values,Unique Values,Missing Values
PassengerId,int64,891,891,0
Survived,int64,891,2,0
Pclass,int64,891,3,0
Sex,int8,891,2,0
Age,float64,891,89,177
SibSp,int64,891,7,0
Parch,int64,891,7,0
Ticket,int16,891,681,0
Fare,float64,891,248,0
Cabin,int16,891,148,0


In [123]:
# Converting the hash as intergers
data['Ticket_Hash'] = data['Ticket_Hash'].cat.codes

In [125]:
data.drop('Embarked', axis=1, inplace=True)

In [131]:
data['Age'] = data['Age'].fillna(data.Age.median())

In [132]:
data_analysis(data)

Unnamed: 0,Data Type,Total Values,Unique Values,Missing Values
PassengerId,int64,891,891,0
Survived,int64,891,2,0
Pclass,int64,891,3,0
Sex,int8,891,2,0
Age,float64,891,88,0
SibSp,int64,891,7,0
Parch,int64,891,7,0
Ticket,int16,891,681,0
Fare,float64,891,248,0
Cabin,int16,891,148,0


In [133]:
# Splitting the data into train and valid
np.random.seed(42)

x = data.drop('Survived', axis=1)
y = data['Survived']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2)

### Building the model

In [134]:
model1 = GradientBoostingClassifier()
model1.fit(x_train, y_train)
model1.score(x_valid, y_valid)

0.8044692737430168

In [135]:
model2 = RandomForestClassifier()
model2.fit(x_train, y_train)
model2.score(x_valid, y_valid)

0.8324022346368715

In [139]:
# Building a help function for plotting the above values
def plot_features(columns, importance, n=20):
    # Creating a dataframe for importance
    df_imp = (pd.DataFrame({'features': columns,
                           'feature_importances': importance})
             .sort_values('feature_importances', ascending=False)
             .reset_index(drop=True))
    
    # Plotting the data
    fig, ax = plt.subplots(figsize=(10,5))
    ax.barh(df_imp['features'][:n], df_imp['feature_importances'][:n])
    ax.set_ylabel("features")
    ax.set_xlabel("featuer_importance")
    ax.invert_yaxis()
    

In [140]:
plot_features(x_train.columns, model1.feature_importances_)

In [141]:
model1.feature_importances_

array([2.44953379e-02, 1.07965361e-01, 4.10870617e-01, 9.19709697e-02,
       2.09936951e-02, 5.91148355e-04, 4.16734665e-02, 6.66566403e-02,
       2.60227170e-02, 5.55557165e-02, 4.35635957e-02, 2.72931262e-02,
       0.00000000e+00, 1.40531940e-02, 3.87620712e-05, 5.43568158e-02,
       1.38988361e-02])