# Neural Network AI - Titanic Survivors prediction - Classification problem.

Credit to dataset: ...

## 0. Import neccessary libraries

In [1]:
import tensorflow as tf
import sklearn as sk
import keras.layers
import pandas as pd
import numpy as np

## A. Prepare the data.

In [2]:
#Create train and test dataframes (for plotting)
test_dataframe = pd.read_csv('..\datasets\Titanic_Survivors\\test.csv',quotechar='"')
train_dataframe = pd.read_csv('..\datasets\Titanic_Survivors\\train.csv',quotechar='"')
#Replace NaN values with 0
test_dataframe.fillna(float(0), inplace=True)
train_dataframe.fillna(float(0), inplace=True)

In [3]:
test_dataframe.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0.0,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0.0,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0.0,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0.0,S


In [4]:
#Create train and test dataset
test_dataset = pd.DataFrame.to_numpy(test_dataframe)
train_dataset = pd.DataFrame.to_numpy(train_dataframe)

#Split the training dataset to data and targets
targets = train_dataset[:, 1]
train_dataset = np.delete(train_dataset, 1, 1)

pd.DataFrame(train_dataset)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0.0,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0.0,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0.0,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,0.0,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.45,0.0,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C


In [None]:
"""
from sklearn.ensemble import RandomForestClassifier

y = train_dataframe["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch","Fare"]
X = pd.get_dummies(train_dataframe[features], dummy_na=True)
X_test = pd.get_dummies(test_dataframe[features], dummy_na=True)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_dataframe.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
"""

### Embedding string values (Name, Ticket and Sex)
- Depending on who you are, how well-known is your name or your family name, the chances of you surviving can be higher.
- Furthermore, being in a group (ie. family) can increase your survival.
- So we need to somehow categorize names of each individuals to better improve the model.

>ps: i just want to try embedding strings ok...

### 1. Sex - Categorical Encoding
- We need to transfer the words "male" and "female" to numeric representations for the AI model to work with.
- For this, we use OneHotEncoder - which maps each unique label to an integer value.

In [5]:
from sklearn.preprocessing import OrdinalEncoder
#Insantiate the encoder
sex_encoder = OrdinalEncoder()
#Get the sex attribute column from train dataset
sex_Col = train_dataset[:,3].reshape(-1,1)
sex_Col
#Fit and transform the data, then flatten the output since OrdinalEncoder outputs 2-D arrays
enc_sex = sex_encoder.fit_transform(sex_Col).flatten()

In [6]:
train_dataset[:, 3] = enc_sex

train_dataset

array([[1, 3, 'Braund, Mr. Owen Harris', ..., 7.25, 0.0, 'S'],
       [2, 1, 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', ...,
        71.2833, 'C85', 'C'],
       [3, 3, 'Heikkinen, Miss. Laina', ..., 7.925, 0.0, 'S'],
       ...,
       [889, 3, 'Johnston, Miss. Catherine Helen "Carrie"', ..., 23.45,
        0.0, 'S'],
       [890, 1, 'Behr, Mr. Karl Howell', ..., 30.0, 'C148', 'C'],
       [891, 3, 'Dooley, Mr. Patrick', ..., 7.75, 0.0, 'Q']], dtype=object)

~~### 2. Ticket - Tf/Idf encoder~~
~~- Tickets are usually fairly unique to each other.~~
~~- So, to save memory but still preserve some sense of relationship that the model can figure out, we use **Tf/Idf Encoder**.~~
~~- As in, **Term Frequency** and **Inverse Document Frequency**.~~
~~- The encoder works by mapping every unique word to a real value, the more the word occurs in the document, the higher its "score".~~
~~- But it also minimizes words that are meaningless in terms of learning - like 'the','a',etc.~~

### 2. Ticket - Hashing Vectorizer
- We will hash all words into a hashing table, with corresponding hash values.
- **Problem with TF/IDF**: The shape of the output vector between train_dataset and test_dataset can be different (because their vocabulary is different)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

#Insantiate the encoder/vectorizer
#ticket_vectorizer = TfidfVectorizer(analyzer="word")
ticket_vectorizer = HashingVectorizer(n_features=2000)
#Get the ticket attribute column from train dataset
ticketCol = train_dataset[:, 7]
#Fit the training data to the vectorizer
ticket_vectorizer.fit(ticketCol)
enc_ticket = ticket_vectorizer.transform(ticketCol)
enc_ticket = enc_ticket.toarray()
enc_ticket

In [None]:
#Concatenate the new vectors to train dataset
train_dataset = np.concatenate((train_dataset, enc_ticket), axis=1)

train_dataset

### 3. Name

In [None]:
#Insantiate the encoder/vectorizer
#name_vectorizer = TfidfVectorizer(analyzer="word")
name_vectorizer = HashingVectorizer(n_features=2000)
#Get the ticket attribute column from train dataset
nameCol = train_dataset[:, 2]
#Fit the training data to the vectorizer
name_vectorizer.fit(nameCol)
enc_name = name_vectorizer.transform(nameCol)
enc_name = enc_name.toarray()
enc_name

In [None]:
#Concatenate the new vectors to train dataset
train_dataset = np.concatenate((train_dataset, enc_name), axis=1)

train_dataset

### 4. Embarked

In [9]:
#Insantiate the encoder
emb_encoder = OrdinalEncoder()
#Get the data
emb_col = train_dataset[:, 10].reshape(-1 , 1).astype(str)
enc_emb = emb_encoder.fit_transform(emb_col).flatten()

In [11]:
train_dataset[:,10] = enc_emb

train_dataset

array([[1, 3, 'Braund, Mr. Owen Harris', ..., 7.25, 0.0, 3.0],
       [2, 1, 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', ...,
        71.2833, 'C85', 1.0],
       [3, 3, 'Heikkinen, Miss. Laina', ..., 7.925, 0.0, 3.0],
       ...,
       [889, 3, 'Johnston, Miss. Catherine Helen "Carrie"', ..., 23.45,
        0.0, 3.0],
       [890, 1, 'Behr, Mr. Karl Howell', ..., 30.0, 'C148', 1.0],
       [891, 3, 'Dooley, Mr. Patrick', ..., 7.75, 0.0, 2.0]], dtype=object)

### Let's put everything into a callable class

In [14]:
class Preprocessor:
    def categoricalEncode(self, dataset, column):
        encoder = OrdinalEncoder()
        #Get the sex attribute column from train dataset
        data = dataset[:, column].reshape(-1,1).astype(str)
        #Fit and transform the data, then flatten the output since OrdinalEncoder outputs 2-D arrays
        enc_data = encoder.fit_transform(data).flatten()
        dataset[:, column] = enc_data
        return dataset

    def HashingEncode(self, dataset, column, n_features=2000):
        #Insantiate the encoder/vectorizer
        vectorizer = HashingVectorizer(n_features=n_features)
        vectorizer = name_vectorizer if column == 2 else ticket_vectorizer
        #Get the ticket attribute column from train dataset
        data = dataset[:, column]
        #Fit the training data to the vectorizer
        #vectorizer.fit(data)
        enc_data = vectorizer.transform(data)
        enc_data = enc_data.toarray()
        
        dataset = np.concatenate((dataset, enc_data), axis=1)
        return dataset
    
    def preprocess(self, dataset, categorical_columns = None, tf_idf_columns = None):
        
        #Categorical encoding
        if categorical_columns == None:
            #Sex
            dataset = self.categoricalEncode(dataset, 3)
            #Embarked
            dataset = self.categoricalEncode(dataset, 10)
        else:
            for col in categorical_columns:
                dataset = self.categoricalEncode(dataset, col)
        
        #Feature extraction (tf_idf) encoding
        if tf_idf_columns == None:
            #Name
            dataset = self.HashingEncode(dataset, 2)
            #Ticket
            dataset = self.HashingEncode(dataset, 7) 
        else:
            for col in tf_idf_columns:
                dataset = self.HashingEncode(dataset, col)

        return dataset

Now let's call the processor class to preprocess our test_dataset!

In [16]:
test_dataset[0]

array([892, 3, 'Kelly, Mr. James', 1.0, 34.5, 0, 0, '330911', 7.8292, 0.0,
       1.0], dtype=object)

In [15]:
processor = Preprocessor()

test_dataset = processor.categoricalEncode(test_dataset, 3)

test_dataset = processor.categoricalEncode(test_dataset, 10)

## B. Build the model.

In [19]:
test_dataframe

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0.0,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,0.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0.0,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0.0,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0.0,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,0.0,0,0,A.5. 3236,8.0500,0.0,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,0.0,S
416,1308,3,"Ware, Mr. Frederick",male,0.0,0,0,359309,8.0500,0.0,S


In [20]:
delete_features = [0, 2, 7, 9] #Omit "ID", "Cabin", old "Name", old "Ticket" and "Embarked" feature

train_dataset = np.delete(train_dataset, delete_features, axis=1)
test_dataset = np.delete(test_dataset, delete_features, axis=1)

In [21]:
#Convert dtype to float64
train_dataset = train_dataset.astype(np.float64)
test_dataset = test_dataset.astype(np.float64)
targets = targets.astype(np.float64)

In [22]:
#Layer to normalize training data
normalizer = tf.keras.layers.Normalization(axis=-1)

#Adapt the normalizer to data
normalizer.adapt(train_dataset)

print(normalizer.mean.numpy())

[[ 2.308642    0.64758706 23.799297    0.5230078   0.38159373 32.20421
   2.5297413 ]]


In [28]:
linear_model = tf.keras.Sequential([
                                    normalizer,
                                    tf.keras.layers.Dense(len(train_dataset[0]), activation='relu'), #You should have an input layer with the size of the input vector
                                    tf.keras.layers.Dense(1)],
                                    )

In [29]:
linear_model.predict(train_dataset[:10])



array([[ 0.28106833],
       [ 1.1892489 ],
       [-0.29475987],
       [-0.18169472],
       [ 0.6368168 ],
       [ 0.8815734 ],
       [ 0.5167075 ],
       [-0.3050456 ],
       [-0.15823054],
       [ 1.1784658 ]], dtype=float32)

In [30]:
linear_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss=tf.keras.losses.MeanSquaredError()
)

linear_model.fit(
    train_dataset,
    targets,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x214121a9890>

In [31]:
len(test_dataset[0]) == len(train_dataset[0])

True

In [32]:
#Predict test dataset
predictions = linear_model.predict(test_dataset)

#Some adjustments so there'd be no problem submitting
for member in predictions:
    if member[0] > 0.5:
        member[0] = 1
    else:
        member[0] = 0

predictions = predictions.astype(np.int32)
predictions = pd.DataFrame(predictions)
predictions



Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
413,0
414,1
415,0
416,0


In [33]:
output = pd.DataFrame({'PassengerId': test_dataframe.PassengerId, 'Survived': predictions[0]})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
