# Neural Networks

### using titanic dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# import standardScal

dataset = pd.read_csv('../Datasets/titanic/train.csv')

test_dataset = pd.read_csv('../Datasets/titanic/test.csv')


x_train = dataset.drop('PassengerId', axis=1)

y_train = x_train[['Survived']]

x_train.drop('Name', axis=1, inplace=True)
x_train.drop('Ticket', axis=1, inplace=True)
x_train.drop('Cabin', axis=1, inplace=True)
x_train.drop('Survived', axis=1, inplace=True)
x_train['Sex'] = x_train['Sex'].map({"male": 0, "female": 1})

age_bins = [0, 12, 18, 65, 100]
age_labels = ['child', 'teen', 'adult', 'senior']


x_train['AgeGroup'] = pd.cut(x_train['Age'], bins=age_bins, labels=age_labels)

x_train['AgeGroup'] = x_train['AgeGroup'].cat.codes

fare_bins = [-1, 10, 50, 100, 1000]
fare_labels = ['cheapest', 'economy', 'premium', 'luxury']

x_train['FareGroup'] = pd.cut(x_train['Fare'], bins=fare_bins, labels=fare_labels)

x_train['FareGroup'] = x_train['FareGroup'].cat.codes

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

embarked_encoded = encoder.fit_transform(x_train[['Embarked']])

encoded_cols = encoder.get_feature_names_out(['Embarked'])
x_train[encoded_cols] = embarked_encoded

x_train.drop(['Embarked', 'Embarked_nan', 'Fare'], axis=1, inplace=True)


x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)
print(x_train.isnull().sum())

x_train['FamilySize'] = x_train['SibSp'] + x_train['Parch'] + 1  # +1 for self

# Bin into categories
x_train['FamilyType'] = pd.cut(x_train['FamilySize'], bins=[0, 1, 4, 11], labels=['Solo', 'Small', 'Large'])

family_dummies = pd.get_dummies(x_train['FamilyType'], prefix='Family')
x_train = pd.concat([x_train, family_dummies], axis=1)
x_train.drop(['Parch', 'SibSp', 'FamilySize', 'FamilyType'], axis=1, inplace=True)

print(x_train.columns)


Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
AgeGroup      0
FareGroup     0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64
Index(['Pclass', 'Sex', 'Age', 'AgeGroup', 'FareGroup', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Family_Solo', 'Family_Small',
       'Family_Large'],
      dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)


In [1]:
x_test = test_dataset.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

x_test['Sex'] = x_test['Sex'].map({"male": 0, "female": 1})

x_test['AgeGroup'] = pd.cut(x_test['Age'], bins=age_bins, labels=age_labels)
x_test['AgeGroup'] = x_test['AgeGroup'].cat.codes

x_test['Fare'] = x_test['Fare'].fillna(x_test['Fare'].median())  # Fill missing with median
x_test['FareGroup'] = pd.cut(x_test['Fare'], bins=fare_bins, labels=fare_labels)

x_test['Embarked'] = x_test['Embarked'].fillna(x_test['Embarked'].mode()[0])
embarked_encoded_test = encoder.transform(x_test[['Embarked']])  # Use transform(), not fit_transform()
x_test[encoded_cols] = embarked_encoded_test
x_test.drop('Embarked', axis=1, inplace=True)

x_test['FamilySize'] = x_test['SibSp'] + x_test['Parch'] + 1  # +1 for self
x_test['FareGroup'] = x_test['FareGroup'].cat.codes

# Bin into categories
x_test['FamilyType'] = pd.cut(x_test['FamilySize'], bins=[0, 1, 4, 11], labels=['Solo', 'Small', 'Large'])
family_dummies = pd.get_dummies(x_test['FamilyType'], prefix='Family')
x_test = pd.concat([x_test, family_dummies], axis=1)

x_test.drop(['Parch', 'SibSp', 'FamilySize', 'FamilyType'], axis=1, inplace=True)

missing_cols = set(x_train.columns) - set(x_test.columns)
for col in missing_cols:
    x_test[col] = 0 

x_test = x_test[x_train.columns] 

x_test['Age'].fillna(x_train['Age'].mean(), inplace=True)




# print("\nTest Data Columns:")
print(x_test.columns)
# print("\nMissing Values in Test Data:")
# print(x_test.isnull().sum())


NameError: name 'test_dataset' is not defined

In [28]:
import tensorflow as tf

print(x_train.shape)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(11,)),
     tf.keras.layers.Dense(64, activation='relu'),   
    tf.keras.layers.Dense(32, activation='relu'),   
    tf.keras.layers.Dense(16, activation='relu'),   
    tf.keras.layers.Dense(1, activation='sigmoid') 
])

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(x_train, y_train,
            epochs=100,
            batch_size=8,
            validation_split=0.2,     
            callbacks=[early_stop])


y_test = pd.read_csv('../Datasets/titanic/gender_submission.csv')


test_loss, test_acc = model.evaluate(x_test, y_test['Survived'])
print(f"Test accuracy: {test_acc:.4f}")

(891, 11)
Epoch 1/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4842 - loss: 0.9745 - val_accuracy: 0.6704 - val_loss: 0.5816
Epoch 2/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6670 - loss: 0.6037 - val_accuracy: 0.8380 - val_loss: 0.5329
Epoch 3/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7371 - loss: 0.5620 - val_accuracy: 0.8212 - val_loss: 0.4689
Epoch 4/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7117 - loss: 0.5667 - val_accuracy: 0.8659 - val_loss: 0.4200
Epoch 5/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7258 - loss: 0.5341 - val_accuracy: 0.8492 - val_loss: 0.3986
Epoch 6/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7980 - loss: 0.4812 - val_accuracy: 0.8212 - val_loss: 0.4665
Epoch 7/100
[1m89/89[0

In [88]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize the decision tree
dt = DecisionTreeClassifier(
    max_depth=3,
    random_state=12,
    min_samples_split=10,
)

# Fit to training data
dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

acc = accuracy_score(y_test["Survived"], y_pred)
print(f"Decision Tree accuracy: {acc:.4f}")




Decision Tree accuracy: 0.9689
