In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout

# Load the training and test sets from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Both frames receive identical feature engineering. The list holds
# references to the frame objects, so column assignments made through it
# are visible on `train` and `test` as well.
train_test_data = [train, test]

# Extract the honorific: the run of ASCII letters immediately before the
# first period in Name. A raw string is used so that `\.` is a regex escape
# rather than an (invalid) Python string escape.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Ordinal codes for titles: the three common titles get their own code and
# every other listed title shares code 3.
# "Mlle" was previously misspelled "Mile", which left that title unmapped.
title_mapping = {
    "Mr": 0,
    "Miss": 1,
    "Mrs": 2,
    "Master": 3,
    "Dr": 3,
    "Rev": 3,
    "Col": 3,
    "Major": 3,
    "Mlle": 3,
    "Countess": 3,
    "Ms": 3,
    "Lady": 3,
    "Jonkheer": 3,
    "Don": 3,
    "Dona": 3,
    "Mme": 3,
    "Capt": 3,
    "Sir": 3,
}

# Titles absent from the mapping become NaN (Series.map semantics).
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)

# Name is no longer needed once Title has been derived from it.
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

# Fill missing ages with the median age of rows sharing the same Title,
# computed separately within each frame.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame["Age"]`: that chained form operates
# on an intermediate object and is deprecated by pandas.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

# Replace Age with an ordinal band code 0-4. Bands are open on the left and
# closed on the right: <=16, (16, 26], (26, 40], (40, 60], >60.
# Missing ages satisfy no comparison and therefore stay NaN.
for frame in train_test_data:
    # Masks are built from a snapshot so that codes already written cannot
    # influence later comparisons.
    age = frame["Age"].copy()
    band_masks = [
        age.le(16),
        age.gt(16) & age.le(26),
        age.gt(26) & age.le(40),
        age.gt(40) & age.le(60),
        age.gt(60),
    ]
    for band_code, in_band in enumerate(band_masks):
        frame.loc[in_band, "Age"] = band_code

# Missing embarkation ports default to 'S'.
for dataset in train_test_data:
    dataset["Embarked"] = dataset["Embarked"].fillna('S')

# Fill missing fares with the median fare of the same Pclass, computed
# separately within each frame.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame["Fare"]`: that chained form operates
# on an intermediate object and is deprecated by pandas.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

# Replace Fare with an ordinal band code 0-3. Bands are open on the left and
# closed on the right: <=17, (17, 30], (30, 100], >100.
# Missing fares satisfy no comparison and therefore stay NaN.
for frame in train_test_data:
    # Masks are built from a snapshot so that codes already written cannot
    # influence later comparisons.
    fare = frame["Fare"].copy()
    band_masks = [
        fare.le(17),
        fare.gt(17) & fare.le(30),
        fare.gt(30) & fare.le(100),
        fare.gt(100),
    ]
    for band_code, in_band in enumerate(band_masks):
        frame.loc[in_band, "Fare"] = band_code

# Numeric codes keyed by a single cabin letter.
cabin_mapping = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2, "G": 2.4, "T": 2.8}
for dataset in train_test_data:
    # The mapping keys are single letters, so look up the first character of
    # each cabin string. Mapping the raw value would only match entries that
    # are exactly one letter long and turn everything else into NaN.
    # NOTE(review): assumes Cabin holds strings whose first character is the
    # deck letter -- confirm against the CSV.
    dataset["Cabin"] = dataset["Cabin"].str[:1].map(cabin_mapping)

# Fill cabins that are still missing with the median code of the same
# Pclass, computed separately within each frame. Assigned back to the column
# because chained `fillna(..., inplace=True)` is deprecated by pandas.
train["Cabin"] = train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"))
test["Cabin"] = test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"))

# FamilySize = siblings/spouses + parents/children + 1.
for frame in (train, test):
    frame["FamilySize"] = frame["SibSp"].add(frame["Parch"]).add(1)

# Rescale family sizes 1..11 to codes in steps of 0.4; sizes outside the
# table become NaN (Series.map semantics).
family_mapping = {
    1: 0,
    2: 0.4,
    3: 0.8,
    4: 1.2,
    5: 1.6,
    6: 2,
    7: 2.4,
    8: 2.8,
    9: 3.2,
    10: 3.6,
    11: 4,
}
for frame in train_test_data:
    frame["FamilySize"] = frame["FamilySize"].map(family_mapping)

# Columns dropped from both frames. PassengerId is additionally dropped
# from train only; test keeps it for the submission file.
features_drop = ["Ticket", "SibSp", "Parch"]
test = test.drop(columns=features_drop)
train = train.drop(columns=features_drop + ["PassengerId"])

# Separate the label column from the remaining training columns.
target = train["Survived"]
train_data = train.drop(columns="Survived")

# Bare expression: only displayed when it is the last statement of a
# notebook cell; it has no effect when run as a script.
train_data.shape, target.shape

# Data preprocessing function: encodes the remaining categorical columns
# and selects the model features.
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return the model feature columns of *df* with Sex and Embarked encoded.

    Sex is mapped ``male -> 0``, ``female -> 1`` and Embarked is mapped
    ``S -> 0``, ``C -> 1``, ``Q -> 2``; any other value becomes NaN.

    The input frame is left untouched. Previously the encoding was written
    back into *df*, so calling the function a second time on the same frame
    (for example when re-running a notebook cell) mapped the already-encoded
    numbers to NaN.

    Args:
        df: Frame containing at least the columns Pclass, Sex, Title, Age,
            Embarked, Cabin, FamilySize and Fare.

    Returns:
        A new frame holding exactly those eight columns, in that order.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    features = ['Pclass', 'Sex', 'Title', 'Age', 'Embarked', 'Cabin', 'FamilySize', 'Fare']

    # Work on an independent copy so the caller's frame is never modified.
    encoded = df[features].copy()
    encoded['Sex'] = encoded['Sex'].map({'male': 0, 'female': 1})
    encoded['Embarked'] = encoded['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    return encoded


# Feature matrices for both sets, plus the training labels.
X = preprocess_data(train)
X_test = preprocess_data(test)
y = train['Survived']


# Standardise features: the scaler's statistics are learned from the
# training features only and then applied to both sets.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)


# kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=113)


# fold_accuracies = []


# for fold, (train_index, val_index) in enumerate(kf.split(X_scaled, y)):
#     print(f"\nTraining fold {fold + 1}...")


#     X_train, X_val = X_scaled[train_index], X_scaled[val_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[val_index]


#     final_model = Sequential()
#     final_model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
#     final_model.add(Dropout(0.5))
#     final_model.add(Dense(32, activation='relu'))
#     final_model.add(Dropout(0.3))
#     final_model.add(Dense(1, activation='sigmoid'))  # Binary classification, so use sigmoid


#     lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
#     early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


#     final_model.compile(optimizer=Adam(learning_rate=0.005), loss='binary_crossentropy', metrics=['accuracy'])


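#     # NOTE(review): the call below fits on the full X_scaled/y, so this fold's validation rows
#     # are seen during training. Fit on X_train/y_train (e.g. with validation_data=(X_val, y_val))
#     # before re-enabling this block.
#     final_model.fit(X_scaled, y, epochs=50, batch_size=64, validation_split=0.2, callbacks=[early_stopping, lr_scheduler], verbose=1)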


#     val_loss, val_accuracy = final_model.evaluate(X_val, y_val, verbose=0)
#     fold_accuracies.append(val_accuracy)
#     print(f"Validation Accuracy for fold {fold + 1}: {val_accuracy:.4f}")

# #Compute the average accuracy over all folds
# avg_accuracy = np.mean(fold_accuracies)
# print(f"\nAverage Cross-Validation Accuracy: {avg_accuracy:.4f}")

# Train the final model on the whole training set; Keras holds out 20% of
# the rows for validation via `validation_split`.
# The input shape is declared with an explicit Input object instead of the
# `input_dim` argument on the first Dense layer, which current Keras
# releases discourage for Sequential models.
final_model = Sequential([
    tf.keras.Input(shape=(X_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary target
])

final_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
final_model.fit(X_scaled, y, epochs=50, batch_size=64, validation_split=0.2, verbose=0)

# Score the test set and threshold the model output at 0.5 to obtain
# hard 0/1 labels.
model_output = final_model.predict(X_test_scaled)
test_pred = (model_output > 0.5).astype("int32")

# Build the submission table (PassengerId, Survived) and write it to disk.
submission = test[['PassengerId']].assign(Survived=test_pred.flatten())
submission.to_csv('submission_tf_cv8.csv', index=False)
print('\nSubmission file has been created.')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(train.groupby("Title")["Age"].transform("median"), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["Age"].fillna(test.groupby("Title")["Age"].transform("median"), inplace=True)
The behavior will change in pandas 3.0. This inplace method will

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Submission file has been created.
