In [2]:
def titanic_data_processing(data, add_polynomial=True, poly_degree=2):
    y_data = data["Survived"] if ("Survived" in data.columns) else None
    passengerId_array = data["PassengerId"]
  
    
    columns_remove = ["PassengerId", "Survived", "Name", "Ticket", "Fare", "Cabin", "Embarked"]
    if("Survived" in data.columns):
        columns_remove.append("Survived")

    data.replace({"Sex": {"male": 1, "female": 0}}, inplace = True)
    data = data.drop(columns = columns_remove, errors = "ignore")
    data["Age"].fillna(data["Age"].mean(), inplace=True)
    dataset = data.to_numpy()
    
    y = y_data.to_numpy() if y_data is not None else None
    x = dataset

    #sklearn stuff

    feature_names = data.columns
    if not add_polynomial:
        return x, y, passengerId_array, feature_names

    x_norm = z_score_normalization(x)
    #polynomial features
    poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
    x_poly = poly.fit_transform(x_norm)
    
    poly_feature_names = poly.get_feature_names_out(feature_names)
    print(f"Original features ({len(feature_names)}): {feature_names.tolist()}")
    print(f"Polynomial features ({len(poly_feature_names)}): {poly_feature_names.tolist()}")
    
    return x_poly, y, passengerId_array, poly_feature_names
    

In [3]:
def z_score_normalization(x_data):
    
    mu = np.mean(x_data, axis = 0)
    sigma = np.std(x_data, axis = 0)
    x_norm = (x_data - mu) / sigma
    return x_norm


In [22]:
#data processing

filename = input("Please enter the training file name: ")
train_data = pd.read_csv(filename)
use_poly = True
poly_degree = 3
x_train, y_train, passengerID_train, feature_names = titanic_data_processing(
        train_data, add_polynomial=use_poly, poly_degree=poly_degree)
#passengerId_array = data["PassengerId"]
#y_data = data["Survived"]
#columns_remove = ["PassengerId", "Survived", "Name", "Ticket", "Fare", "Cabin", "Embarked"]
#data.replace({"Sex": {"male": 1, "female": 0}}, inplace = True)
#data = data.drop(columns = columns_remove)
#data.dropna(inplace=True)
#dataset = data.to_numpy()

#y_train = y_data.to_numpy()
#x_train = dataset

print("x: ", x_train[0:10, :])
print("y: ", y_train[0:10])

x_train = z_score_normalization(x_train)

print("Normalized x: ", x_train[0:10])


Please enter the training file name:  train.csv


Original features (5): ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
Polynomial features (55): ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Pclass^2', 'Pclass Sex', 'Pclass Age', 'Pclass SibSp', 'Pclass Parch', 'Sex^2', 'Sex Age', 'Sex SibSp', 'Sex Parch', 'Age^2', 'Age SibSp', 'Age Parch', 'SibSp^2', 'SibSp Parch', 'Parch^2', 'Pclass^3', 'Pclass^2 Sex', 'Pclass^2 Age', 'Pclass^2 SibSp', 'Pclass^2 Parch', 'Pclass Sex^2', 'Pclass Sex Age', 'Pclass Sex SibSp', 'Pclass Sex Parch', 'Pclass Age^2', 'Pclass Age SibSp', 'Pclass Age Parch', 'Pclass SibSp^2', 'Pclass SibSp Parch', 'Pclass Parch^2', 'Sex^3', 'Sex^2 Age', 'Sex^2 SibSp', 'Sex^2 Parch', 'Sex Age^2', 'Sex Age SibSp', 'Sex Age Parch', 'Sex SibSp^2', 'Sex SibSp Parch', 'Sex Parch^2', 'Age^3', 'Age^2 SibSp', 'Age^2 Parch', 'Age SibSp^2', 'Age SibSp Parch', 'Age Parch^2', 'SibSp^3', 'SibSp^2 Parch', 'SibSp Parch^2', 'Parch^3']
x:  [[ 8.27377244e-01  7.37695132e-01 -5.92480600e-01  4.32793366e-01
  -4.73673609e-01  6.84553104e-01  6.10352165

  data.replace({"Sex": {"male": 1, "female": 0}}, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Age"].fillna(data["Age"].mean(), inplace=True)


In [5]:
#sigmoid function
def sigmoid(z):
    g = 1 / (1 + np.e ** -z)
    return g

In [6]:
def compute_cost(x, y, w, b, lambda_ = 1):
    #w is 1D vector
    #x = 2D
    #y = 1D
    #b = 1D
    
    m = x.shape[0]
    total_cost = 0
    loss_sum = 0
    regularize = 0
    for i in range(m):
        z_i = np.dot(w, x[i]) + b
        f_wb_i = sigmoid(z_i)
        loss = (-y[i] * np.log(f_wb_i)) - (1 - y[i]) * np.log(1 - f_wb_i)
        loss_sum += loss
    total_cost = loss_sum / m
    for i in range(m):
        reg = w ** 2
        regularize += reg
    regularize = (lambda_ / (2 * m)) * regularize
    total_cost += regularize
    return total_cost

In [7]:
def compute_gradient(x, y, w, b, lambda_ = 1):
    m = x.shape[0]
    dj_dw = 0
    dj_db = 0
    regularize = 0
    for i in range(m):
        z_i = np.dot(w, x[i]) + b
        f_wb_i = sigmoid(z_i)
        
        dj_dw_i = np.dot(f_wb_i - y[i], x[i])
        dj_db_i = (f_wb_i - y[i])
        
        dj_dw += dj_dw_i
        dj_db += dj_db_i
    dj_dw /= m
    dj_db /= m
    
    regularize = (lambda_ / m) * w
    dj_dw = dj_dw + regularize
    return dj_dw, dj_db
        

In [8]:
def gradient_descent(x, y, w, b, cost, gradient, alpha, num_iters, lambda_ = 1):
    interval = max(1, round(num_iters / 10))
    for i in range (num_iters):
        
        dj_dw, dj_db = gradient(x, y, w, b, lambda_)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        if(i % interval == 0 or i < 10 or i == num_iters - 1):
            cost_i = cost(x, y, w, b, lambda_)
        
            print(f"Iteration: {i}: Cost = {cost_i}")
        
    return w, b

In [60]:
#run gradient descent
np.random.seed(3)
initial_w = np.random.randn(x_train.shape[1])
print("Initial W: ", initial_w)
initial_b = 0.5
iterations = 50000
alpha = 0.001
#print(np.isnan(x_train).sum())  # Should be 0
#print(np.isnan(y_train).sum())

w, b = gradient_descent(x_train, y_train, initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations, 1)

print("w: ", w)
print("b: ", b)

Initial W:  [ 1.78862847  0.43650985  0.09649747 -1.8634927  -0.2773882  -0.35475898
 -0.08274148 -0.62700068 -0.04381817 -0.47721803 -1.31386475  0.88462238
  0.88131804  1.70957306  0.05003364 -0.40467741 -0.54535995 -1.54647732
  0.98236743 -1.10106763 -1.18504653 -0.2056499   1.48614836  0.23671627
 -1.02378514 -0.7129932   0.62524497 -0.16051336 -0.76883635 -0.23003072
  0.74505627  1.97611078 -1.24412333 -0.62641691 -0.80376609 -2.41908317
 -0.92379202 -1.02387576  1.12397796 -0.13191423 -1.62328545  0.64667545
 -0.35627076 -1.74314104 -0.59664964 -0.58859438 -0.8738823   0.02971382
 -2.24825777 -0.26776186  1.01318344  0.85279784  1.1081875   1.11939066
  1.48754313]
Iteration: 0: Cost = [2.81761075 1.31343569 1.22288568 2.95437168 1.25668618 1.28114522
 1.221656   1.41477755 1.21919644 1.3321075  2.08113855 1.60942558
 1.60658325 2.67950286 1.21948793 1.30010694 1.36695183 2.41401748
 1.70073808 1.82441636 1.9205181  1.23941165 2.32244715 1.24627366
 1.74223056 1.47251051 1.413

In [61]:
def predict(x, w, b):

    z_wb = np.dot(x, w) + b
    f_wb = sigmoid(z_wb)
    p = np.round(f_wb)
    return p.astype(int)

In [62]:

test_filename = input("Please enter the test file name: ")
test_data = pd.read_csv(test_filename)

use_poly = True
poly_degree = 3
x_test, y_test, passengerId, ___ = titanic_data_processing(test_data, add_polynomial=use_poly, poly_degree=poly_degree)
x_test = z_score_normalization(x_test)

results = predict(x_test, w, b)
print(results)

Please enter the test file name:  test.csv


Original features (5): ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
Polynomial features (55): ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Pclass^2', 'Pclass Sex', 'Pclass Age', 'Pclass SibSp', 'Pclass Parch', 'Sex^2', 'Sex Age', 'Sex SibSp', 'Sex Parch', 'Age^2', 'Age SibSp', 'Age Parch', 'SibSp^2', 'SibSp Parch', 'Parch^2', 'Pclass^3', 'Pclass^2 Sex', 'Pclass^2 Age', 'Pclass^2 SibSp', 'Pclass^2 Parch', 'Pclass Sex^2', 'Pclass Sex Age', 'Pclass Sex SibSp', 'Pclass Sex Parch', 'Pclass Age^2', 'Pclass Age SibSp', 'Pclass Age Parch', 'Pclass SibSp^2', 'Pclass SibSp Parch', 'Pclass Parch^2', 'Sex^3', 'Sex^2 Age', 'Sex^2 SibSp', 'Sex^2 Parch', 'Sex Age^2', 'Sex Age SibSp', 'Sex Age Parch', 'Sex SibSp^2', 'Sex SibSp Parch', 'Sex Parch^2', 'Age^3', 'Age^2 SibSp', 'Age^2 Parch', 'Age SibSp^2', 'Age SibSp Parch', 'Age Parch^2', 'SibSp^3', 'SibSp^2 Parch', 'SibSp Parch^2', 'Parch^3']
[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0

  data.replace({"Sex": {"male": 1, "female": 0}}, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Age"].fillna(data["Age"].mean(), inplace=True)


In [63]:
results = results.flatten()
results = pd.DataFrame({"PassengerId": passengerId, 
                        "Survived": results})
results.to_csv("Results.csv", index = False)
with open("Results.txt", "w") as file:
    for i in range(x_test.shape[0]):
        passenger_result = f"{passengerId[i]}, {results.iloc[i]['Survived']}\n"
        file.write(passenger_result)

In [64]:
#not my code: accuracy prediction:

def evaluate_model(y_true, y_pred):
    """
    Evaluate model performance with various metrics
    """
    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    # Calculate precision, recall, f1-score
    report = classification_report(y_true, y_pred)
    
    print(f"\nModel Evaluation:")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"\nConfusion Matrix:")
    print(conf_matrix)
    print(f"\nClassification Report:")
    print(report)
    
    return accuracy

In [65]:
 print("\nMaking predictions...")
 train_predictions = predict(x_train, w, b)
 train_accuracy = evaluate_model(y_train, train_predictions)
 print(train_accuracy)


Making predictions...

Model Evaluation:
Accuracy: 0.8215 (82.15%)

Confusion Matrix:
[[500  49]
 [110 232]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       549
           1       0.83      0.68      0.74       342

    accuracy                           0.82       891
   macro avg       0.82      0.79      0.80       891
weighted avg       0.82      0.82      0.82       891

0.8215488215488216
