In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv('Titanic-Dataset.csv')

data.isnull().sum()

most_common_embarked = data['Embarked'].mode()[0]
data['Embarked'].fillna(most_common_embarked, inplace=True)

data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['Fare'].fillna(data['Fare'].mean(), inplace=True)

X = data.drop('Survived', axis=1)
y = data['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=35)

def sigmoid(a):
    return 1 / (1 + np.exp(-a))

def logistic(x_train, x_test, learning_rate=0.05, epochs=1500):
    c, d = x_train.shape
    theta = np.zeros(d)

    for epoch in range(epochs):
        t = np.dot(x_train, theta)
        n = sigmoid(t)
        gradient = np.dot(x_train.T, (n - y_train)) / c
        theta -= learning_rate * gradient

    return theta

theta = logistic(x_train.values, y_train.values)
y_pred = np.round(sigmoid(np.dot(x_test.values, theta)))

svm_classifier = SVC(kernel="linear")
svm_classifier.fit(x_train, y_train)
svm_pred = svm_classifier.predict(x_test)

def entropy(y):
    _, counts = np.unique(y, return_counts=True)
    probability = counts / len(y)
    return -np.sum(probability * np.log2(probability))

def info(x_col, y, threshold):
    left = x_col < threshold
    right = ~left
    left_entropy = entropy(y[left])
    right_entropy = entropy(y[right])
    p_en = entropy(y)
    return p_en - (len(y[left]) / len(y) * left_entropy + len(y[right]) / len(y) * right_entropy)

best_gain = 0
best_feature =0
best_threshold =0

for feature in range(x_train.shape[1]):
    thresholds = np.unique(x_train.values[:, feature])
    for threshold in thresholds:
        gain = info(x_train.values[:, feature], y_train.values, threshold)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
            best_threshold = threshold

tree_pred = np.zeros_like(y_test)
tree_pred[x_test.values[:, best_feature] < best_threshold] = 0
tree_pred[x_test.values[:, best_feature] >= best_threshold] = 1

random_data = pd.DataFrame({
    'Pclass': np.random.randint(1, 4, size=10),
    'Sex': np.random.randint(0, 2, size=10),
    'Age': np.random.uniform(0, 80, size=10),
    'SibSp': np.random.randint(0, 5, size=10),
    'Parch': np.random.randint(0, 5, size=10),
    'Fare': np.random.uniform(0, 300, size=10),
    'Embarked': np.random.randint(0, 3, size=10)
})

# Update theta to match the number of features (including the new feature)
theta = np.zeros(8)

random_data['NewFeature'] = np.random.uniform(0, 1, size=10)

random_pred_lr = np.round(sigmoid(np.dot(random_data.values, theta)))

# SVM prediction on random data
random_pred_svm = svm_classifier.predict(random_data)

# Retraining decision tree after adding the new feature
X_random = random_data.drop(columns=['NewFeature'])
y_random = np.zeros(len(random_data))  # Placeholder, since we are generating random data

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_random, y_random)

# Remove 'NewFeature' column before making predictions
random_data.drop(columns=['NewFeature'], inplace=True)
random_pred_tree = tree_classifier.predict(random_data)

print("Logistic Regression predictions on random data:", random_pred_lr)
print("SVM predictions on random data:", random_pred_svm)
print("Decision Tree predictions on random data:", random_pred_tree)


  return 1 / (1 + np.exp(-a))


Logistic Regression predictions on random data: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
SVM predictions on random data: [1 1 1 1 1 1 1 1 1 1]
Decision Tree predictions on random data: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Feature names unseen at fit time:
- NewFeature
Feature names seen at fit time, yet now missing:
- PassengerId

