# From Scratch

### Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np

# Load the Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_df = pd.read_csv(url)

# Display the first few rows of the dataset
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Basic Data Cleaning and Preprocessing

In [2]:
# Fill missing values for 'Age' with the median value
titanic_df['Age'].fillna(titanic_df['Age'].median(), inplace=True)

# Fill missing values for 'Embarked' with the mode value
titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0], inplace=True)

# Drop the 'Cabin' column as it has too many missing values
titanic_df.drop(columns=['Cabin'], inplace=True)

# Convert categorical variables to numeric
titanic_df = pd.get_dummies(titanic_df, columns=['Sex', 'Embarked'], drop_first=True)

# Drop columns that won't be used as features
titanic_df.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)

# Display the first few rows of the cleaned dataset
titanic_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df['Age'].fillna(titanic_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0], inplace=True)


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,True,False,True
1,1,1,38.0,1,0,71.2833,False,False,False
2,1,3,26.0,0,0,7.925,False,False,True
3,1,1,35.0,1,0,53.1,False,False,True
4,0,3,35.0,0,0,8.05,True,False,True


### Define Features (X) and Target Variable (y)

In [3]:
# Define the target variable 'y'
y = titanic_df['Survived'].values

# Define the feature matrix 'X'
X = titanic_df.drop(columns=['Survived']).values

# Display the shapes of X and y
print(X.shape, y.shape)

(891, 8) (891,)


### Data Standardization

In [4]:
# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert X (features) to NumPy array for efficient calculations
X = np.array(X)
y = np.array(y)

### Logistic Regression

In [5]:
class LogisticRegression:
    def __init__(self, learning_rate=0.05, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def fit(self, X, y):
        # Initialize weights and bias
        self.weights = np.zeros(X.shape[1])
        self.bias = 0
        m = X.shape[0]
        
        for _ in range(self.iterations):
            # Linear model
            z = np.dot(X, self.weights) + self.bias
            # Apply sigmoid function
            h = self.sigmoid(z)
            # Compute gradients
            d_weights = (1 / m) * np.dot(X.T, (h - y))
            d_bias = (1 / m) * np.sum(h - y)
            # Update weights and bias
            self.weights -= self.learning_rate * d_weights
            self.bias -= self.learning_rate * d_bias
    
    def predict(self, X):
        # Linear model
        z = np.dot(X, self.weights) + self.bias
        # Apply sigmoid function
        h = self.sigmoid(z)
        # Convert probabilities to binary predictions
        return np.where(h >= 0.5, 1, 0)

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X[:int(0.9*len(X))], y[:int(0.9*len(X))])

### Prediction

In [6]:
# Make predictions
predictions = model.predict(X[int(0.9*len(X)):])

# Evaluate the model
accuracy = np.mean(predictions == y[int(0.9*len(X)):])
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 83.33%


# Using Scikit-Learn

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()

# Train the model
model.fit(X[:int(0.9*len(X))], y[:int(0.9*len(X))])
# Make predictions on the test set
y_pred = model.predict(X[int(0.9*len(X)):])

# Evaluate the model
accuracy = accuracy_score(y[int(0.9*len(X)):], y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 83.33%
