In [25]:
import pandas as pd
import numpy as np
import copy
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [26]:
training_raw_data = pd.read_csv('/Users/kshitiztiwari/ML_Practice/Titanic_NN/spaceship-titanic/train.csv')


In [27]:
training_raw_data.shape

(8693, 14)

In [28]:
training_raw_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [29]:
target_data = training_raw_data['Transported']

In [30]:
training_raw_data = training_raw_data.drop(columns = ['Transported', 'Name', 'PassengerId', 'Cabin'])


In [31]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
training_raw_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [33]:
training_raw_data.shape

(8693, 10)

In [34]:
missing_percentage = (training_raw_data.isnull().sum() / len(training_raw_data))*100
missing_percentage

HomePlanet      2.312205
CryoSleep       2.496261
Destination     2.093639
Age             2.059128
VIP             2.335212
RoomService     2.082135
FoodCourt       2.105142
ShoppingMall    2.392730
Spa             2.105142
VRDeck          2.162660
dtype: float64

In [35]:
## missing Percentages are very low so can be dealt with easily

In [36]:
X_train, X_test, y_train,y_test = train_test_split(training_raw_data, target_data, test_size = 0.2, random_state = 42)

In [37]:
num_cols = list(training_raw_data.select_dtypes(include = 'number').columns)

In [38]:
cat_cols = list(training_raw_data.select_dtypes(include = 'object').columns)
cat_cols

['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

In [39]:
## Lets decide on the performance metrics based on Y column.

In [40]:
target_data.value_counts()

Transported
True     4378
False    4315
Name: count, dtype: int64

In [41]:
## As training data is not skewed we can use Accuracy as the evaluation metrics in this case.

In [42]:
numerical_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())])

categorical_pipeline = Pipeline(steps = [
    ('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown ='ignore'))])

preprocess = ColumnTransformer(transformers = [
    ('num', numerical_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])

# model_pipeline = Pipeline(steps = [
#     ('preprocess',preprocess),
#     ('model',LogisticRegressionCV())
# ])  

preprocess_pipeline = Pipeline( steps = [
    ('preprocess', preprocess)
])

In [43]:
preprocess_pipeline.fit(X_train)
# processed_X_train = preprocess_pipeline.transform(X_train)
# processed_X_test = preprocess_pipeline.transform(X_test)
# processed_X_test.shape

In [729]:
model_pipeline.fit(X_train,y_train)

In [730]:
x_test_pred = model_pipeline.predict(X_test)

In [732]:
accuracy = accuracy_score(y_test,x_test_pred)

In [734]:
print(x_test_pred)

[False  True  True ... False  True False]


In [733]:
print(accuracy)

0.777458309373203


## Let's do this with 1 hidden layered NN and let's check the difference.

In [639]:
X_train.shape
y_train.shape

(6954,)

In [640]:
X_train_new = processed_X_train.T
X_test_new = processed_X_test.T
y_train_reshaped = y_train.to_numpy().reshape(-1, 1)
y_train_renew = y_train_reshaped.T

print(X_train_new.shape)
print(y_train_renew.shape)

(16, 6954)
(1, 6954)


In [641]:
def layer_sizes(X, Y):

    n_x = X.shape[0]
    n_h = 20
    n_y = Y.shape[0]

    return n_x, n_h, n_y

In [642]:
def initialize_parameters(n_x,n_h,n_y):
    W1 = np.random.randn(n_h,n_x) * 0.01
    b1 = np.zeros((n_h,1))
    W2 = np.random.randn(n_y,n_h) * 0.01
    b2 = np.zeros((n_y,1))

    parameters = { 'W1' : W1,
                 'b1' : b1,
                 'W2' : W2,
                 'b2' : b2}
    return parameters

In [643]:
def sigmoid(z):
    value = 1 /(1 + np.exp(-z))
    return value

In [644]:
def forward_propagation(X, parameters):

    # extract initialized parameters first
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']

    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)

    cache = {'Z1': Z1,
             'A1' : A1,
             'Z2' : Z2,
             'A2' : A2}

    return A2, cache 

    
    

In [645]:
def compute_cost(A2, Y):

    m = Y.shape[1]

    logprobs = Y * np.log(A2) + (1 - Y) * np.log(1 - A2)
    cost = -np.sum(logprobs) / m

    cost = float(np.squeeze(cost)) 
    
    return cost
    

In [646]:
def backward_propagation(parameters, cache, X, Y):

    m = X.shape[1]

    # Extract parameters first
    
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']

    # extract forward prop paramteres from cache

    Z1 = cache['Z1']
    A1 = cache['A1']
    Z2 = cache['Z2']
    A2 = cache['A2']

    # Implement backward Prop

    dZ2 = A2 - Y
    dW2 = np.dot(dZ2,A1.T) / m
    db2 = np.sum(dZ2, axis = 1, keepdims = True) / m
    dZ1 = np.dot(W2.T,dZ2) * (1 - np.power(A1, 2))
    dW1 = np.dot(dZ1, X.T)
    db1 =  np.sum(dZ1, axis = 1, keepdims = True) / m

    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}
    
    return grads


In [777]:
def update_parameters(parameters, grads, learning_rate = 1.2):
    W1 = copy.deepcopy(parameters['W1'])
    b1 = parameters['b1']
    W2 = copy.deepcopy(parameters['W2'])
    b2 = parameters['b2']
    
    # YOUR CODE ENDS HERE
    
    # Retrieve each gradient from the dictionary "grads"
    #(≈ 4 lines of code)
    
    # YOUR CODE STARTS HERE
    dW1 = grads['dW1']
    dW2 = grads['dW2']
    db1 = grads['db1']
    db2 = grads['db2']
    # YOUR CODE ENDS HERE
    
    # Update rule for each parameter
    #(≈ 4 lines of code)
    
    # YOUR CODE STARTS HERE
    W1 = W1 - (learning_rate * dW1)
    b1 = b1 - (learning_rate * db1)
    W2 = W2 - (learning_rate * dW2)
    b2 = b2 - (learning_rate * db2)
    
    # YOUR CODE ENDS HERE
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    
    return parameters

In [778]:
def nn_model(X, Y, n_h, num_iterations = 10000, print_cost=False):
    """
    Arguments:
    X -- dataset of shape (16f, number of examples)
    Y -- labels of shape (1, number of examples)
    n_h -- size of the hidden layer
    num_iterations -- Number of iterations in gradient descent loop
    print_cost -- if True, print the cost every 1000 iterations
    
    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """
    
    np.random.seed(3)
    n_x = layer_sizes(X, Y)[0]
    n_y = layer_sizes(X, Y)[2]
    
    # Initialize parameters
    #(≈ 1 line of code)
   
    # YOUR CODE STARTS HERE
    parameters = initialize_parameters(n_x,n_h,n_y)
    
    # YOUR CODE ENDS HERE
    
    # Loop (gradient descent)

    for i in range(0, num_iterations):
         
        
        #(≈ 4 lines of code)
        # Forward propagation. Inputs: "X, parameters". Outputs: "A2, cache".
        A2, cache = forward_propagation(X,parameters)
        
        # Cost function. Inputs: "A2, Y". Outputs: "cost".
        cost = compute_cost(A2,Y)
 
        # Backpropagation. Inputs: "parameters, cache, X, Y". Outputs: "grads".
        grads = backward_propagation(parameters,cache,X,Y)
 
        # Gradient descent parameter update. Inputs: "parameters, grads". Outputs: "parameters".
        parameters = update_parameters(parameters,grads)
        
       
        
        
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE
        
        # Print the cost every 1000 iterations
        if print_cost and i % 1000 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))

    return parameters

In [780]:
def predict(parameters, X):
    """
    Using the learned parameters, predicts a class for each example in X
    
    Arguments:
    parameters -- python dictionary containing your parameters 
    X -- input data of size (n_x, m)
    
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    
    # Computes probabilities using forward propagation, and classifies to 0/1 using 0.5 as the threshold.
    #(≈ 2 lines of code)
    
    # YOUR CODE STARTS HERE
    A2, _ = forward_propagation(X, parameters)
    predictions = (A2 > 0.5)
    
    # YOUR CODE ENDS HERE
    
    return predictions

In [781]:
parameters = nn_model(X_train_new, y_train_renew, n_h = 20, num_iterations = 10000, print_cost=True)

Cost after iteration 0: 0.693060
Cost after iteration 1000: 0.418764
Cost after iteration 2000: 0.413296
Cost after iteration 3000: 0.410476
Cost after iteration 4000: 0.411529
Cost after iteration 5000: 0.408538
Cost after iteration 6000: 0.407899
Cost after iteration 7000: 0.407580
Cost after iteration 8000: 0.407907
Cost after iteration 9000: 0.406933


In [782]:
prediction = predict(parameters,X_test_new)

In [783]:
prediction = prediction.reshape(-1)
prediction.shape

(1739,)

In [784]:
y_test_reshaped = y_test.to_numpy().reshape(-1, 1)
y_test_renew = y_test_reshaped.T

In [785]:
y_test_renew = y_test_renew.reshape(-1)
y_test_renew.shape

(1739,)

In [786]:
accuracy = accuracy_score(y_test_renew, prediction)

In [787]:
print(accuracy)

0.7786083956296722
