# Neural Network Notebook
Main questions being answered:
1. Can the age of death be predicted from features like gender, occupation, birth year, and associated country?
2. Can the neural network accurately classify the manner of death (e.g., natural causes, accidents) based on the available features?
3. Can any non-linear relationships be discerned between occupation type and age of death or manner of death?
4. Can any outlier cases be identified / predicted, such as individuals who lived significantly longer or shorter than the average life expectancy of their associated country?
5. Can the life expectancy of a country be predicted from the aggregate data of individuals associated with that country?
6. How well does the neural network generalize its predictions to countries or occupations that are underrepresented in the dataset?

In [32]:
# Load relevant packages
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

Read in the preprocessed age data from the csv file.

In [3]:
# Read the preprocessed age dataset from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='preprocessed_age.csv')

### Neural Network Implementation #1  
Code for the neural network using Keras and TensorFlow. It is a simple feedforward network with 2 hidden layers.

In [34]:
def _to_dense_array(matrix):
    """Return *matrix* as a dense ndarray, whether it arrives sparse or dense."""
    # ColumnTransformer may return either a SciPy sparse matrix or a plain
    # ndarray depending on the density of its output, so handle both.
    if hasattr(matrix, 'toarray'):
        matrix = matrix.toarray()
    return np.asarray(matrix)


def run_neural_network(data, features, target, epochs=100, test_size=0.2, random_state=42):
    """
    Define, compile, train, and evaluate a neural network model.

    The features are split into train/test sets, numeric columns
    (int64/float64) are standardized and object columns are one-hot encoded,
    and a feedforward regression network (64 -> 32 -> 1) is trained with the
    Adam optimizer on mean squared error. Diagnostics are printed and the
    loss/MAE curves are plotted as a side effect.

    Parameters:
        data (DataFrame): The dataset.
        features (list): List of feature column names.
        target (str): Target column name.
        epochs (int): Number of training epochs. Default is 100.
        test_size (float): Proportion of data to be used for testing. Default is 0.2.
        random_state (int): Random state for train-test split. Default is 42.

    Returns:
        model: Trained neural network model.
        history: Training history of the model.
    """
    # Data Preparation
    X = data[features]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Preprocessing: scale numeric columns, one-hot encode object columns.
    # Categories unseen during fit are encoded as all zeros at transform time.
    numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ])
    # Fit on the training split only so no test statistics leak into scaling.
    X_train_preprocessed = _to_dense_array(preprocessor.fit_transform(X_train))
    X_test_preprocessed = _to_dense_array(preprocessor.transform(X_test))

    # Wrap the dense arrays in batched tf.data.Dataset objects
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train_preprocessed, y_train)).batch(32)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test_preprocessed, y_test)).batch(32)

    # Define the model. The input width is taken from the preprocessed data
    # (it depends on how many one-hot categories were found) instead of being
    # hard-coded, so the function works for any feature selection.
    n_inputs = X_train_preprocessed.shape[1]
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(n_inputs,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)
    ])

    # Check input data shape
    print(f'Training data shape: {X_train.shape}')
    print(f'Test data shape: {X_test.shape}')

    # Check for NaN or Inf values (both +inf and -inf) in the raw features.
    # NOTE(review): NaNs are only reported here, not imputed or dropped --
    # confirm the preprocessing handles them as intended for these columns.
    print(f'NaN values in training data: {X_train.isna().sum().sum()}')
    print(f'NaN values in test data: {X_test.isna().sum().sum()}')
    train_inf = int(np.isinf(X_train.select_dtypes(include='number')).sum().sum())
    test_inf = int(np.isinf(X_test.select_dtypes(include='number')).sum().sum())
    print(f'Inf values in training data: {train_inf}')
    print(f'Inf values in test data: {test_inf}')

    # Display model architecture
    model.summary()

    # Compile the model
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Train the model using tf.data.Dataset format.
    # NOTE: the test split doubles as validation data, so the reported test
    # metrics are not from data that was fully held out during training.
    history = model.fit(
        train_dataset,
        validation_data=test_dataset,
        epochs=epochs
    )

    # Evaluate the model
    loss, mae = model.evaluate(test_dataset)
    print(f'Mean Absolute Error on test data: {mae}')

    # Visualize the training process: loss on the left, MAE on the right
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Mean Squared Error')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'], label='Training MAE')
    plt.plot(history.history['val_mae'], label='Validation MAE')
    plt.xlabel('Epoch')
    plt.ylabel('Mean Absolute Error')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return model, history

### Neural Network Implementation #2


#### Can the age of death be predicted from features like gender, occupation, birth year, and associated country?

##### Using implementation #1

In [35]:
# Integer-encode the label columns; pd.factorize returns (codes, uniques)
# and only the integer codes are stored on the DataFrame.
for encoded_name, source_name in (
    ('Gender_encoded', 'Gender'),
    ('Occupation_encoded', 'Occupation'),
):
    data[encoded_name] = pd.factorize(data[source_name])[0]

# Predict age of death from the encoded columns plus birth year and country
target = 'Age of death'
features = ['Gender_encoded', 'Occupation_encoded', 'Birth year', 'Associated Countries']
model, history = run_neural_network(data=data, features=features, target=target)

Training data shape: (796829, 4)
Test data shape: (199208, 4)
NaN values in training data: 202485
NaN values in test data: 51020
Inf values in training data: 0
Inf values in test data: 0
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 64)                104576    
                                                                 
 dense_13 (Dense)            (None, 32)                2080      
                                                                 
 dense_14 (Dense)            (None, 1)                 33        
                                                                 
Total params: 106689 (416.75 KB)
Trainable params: 106689 (416.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8

##### Using implementation #2

19


##### Using implementation #3