Initial Model - Kidd

In [4]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

# Import our input dataset
import pandas as pd
kidd_df = pd.read_csv("../Resources/kidd_combined.csv")
kidd_df.tail(5)

Unnamed: 0,id,rs3737576,rs7554936,rs2814778,rs798443,rs1876482,rs1834619,rs3827760,rs260690,rs6754311,...,rs11652805,rs2042762,rs7226659,rs3916235,rs4891825,rs7251928,rs310644,rs2024566,gender,superpopulation
2499,902,TT,CT,TT,AA,GA,AG,GG,CC,CC,...,CT,TT,TG,CC,AA,AA,TT,AA,male,EAS
2500,485,TT,TT,CT,AA,GA,AA,AA,CA,CC,...,TT,TT,GG,CC,AA,AC,CT,GG,female,AMR
2501,1064,TT,TT,TT,GA,AA,GG,AA,CA,TT,...,TT,TT,GG,CC,AA,CC,TT,AG,female,SAS
2502,1999,TT,CT,TT,AG,AA,AA,AG,CC,CC,...,TT,TT,TT,CC,AA,AC,TT,AA,female,EAS
2503,1416,TT,TT,TT,GA,GA,GA,AA,AA,CC,...,CT,TT,GG,CC,AA,AA,CC,AA,female,SAS


In [5]:
# Drop categorical data not required
kidd_df = kidd_df.drop(columns=['id', 'gender'])
kidd_df.tail()

Unnamed: 0,rs3737576,rs7554936,rs2814778,rs798443,rs1876482,rs1834619,rs3827760,rs260690,rs6754311,rs10497191,...,rs4471745,rs11652805,rs2042762,rs7226659,rs3916235,rs4891825,rs7251928,rs310644,rs2024566,superpopulation
2499,TT,CT,TT,AA,GA,AG,GG,CC,CC,CC,...,GG,CT,TT,TG,CC,AA,AA,TT,AA,EAS
2500,TT,TT,CT,AA,GA,AA,AA,CA,CC,TC,...,GG,TT,TT,GG,CC,AA,AC,CT,GG,AMR
2501,TT,TT,TT,GA,AA,GG,AA,CA,TT,CC,...,GG,TT,TT,GG,CC,AA,CC,TT,AG,SAS
2502,TT,CT,TT,AG,AA,AA,AG,CC,CC,CC,...,GG,TT,TT,TT,CC,AA,AC,TT,AA,EAS
2503,TT,TT,TT,GA,GA,GA,AA,AA,CC,CC,...,GG,CT,TT,GG,CC,AA,AA,CC,AA,SAS


In [6]:
# Fill NaN with 6
kidd_df.superpopulation = kidd_df.superpopulation.fillna(5)
kidd_df.head()

Unnamed: 0,rs3737576,rs7554936,rs2814778,rs798443,rs1876482,rs1834619,rs3827760,rs260690,rs6754311,rs10497191,...,rs4471745,rs11652805,rs2042762,rs7226659,rs3916235,rs4891825,rs7251928,rs310644,rs2024566,superpopulation
0,CT,TT,TT,GG,AA,AA,AA,AA,CT,CT,...,GG,TC,TT,GT,CC,AA,AA,TT,AA,5
1,TT,CC,CC,GG,GG,GG,AA,AC,CC,TT,...,,CC,TT,GG,TT,GG,CA,CC,AA,5
2,TT,CC,CC,GG,GG,GG,AA,CA,CC,TC,...,GG,CC,TT,GG,TT,,AC,CC,AA,5
3,TT,CC,CC,GG,GG,GG,AA,CA,CC,TT,...,GG,CC,TT,GG,TT,GG,CC,CT,AA,5
4,TT,CC,CC,GG,GG,GG,AA,AC,CC,TT,...,GG,CC,TT,GG,TT,GG,CC,CC,AA,5


In [7]:
# Replace superpopulation alpha label with numeric value
# Reference: https://www.geeksforgeeks.org/how-to-replace-values-in-column-based-on-condition-in-pandas/

kidd_df.loc[ kidd_df["superpopulation"] == "AFR", "superpopulation"] = 0
kidd_df.loc[ kidd_df["superpopulation"] == "AMR", "superpopulation"] = 1
kidd_df.loc[ kidd_df["superpopulation"] == "EAS", "superpopulation"] = 2
kidd_df.loc[ kidd_df["superpopulation"] == "EUR", "superpopulation"] = 3
kidd_df.loc[ kidd_df["superpopulation"] == "SAS", "superpopulation"] = 4
kidd_df.tail(50)

Unnamed: 0,rs3737576,rs7554936,rs2814778,rs798443,rs1876482,rs1834619,rs3827760,rs260690,rs6754311,rs10497191,...,rs4471745,rs11652805,rs2042762,rs7226659,rs3916235,rs4891825,rs7251928,rs310644,rs2024566,superpopulation
2454,TT,CC,CC,GG,GG,GG,AA,AA,CC,TT,...,GG,CC,TT,GG,TT,GG,CC,CC,AA,0
2455,CC,TC,TT,AA,GG,GG,AG,AC,TC,CC,...,GG,TT,TT,GG,CC,AG,AA,TC,GG,1
2456,TT,TT,TT,AG,GG,GG,GA,CC,CT,TC,...,GG,CT,TT,GG,CC,AA,AA,TC,AG,1
2457,TT,TT,TT,AA,AA,AA,GG,CC,CC,CC,...,GG,TT,TT,TG,CC,AA,AA,TT,AA,2
2458,TT,TT,TT,AA,GG,AG,AA,AA,CC,CC,...,GG,TC,TT,GG,CC,AA,AC,TT,GA,4
2459,TT,TT,TT,AA,AG,AG,GG,CC,CC,CC,...,AG,TT,TT,GG,CC,AA,AA,TT,AA,2
2460,TT,CC,CC,GG,GG,GG,AA,AC,CC,TT,...,GG,CC,TT,GG,TT,GG,CC,TC,AA,0
2461,TT,CC,CC,GG,GG,GG,AA,CA,TT,TT,...,GA,TC,TT,GG,CT,AG,CA,CT,AA,0
2462,TC,TT,TT,AG,AA,AA,GG,CC,CC,CC,...,GG,TT,TT,TG,CC,AA,AA,TT,AA,2
2463,TT,TT,TT,AA,AG,GG,AA,CA,TT,CC,...,GG,TC,TT,GG,CC,AA,AC,TT,AG,3


In [8]:
# Set superpopulation to data type 'integer'
# Reference: https://saturncloud.io/blog/how-to-replace-strings-with-numbers-in-python-pandas-dataframe/
kidd_df['superpopulation'] = kidd_df['superpopulation'].astype('int')

In [9]:
# Generate our categorical variable lists
categorical_var_list = kidd_df.dtypes[kidd_df.dtypes == "object"].index.tolist()
categorical_var_list

['rs3737576',
 'rs7554936',
 'rs2814778',
 'rs798443',
 'rs1876482',
 'rs1834619',
 'rs3827760',
 'rs260690',
 'rs6754311',
 'rs10497191',
 'rs12498138',
 'rs4833103',
 'rs1229984',
 'rs3811801',
 'rs7657799',
 'rs870347',
 'rs16891982',
 'rs7722456',
 'rs192655',
 'rs3823159',
 'rs917115',
 'rs1462906',
 'rs6990312',
 'rs2196051',
 'rs1871534',
 'rs3814134',
 'rs4918664',
 'rs174570',
 'rs1079597',
 'rs2238151',
 'rs671',
 'rs7997709',
 'rs1572018',
 'rs2166624',
 'rs7326934',
 'rs9522149',
 'rs200354',
 'rs1800414',
 'rs12913832',
 'rs12439433',
 'rs735480',
 'rs1426654',
 'rs459920',
 'rs4411548',
 'rs2593595',
 'rs17642714',
 'rs4471745',
 'rs11652805',
 'rs2042762',
 'rs7226659',
 'rs3916235',
 'rs4891825',
 'rs7251928',
 'rs310644',
 'rs2024566']

In [10]:
# Check the number of unique values in each column
kidd_df[categorical_var_list].nunique()

rs3737576     4
rs7554936     4
rs2814778     4
rs798443      4
rs1876482     4
rs1834619     4
rs3827760     4
rs260690      4
rs6754311     4
rs10497191    4
rs12498138    4
rs4833103     4
rs1229984     4
rs3811801     4
rs7657799     4
rs870347      4
rs16891982    4
rs7722456     4
rs192655      4
rs3823159     4
rs917115      4
rs1462906     4
rs6990312     4
rs2196051     4
rs1871534     4
rs3814134     4
rs4918664     4
rs174570      4
rs1079597     4
rs2238151     4
rs671         4
rs7997709     4
rs1572018     4
rs2166624     4
rs7326934     4
rs9522149     4
rs200354      4
rs1800414     4
rs12913832    4
rs12439433    4
rs735480      4
rs1426654     4
rs459920      4
rs4411548     4
rs2593595     4
rs17642714    4
rs4471745     4
rs11652805    4
rs2042762     4
rs7226659     4
rs3916235     4
rs4891825     4
rs7251928     4
rs310644      4
rs2024566     4
dtype: int64

In [11]:
# Reference: https://stackoverflow.com/questions/63189787/typeerror-init-got-an-unexpected-keyword-argument-sparse   for issue with 'sparse'
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

# Create a OneHotEncoder instance
enc = OneHotEncoder (sparse_output=False)

# Fit and transform the OneHotEncoder using the categorical variable list
snp_encode_df = pd.DataFrame(enc.fit_transform(kidd_df[categorical_var_list]))

# Reference:https://stackoverflow.com/questions/58756515/onehotencoder-object-has-no-attribute-get-feature-names
# Add the encoded variable names to the dataframe
snp_encode_df.columns = enc.get_feature_names_out(categorical_var_list)
snp_encode_df.head()

Unnamed: 0,rs3737576_CC,rs3737576_CT,rs3737576_TC,rs3737576_TT,rs3737576_nan,rs7554936_CC,rs7554936_CT,rs7554936_TC,rs7554936_TT,rs7554936_nan,...,rs310644_CC,rs310644_CT,rs310644_TC,rs310644_TT,rs310644_nan,rs2024566_AA,rs2024566_AG,rs2024566_GA,rs2024566_GG,rs2024566_nan
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
# Merge one-hot encoded features and drop the originals
encoded_kidd_df = kidd_df.merge(snp_encode_df,left_index=True, right_index=True)
encoded_kidd_df  = encoded_kidd_df.drop(categorical_var_list, axis=1)
encoded_kidd_df.head()

Unnamed: 0,superpopulation,rs3737576_CC,rs3737576_CT,rs3737576_TC,rs3737576_TT,rs3737576_nan,rs7554936_CC,rs7554936_CT,rs7554936_TC,rs7554936_TT,...,rs310644_CC,rs310644_CT,rs310644_TC,rs310644_TT,rs310644_nan,rs2024566_AA,rs2024566_AG,rs2024566_GA,rs2024566_GG,rs2024566_nan
0,5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
# Split our preprocessed data into our features and target arrays

y = encoded_kidd_df["superpopulation"].values
X = encoded_kidd_df.drop(columns=['superpopulation'])


# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [14]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Define the model - deep neural net
number_input_features = 275
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=6, activation="softmax"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Reference: https://stackoverflow.com/questions/63527580/tensorflow-with-keras-sparse-categorical-crossentropy
# Compile the model
nn.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [17]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2355 - loss: 1.8079
Epoch 2/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4523 - loss: 1.4894
Epoch 3/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5018 - loss: 1.3045
Epoch 4/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5542 - loss: 1.1433
Epoch 5/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6176 - loss: 0.9583
Epoch 6/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6664 - loss: 0.8265
Epoch 7/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7071 - loss: 0.7410
Epoch 8/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7433 - loss: 0.6371
Epoch 9/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━

In [18]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

20/20 - 0s - 11ms/step - accuracy: 0.8466 - loss: 0.9908
Loss: 0.9908329248428345, Accuracy: 0.8466453552246094


Optimised Model - Kidd

In [19]:
# Install dependencies
import keras_tuner as kt

In [21]:
# Define number of input features:
number_input_features = len(X_train_scaled[0])

In [22]:
# Define a function to auto-optimise the original nn model
def create_model(hp):
    nn = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=100,
        step=2), activation=activation, input_dim=number_input_features))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    nn.add(tf.keras.layers.Dense(units=6, activation="softmax"))

    # Compile the model
    nn.compile(loss="sparse_categorical_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn

In [23]:
# Run the kerastuner search for best hyperparameters
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

tuner.search(X_train_scaled, y_train, epochs=20, validation_data=(X_test_scaled, y_test))

Trial 60 Complete [00h 00m 10s]
val_accuracy: 0.8083066940307617

Best val_accuracy So Far: 0.9185303449630737
Total elapsed time: 00h 06m 27s


In [24]:
# Get best model hyperparameters
best_hypers = tuner.get_best_hyperparameters(1)[0]
best_hypers.values

{'activation': 'tanh',
 'first_units': 55,
 'num_layers': 1,
 'units_0': 9,
 'units_1': 3,
 'units_2': 1,
 'units_3': 7,
 'units_4': 5,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0012'}

In [25]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  model.build_from_config(
  saveable.load_own_variables(weights_store.get(inner_path))


20/20 - 0s - 20ms/step - accuracy: 0.9185 - loss: 0.3521
Loss: 0.35209617018699646, Accuracy: 0.9185303449630737.
