In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping  # used for convergence criteria
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [3]:
csv_file = "/content/drive/My Drive/Dataset/HIGGS_train.csv"  # path to csv

# Defining column names, Loading data
column_names = ["outcome"] + ["feature "+str(i) for i in range(1,29)]
df = pd.read_csv(csv_file, header=None, names=column_names)

# converting strings to float and removing rows with nan values. (pre-processing)
df = df.apply(pd.to_numeric, errors='coerce')
df = df.dropna()

  df = pd.read_csv(csv_file, header=None, names=column_names)


In [4]:
train_df, val_df, test_df = np.split(df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))]) #splitting the data

In [5]:
# The code below is used to create a tensorflow dataframe with the training data. Prior to using this dataframe, the RAM would constantly overflow. With this dataframe, however,
# the RAM is far from overflowing. 


train_df, val_df, test_df = np.split(df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))]) #splitting the data

# Creating tensors from the previous partitions, which will be used below to create the tensorflow dataset.
train_labels = tf.constant(train_df["outcome"].values)
train_features = tf.constant(train_df.drop("outcome", axis=1).values)
val_labels = tf.constant(val_df["outcome"].values)
val_features = tf.constant(val_df.drop("outcome", axis=1).values)
test_labels = tf.constant(test_df["outcome"].values)
test_features = tf.constant(test_df.drop("outcome", axis=1).values)

# Create TensorFlow datasets from the data (features are 1 column, train labels are the other column)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

#shuffling and batching the data.
batch_size = 32
train_buffer_size = tf.data.experimental.cardinality(train_dataset).numpy()   # setting buffer of the shuffle function to the maximum number of elements in the respective dataframe
val_buffer_size = tf.data.experimental.cardinality(val_dataset).numpy()
test_buffer_size = tf.data.experimental.cardinality(test_dataset).numpy()

train_dataset = train_dataset.shuffle(buffer_size=train_buffer_size)
train_dataset = train_dataset.batch(batch_size=batch_size)  # in this dataset, every element is a batch, which makes the code memory efficient

val_dataset = val_dataset.shuffle(buffer_size=val_buffer_size)
val_dataset = val_dataset.batch(batch_size=batch_size)

test_dataset = test_dataset.shuffle(buffer_size=test_buffer_size)
test_dataset = test_dataset.batch(batch_size=batch_size)

# prefetching the data that will be used after the current one. 
train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)  #tf.data.AUTOTUNE allows tensorflow to automatically determine the right amount of buffer size for prefetch
val_dataset = val_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [6]:
num_elements = tf.data.experimental.cardinality(train_dataset).numpy()

In [7]:
# Number of batches in the tensor
num_datapoints = len(train_dataset)
print(num_datapoints)

11250


In [8]:
# first model
model_1 = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),


    tf.keras.layers.Dense(1, activation='sigmoid')
])
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

output_model1 = model_1.fit(train_dataset, validation_data=val_dataset, epochs=2,callbacks=[early_stopping])

Epoch 1/2
Epoch 2/2


In [9]:
# testing for accuracy 
loss, accuracy = model_1.evaluate(test_dataset)

print("Test loss:", loss)
print("Test accuracy:", accuracy)


Test loss: 0.5688731074333191
Test accuracy: 0.7002750039100647
