In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
import ydf

np.set_printoptions(precision=3, suppress=True)

#writes predictions to csv
def write_prediction_csv(method_name, predictions):
    with open(f"{method_name}.csv", 'w') as f:
        for line in predictions:
            f.write(f"{line}\n")

# calculation of mean absolute error, from https://stackoverflow.com/questions/74693070/how-can-i-calculate-the-mae-mean-absolute-error-in-pandas
def mae(y_true, predictions):
    y_true, predictions = np.array(y_true), np.array(predictions)
    return np.mean(np.abs(y_true - predictions)) 

In [None]:
#reading in csvs and dropping irrelevant columns
train_csv = pd.read_csv("train.csv")
test_csv = pd.read_csv("test.csv")
X = train_csv.drop('Tm',axis=1) #drop melting point because it is what we are predicting
X = X.drop("SMILES", axis=1) #dropping SMILES string column because it is not numeric
y = train_csv['Tm']

#creatings training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1)
print(len(X_train),len(X_valid),len(X_test))

# extra dataset prep
train_no_smiles = train_csv.drop("SMILES", axis=1)
train_set = train_csv.sample(frac=0.8, random_state=42)
val_set = train_csv.drop(train_set.index)




1916 213 533


#### Baseline

In [24]:
# guesses a random value in the range of the training dataset melting points, calculates an average of 10 iterations
max_val = max(y_train)
min_val = min(y_train)
range_data = max_val - min_val
baseline_predictions = []
iterations = 10
avg = 0
for i in range(iterations):
    for x in range(len(y_valid)):
        random_val = np.random.random()
        random_val += range_data * random_val + min_val
        baseline_predictions.append(random_val)
        if i == 0:
            write_prediction_csv("baseline", baseline_predictions)
    avg += mae(y_valid, baseline_predictions)
    baseline_predictions = []
avg_over_ten = avg/iterations
avg_over_ten


np.float64(278.16407774055955)

### Linear Regression

In [None]:
#Simple linear regression, adapted from https://www.tensorflow.org/tutorials/keras/regression
melting_point_normalizer = tf.keras.layers.Normalization(axis=-1)
melting_point_normalizer.adapt(np.array(X))

linear_model = tf.keras.Sequential([
    melting_point_normalizer,
    layers.Dense(units=1)
])

linear_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error')

history = linear_model.fit(
    X,
    y,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split = 0.2)

In [23]:
predictions = linear_model.predict(X_test)
print("Mean Absolute Error for linear regression: ",mae(predictions, y_test))
write_prediction_csv("Linear Regression", predictions)

# 1000 epochs Mean Absolute Error for linear regression:  95.91849517822266
# 100 epochs Mean Absolute Error for linear regression:  95.98916625976562

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Mean Absolute Error for linear regression:  154.04845919687776


### Gradient Boosted Trees

In [25]:
# Dropping all those indexes from the dataframe that exists in the train_set
model = ydf.GradientBoostedTreesLearner(task=ydf.Task.REGRESSION, label="Tm", loss="MEAN_AVERAGE_ERROR").train(train_set)
model.evaluate(val_set)
prediction = model.predict(val_set)
actuals = val_set["Tm"]
mae_gradient = mae(actuals,prediction)
write_prediction_csv("GBT", prediction)
print("MAE for gradient boosted trees",mae_gradient)

Train model on 2130 examples
Model trained in 0:00:09.755932
MAE for gradient boosted trees 56.54150070764068


### Neural Network

In [27]:
# modified from https://www.geeksforgeeks.org/deep-learning/implementing-neural-networks-using-tensorflow/

input_shape = [X_train.shape[1]]

input_shape

model = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(units=64, activation='relu',input_shape=input_shape),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1)
])
# model.summary()

model.compile(loss=tf.keras.losses.MeanAbsoluteError,
                optimizer = tf.keras.optimizers.Adam(learning_rate=0.01))

losses = model.fit(X_train, y_train,

                #    validation_data=(X_val, y_val),
                   batch_size=512, 
                   epochs=100,  # total epoch
                
                   )


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - loss: 276.0394
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 234.1460
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 113.7649 
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 72.9704
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 64.1915
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 60.5794
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 49.6396
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 43.9785
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 40.8581
Epoch 10/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 38.9570
Epoch 1

In [28]:
nn_predictions = model.predict(X_test)
mae_nn = mae(y_test,nn_predictions)
print("MAE for simple neural network", mae_nn)
write_prediction_csv("neuralnetwork", nn_predictions)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
MAE for simple neural network 90.4112337692975


In [None]:
test_no_smiles = test_csv.drop('SMILES', axis=1)

test_dataset = tf.convert_to_tensor(test_no_smiles)
baseline_predictions = model.predict(test_dataset)
test_np = np.array(test_dataset)
print()
ids_preds = list(zip(test_np[:,0], list(baseline_predictions)))
print(ids_preds)
with open('submission.csv', 'w', newline='') as csvfile:
    csvfile.write("id,Tm\n")
    for tup in ids_preds:
        csvfile.write(f"{tup[0]},{tup[1][0]}\n")
