# Chan Kai Yang
# 7372711

## Importing Libraries

In [1]:
#import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

## Pre-Processing
### Reading data file

In [2]:
# Load the training data from train.csv (path is relative to the working directory)
dataframe = pd.read_csv('train.csv')
# Raw NumPy array of the loaded table
# NOTE(review): `dataset` does not appear to be used by any later cell shown here - confirm it is still needed
dataset = dataframe.values

### Understanding data file

In [3]:
# Work on a copy so that `dataframe` keeps the data exactly as it was loaded
df = dataframe.copy()

In [4]:
# Check the non-null count (i.e. missing values) and the dtype of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21263 entries, 0 to 21262
Data columns (total 82 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   number_of_elements               21263 non-null  int64  
 1   mean_atomic_mass                 21263 non-null  float64
 2   wtd_mean_atomic_mass             21263 non-null  float64
 3   gmean_atomic_mass                21263 non-null  float64
 4   wtd_gmean_atomic_mass            21263 non-null  float64
 5   entropy_atomic_mass              21263 non-null  float64
 6   wtd_entropy_atomic_mass          21263 non-null  float64
 7   range_atomic_mass                21263 non-null  float64
 8   wtd_range_atomic_mass            21263 non-null  float64
 9   std_atomic_mass                  21263 non-null  float64
 10  wtd_std_atomic_mass              21263 non-null  float64
 11  mean_fie                         21263 non-null  float64
 12  wtd_mean_fie      

In [5]:
# Preview the first 10 rows of the dataset
df.head(10)

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0
5,4,88.944468,57.795044,66.361592,36.098926,1.181795,1.225203,122.90607,20.687458,51.968828,...,2.214286,2.213364,2.181543,1.368922,1.141474,1,1.0,0.433013,0.410326,23.0
6,4,88.944468,57.682296,66.361592,36.06947,1.181795,1.316857,122.90607,10.765639,51.968828,...,2.142857,2.213364,2.119268,1.368922,1.194453,1,0.857143,0.433013,0.349927,11.0
7,4,76.517718,57.175142,59.310096,35.891368,1.197273,0.94356,122.90607,36.451199,44.289459,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,33.0
8,4,76.517718,56.808817,59.310096,35.773432,1.197273,0.98188,122.90607,34.83316,44.289459,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,36.0
9,4,76.517718,56.442492,59.310096,35.655884,1.197273,1.016495,122.90607,33.215121,44.289459,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,31.0


In [6]:
# Count the null values in each column
df.isnull().sum()

number_of_elements       0
mean_atomic_mass         0
wtd_mean_atomic_mass     0
gmean_atomic_mass        0
wtd_gmean_atomic_mass    0
                        ..
range_Valence            0
wtd_range_Valence        0
std_Valence              0
wtd_std_Valence          0
critical_temp            0
Length: 82, dtype: int64

### Split label and features

In [7]:
# Separate the regression target from the predictors
# y: the label column (critical_temp)
y = df['critical_temp']

# X: every remaining column, i.e. df without the label column
X = df.drop(columns=['critical_temp'])

In [8]:
# Split between training (2/3) and testing set (1/3 (~0.33))
# random_state is fixed so that the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Scale dataset between the range 0 and 1 

In [9]:
# Min-max scale the features into the range 0 to 1.
# The scaler learns each column's minimum and maximum from the training split only;
# X_train is then replaced by its scaled values.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [10]:
# Scale the testing features with the scaler fitted on the training split (no re-fitting here)
X_test = scaler.transform(X_test)

In [11]:
# Wrap the scaled training features in a DataFrame that carries the original
# feature names, so the scaled values can be displayed with their column labels
X_train_scaled_df = pd.DataFrame(X_train, columns=X.columns)

X_train_scaled_df

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,mean_Valence,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence
0,0.125,0.229878,0.229878,0.125065,0.131789,0.165254,0.167414,0.409424,0.207085,0.421447,...,0.583333,0.583333,0.540440,0.540440,0.297164,0.326461,0.500000,0.214525,0.500000,0.500000
1,0.250,0.311468,0.239874,0.193779,0.135337,0.399711,0.452215,0.610152,0.139701,0.518749,...,0.333333,0.375000,0.314083,0.355028,0.495273,0.474489,0.333333,0.214525,0.272166,0.276385
2,0.500,0.333057,0.225435,0.265167,0.149958,0.732810,0.669710,0.583383,0.105098,0.390477,...,0.200000,0.179936,0.194824,0.177603,0.744255,0.635537,0.166667,0.148254,0.133333,0.090234
3,0.500,0.463315,0.397119,0.339083,0.325351,0.705274,0.673195,0.870150,0.179730,0.615056,...,0.300000,0.211905,0.267529,0.200691,0.715128,0.775541,0.500000,0.079681,0.388730,0.224820
4,0.750,0.536442,0.399692,0.379180,0.308564,0.858843,0.807644,0.927916,0.154856,0.756513,...,0.380952,0.263333,0.324119,0.231763,0.857163,0.876377,0.666667,0.105832,0.525970,0.392088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14241,0.125,0.092567,0.091110,0.092394,0.097932,0.348717,0.346577,0.012870,0.020931,0.013248,...,0.250000,0.231667,0.241582,0.223773,0.314203,0.355395,0.166667,0.007151,0.166667,0.162583
14242,0.375,0.678585,0.599257,0.634915,0.565976,0.668261,0.673101,0.491304,0.158060,0.491298,...,0.791667,0.805556,0.788776,0.803403,0.645842,0.628102,0.166667,0.309869,0.144338,0.124226
14243,0.500,0.438371,0.247656,0.328481,0.154600,0.715555,0.757891,0.755170,0.070453,0.545215,...,0.200000,0.179739,0.194824,0.177437,0.744255,0.660332,0.166667,0.142456,0.133333,0.089616
14244,0.375,0.423503,0.267374,0.302364,0.154828,0.590804,0.508309,0.646050,0.178683,0.549057,...,0.291667,0.217857,0.272025,0.209922,0.626581,0.562149,0.333333,0.151189,0.276385,0.168544


In [12]:
# Wrap the scaled testing features in a DataFrame that carries the original
# feature names, so the scaled values can be displayed with their column labels
X_test_scaled_df = pd.DataFrame(X_test, columns=X.columns)
X_test_scaled_df

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,mean_Valence,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence
0,0.125,0.375309,0.400399,0.372224,0.402663,0.345613,0.260226,0.097496,0.250607,0.100359,...,0.583333,0.625000,0.578689,0.621451,0.320716,0.263960,0.166667,0.393295,0.166667,0.144338
1,0.375,0.344010,0.368818,0.259433,0.308704,0.604669,0.565921,0.583383,0.179730,0.433810,...,0.208333,0.190476,0.202227,0.186545,0.639097,0.671642,0.166667,0.081724,0.144338,0.116642
2,0.500,0.405840,0.218519,0.313889,0.144436,0.728817,0.778822,0.590973,0.050774,0.460131,...,0.233333,0.185780,0.225360,0.182532,0.741855,0.674033,0.166667,0.138424,0.163299,0.106211
3,0.375,0.344372,0.243559,0.259202,0.148257,0.603526,0.532188,0.590973,0.155264,0.438424,...,0.208333,0.208571,0.202227,0.202441,0.639097,0.553333,0.166667,0.153641,0.144338,0.144611
4,0.250,0.483408,0.408920,0.469111,0.408169,0.539499,0.482219,0.288238,0.163147,0.249705,...,0.666667,0.801874,0.627034,0.790659,0.492502,0.412358,0.500000,0.432496,0.471405,0.242816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7012,0.125,0.385493,0.301637,0.350541,0.282723,0.308043,0.349930,0.325853,0.041484,0.335422,...,0.583333,0.625000,0.578689,0.621451,0.320716,0.263960,0.166667,0.393295,0.166667,0.144338
7013,0.500,0.393203,0.246752,0.276621,0.153950,0.683048,0.700150,0.764368,0.102283,0.594964,...,0.200000,0.177182,0.194824,0.175304,0.744255,0.641686,0.166667,0.146175,0.133333,0.081042
7014,0.250,0.043397,0.042605,0.038246,0.044633,0.518020,0.507312,0.064884,0.027723,0.060365,...,0.333333,0.311111,0.314083,0.295992,0.495273,0.536171,0.333333,0.104879,0.272166,0.239341
7015,0.500,0.375665,0.240847,0.270051,0.150998,0.692098,0.692574,0.679179,0.101264,0.545333,...,0.200000,0.178205,0.194824,0.176156,0.744255,0.628352,0.166667,0.151818,0.133333,0.084615


In [13]:
# Min-max scale the values of y_train (target) into the range 0 to 1
# (this is normalisation with MinMaxScaler, not standardisation)
# NOTE(review): this re-fits the same `scaler` object that was fitted on the features earlier,
# so from this cell onwards `scaler` describes the target and can no longer transform the
# feature columns. A dedicated scaler for the target would keep both available - confirm intent.
scaler.fit(y_train.values.reshape(-1,1))
y_train = scaler.transform(y_train.values.reshape(-1,1))
# Flatten the (n, 1) result back to 1-D and wrap it in a Series with a fresh 0..n-1 index
y_train = y_train.reshape(-1)
y_train = pd.Series(y_train)
y_train

0        0.003495
1        0.019999
2        0.167831
3        0.615384
4        0.755244
           ...   
14241    0.010488
14242    0.040698
14243    0.419580
14244    0.104894
14245    0.004894
Length: 14246, dtype: float64

In [14]:
# Min-max scale the values of y_test (target) using the scaler as fitted on y_train
# (the scaler is only applied here, not re-fitted)
y_test = scaler.transform(y_test.values.reshape(-1,1))
# Flatten the (n, 1) result back to 1-D and wrap it in a Series with a fresh 0..n-1 index
y_test = y_test.reshape(-1)
y_test = pd.Series(y_test)
y_test

0       0.044754
1       0.637762
2       0.265733
3       0.132866
4       0.076922
          ...   
7012    0.026572
7013    0.545454
7014    0.136013
7015    0.561538
7016    0.183216
Length: 7017, dtype: float64

### Creating an MLP using the Sequential API 

In [15]:
# Defaults: n_hidden = 2, n_neurons = 15 in every layer except the output layer,
# input_shape = (81,) for the 81 feature columns, l1 (a) = 0.01, l2 (b) = 0.01
def build_model(n_hidden = 2, n_neurons = 15, input_shape = (81,),a = 0.01,b=0.01):
    """Build and compile a fully connected regression network.

    Layer stack, in order:
      1. one Dense layer (n_neurons units, ReLU) that receives the input and
         has no kernel regularizer;
      2. n_hidden Dense layers (n_neurons units, ReLU), each with its own
         L1/L2 kernel regularizer;
      3. one Dense output layer with a single unit and no activation.

    The network therefore has n_hidden + 1 ReLU layers before the output layer.

    Parameters:
        n_hidden: number of regularized Dense layers added after the first layer.
        n_neurons: number of units in every layer except the output layer.
        input_shape: shape of one input sample, passed to the first layer.
        a: L1 coefficient of the kernel regularizer.
        b: L2 coefficient of the kernel regularizer.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer and 'mse' loss.
    """
    relu = 'relu'

    # First layer: declares the input shape; no regularization is applied to it
    stack = [keras.layers.Dense(n_neurons, activation=relu, input_shape=input_shape)]

    # Regularized hidden layers; a new regularizer object is created for each layer
    stack.extend(
        keras.layers.Dense(
            n_neurons,
            activation=relu,
            kernel_regularizer=keras.regularizers.l1_l2(l1=a, l2=b),
        )
        for _ in range(n_hidden)
    )

    # Output layer: a single unit that emits the predicted value
    stack.append(keras.layers.Dense(1))

    model = keras.models.Sequential(stack)

    # Train with Adaptive Moment Estimation (adam) on the mean squared error loss
    model.compile(loss='mse', optimizer='adam')
    return model

# Wrap build_model in a scikit-learn style regressor so it can be used with sklearn tools
# NOTE(review): keras.wrappers.scikit_learn is deprecated and is not available in recent
# TensorFlow/Keras releases (SciKeras is the suggested replacement) - confirm the installed
# version before re-running this notebook.
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)

  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)


n_hidden (default value: 2): Represents the number of hidden layers to be added to the model.

n_neurons (default value: 15): Integer that represents the number of neurons in each layer except output layer.

input_shape (default value: (81,)): 81 columns in dataset.

a and b (default value: 0.01): float values that specify the coefficients for L1 and L2 regularization respectively.

The model takes an input of 81 features, which feeds a first dense layer with n_neurons neurons and the ReLU activation function (this first layer is not regularized). A loop then adds n_hidden further dense layers, each with n_neurons neurons and a ReLU activation function, with L1 and L2 regularization applied through the kernel_regularizer argument. The network therefore contains n_hidden + 1 hidden layers in total.

Finally, an output layer with a single neuron is added to the model, which will output a single value representing the predicted output of the model (critical_temp). The loss function used for training is mean squared error (MSE) and the optimizer used is Adaptive Moment Estimation (adam).

I then create a KerasRegressor class wrapper and assign it to keras_reg.

In [16]:
# Fit the wrapped model (build_model with its default arguments) on the scaled training split
# with epochs = 50
keras_reg.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x26beef64a00>

Setting the number of epochs to 50 means that the learning algorithm will iterate over the entire training dataset 50 times while training the ANN model. The model updates its weights after each batch of records, based on the error between the predicted and actual output, and the process continues until the specified number of epochs has been completed.

In [17]:
# Baseline model on the training split: report the loss, then the predictions
train_loss = keras_reg.model.evaluate(X_train, y_train)
print (train_loss)
train_predictions = keras_reg.predict(X_train)
print (train_predictions)

0.01950385421514511
[0.02170607 0.02170703 0.38643494 ... 0.46622455 0.13898206 0.02170607]


Before fine-tuning, the ANN model with parameters 'n_hidden': 2, 'n_neurons': 15, 'a': 0.01, 'b': 0.01 has a
loss of 0.0195 on the training set. Hence, the model is able to predict critical_temp with a relatively small mean squared error on the training set. The loss function used in the model measures the difference between the predicted and actual values of critical_temp, and a lower loss value indicates better performance of the model.

In [18]:
# Baseline model on the testing split: report the loss, then the predictions
test_loss = keras_reg.model.evaluate(X_test, y_test)
print (test_loss)
test_predictions = keras_reg.predict(X_test)
print (test_predictions)

0.019366053864359856
[0.02170584 0.4938455  0.35725582 ... 0.11517307 0.44203895 0.17427355]


Before fine-tuning, the ANN model with parameters 'n_hidden': 2, 'n_neurons': 15, 'a': 0.01, 'b': 0.01 has a
loss of 0.0194 on the testing set. Hence, the model is able to predict critical_temp with a relatively small mean squared error on the testing set. The loss function used in the model measures the difference between the predicted and actual values of critical_temp, and a lower loss value indicates better performance of the model.

In [19]:
# Fine-Tuning Neural Network Hyperparameters for the model
# (GridSearchCV is imported together with the other libraries in the import cell at the top)
# The grid below yields 3 * 2 * 2 * 2 = 24 parameter combinations.
param_distribs = {
    'n_hidden': [2,3,4],      # number of regularized hidden layers added by the loop in build_model
    'n_neurons': [10, 20],    # neurons in every layer except the output layer
    'a': [0.05, 0.001],       # L1 regularization coefficient
    'b': [0.05, 0.001]        # L2 regularization coefficient
}

The grid search looks for the best value, among the candidates specified above, of each of the following hyperparameters:

'n_hidden': number of hidden layers, 

'n_neurons': number of neurons in each layer, 

'a', 'b': the L1 and L2 regularization coefficients.

In [20]:
# Exhaustive grid search over param_distribs with 5-fold cross-validation
# (numpy is already imported as np in the import cell at the top, so it is not re-imported here)
kr_search_cv = GridSearchCV(keras_reg, param_distribs, cv = 5)

# To minimize the loss. We set the metric to be monitored to 'loss':
# each fit runs for at most 50 epochs and stops early once the training loss
# has not improved for 20 consecutive epochs.
kr_search_cv.fit(X_train, y_train, epochs=50,
              callbacks=[keras.callbacks.EarlyStopping(patience=20, monitor="loss")])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 1

GridSearchCV(cv=5,
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x0000026BED663910>,
             param_grid={'a': [0.05, 0.001], 'b': [0.05, 0.001],
                         'n_hidden': [2, 3, 4], 'n_neurons': [10, 20]})

By performing a grid search, the function will create a combination of all possible values of the hyperparameters and evaluate the ANN model with each combination using a cross-validation strategy. The function will return the best combination of hyperparameters based on the evaluation metric specified.

Setting the number of epochs to 50 means that the learning algorithm will iterate over the entire training dataset up to 50 times while training the ANN model. The model updates its weights after each batch of records, based on the error between the predicted and actual output, and the process continues until the specified number of epochs has been completed, unless early stopping ends the run sooner.

In the early stopping criteria, patience=20 means that the training process will stop if the loss value does not improve for 20 consecutive epochs.

In [21]:
# Mean cross-validated score of the best parameter combination.
# The value is negative; presumably the wrapper's score() is the negated loss so that
# higher is better - TODO confirm against the KerasRegressor documentation.
kr_search_cv.best_score_

-0.014687004871666432

In [22]:
# Parameter combination that achieved the best cross-validated score
kr_search_cv.best_params_

{'a': 0.001, 'b': 0.001, 'n_hidden': 2, 'n_neurons': 20}

Best parameters for the ANN model that gives the lowest loss value.

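n_hidden = 2, indicating that the loop in build_model adds 2 regularized hidden layers; together with the first dense layer, the best model has 3 hidden layers (see the model summary below).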

n_neurons = 20, indicating that each hidden layer has 20 neurons.

a = 0.001, indicating that L1 regularization with a parameter of 0.001 is used for the hidden layers.

b = 0.001, indicating that L2 regularization with a parameter of 0.001 is used for the hidden layers.

These hyperparameters were the best selected by GridSearchCV in minimizing the MSE loss function on the dataset.

It is observed that, for the same number of hidden layers, more neurons (20) and weaker L1 and L2 regularization give a lower loss value. More neurons per layer increase the model's capacity to capture complex patterns such as non-linear relationships and interactions between different features. Reducing the L1 and L2 regularization also allows the model to fit the training data more closely and improve its accuracy.

In [23]:
# Get the underlying Keras model of the best estimator found by the grid search
# and print its layer-by-layer summary
best_keras_model = kr_search_cv.best_estimator_.model
best_keras_model.summary()

Model: "sequential_121"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_604 (Dense)           (None, 20)                1640      
                                                                 
 dense_605 (Dense)           (None, 20)                420       
                                                                 
 dense_606 (Dense)           (None, 20)                420       
                                                                 
 dense_607 (Dense)           (None, 1)                 21        
                                                                 
Total params: 2,501
Trainable params: 2,501
Non-trainable params: 0
_________________________________________________________________


The model has 4 layers. The first is a fully connected dense layer with 20 neurons and 1,640 parameters: each neuron in this layer is connected to all 81 input features and has one bias term, so the number of parameters in the first layer is 81 * 20 (neurons) + 20 (biases) = 1,640.

The next two layers each have 20 neurons and 420 parameters, and each neuron is connected to all the neurons in the previous layer, plus a bias term per neuron. Hence, the number of parameters in each of these layers is 20 * 20 (neurons) + 20 (biases) = 420.

Finally, the output layer has a single neuron, which is the output of the model. It is fully connected to the previous layer with 20 neurons, plus a bias term. Hence, the number of parameters in the output layer is 20 * 1 (neuron) + 1 (bias) = 21.

Adding up the parameters in all layers, we get a total of 1,640 + 420 + 420 + 21 = 2,501 trainable parameters in the model.

In [24]:
# Tuned model on the training split: report the loss, then the predictions
tuned_train_loss = best_keras_model.evaluate(np.array(X_train), np.array(y_train))
print (tuned_train_loss)
tuned_train_predictions = best_keras_model.predict(X_train)
print (tuned_train_predictions)

0.014234600588679314
[[0.05628663]
 [0.04379757]
 [0.43008348]
 ...
 [0.51349103]
 [0.19925742]
 [0.04006408]]


After fine-tuning, the ANN model with parameters {'a': 0.001, 'b': 0.001, 'n_hidden': 2, 'n_neurons': 20} has a
loss of 0.0142 on the training set. Hence, the model is able to predict critical_temp with a relatively small mean squared error on the training set.

After fine-tuning, the loss value is lower, which suggests that the predictions made by the tuned model are more accurate and closer to the true values. Note that the loss reported by Keras also includes the L1/L2 penalty terms, so part of the decrease comes from the smaller regularization coefficients and not only from a lower prediction error.

In [25]:
# Tuned model on the testing split: report the loss, then the predictions
tuned_test_loss = best_keras_model.evaluate(np.array(X_test), np.array(y_test))
print (tuned_test_loss)
tuned_test_predictions = best_keras_model.predict(X_test)
print (tuned_test_predictions)

0.01429026573896408
[[0.03536223]
 [0.5530058 ]
 [0.3868146 ]
 ...
 [0.16368607]
 [0.5030649 ]
 [0.18053827]]


After fine-tuning, the ANN model with parameters {'a': 0.001, 'b': 0.001, 'n_hidden': 2, 'n_neurons': 20} has a
loss of 0.0143 on the testing set. Hence, the model is able to predict critical_temp with a relatively small mean squared error on the testing set.

After fine-tuning, the loss value is lower, which suggests that the predictions made by the tuned model are more accurate and closer to the true values. Note that the loss reported by Keras also includes the L1/L2 penalty terms, so part of the decrease comes from the smaller regularization coefficients and not only from a lower prediction error.