In [1]:
import numpy as np
import tensorflow as tf

## Using tensorflow to build the linear model

In [2]:
# load the data
npz = np.load('train_data.npz')
train_inputs = npz['inputs']
train_targets = npz['targets']

npz = np.load('validation_data.npz')
validation_inputs = npz['inputs']
validation_targets = npz['targets']

npz = np.load('test_data.npz')
test_inputs = npz['inputs']
test_targets = npz['targets']

In [3]:
# set the input size 
input_size = train_inputs.shape[1]
# set the output size
output_size = 1
# set the number of hidden layer
hidden_layer = 100

model = tf.keras.Sequential([
    #tf.keras.layers.Input(shape=(input_size, )),
    tf.keras.layers.Dense(hidden_layer, activation='relu'),
    tf.keras.layers.Dense(hidden_layer, activation='relu'),
    tf.keras.layers.Dense(output_size)
])

In [4]:
# set the customized optimizer 
customized_optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

# compile the model with optimizer, loss function, and specified metrics
model.compile(optimizer=customized_optimizer, loss='mean_squared_error', metrics=['mean_squared_error'])

In [5]:
# number of iteration
max_epochs = 100

# divide the data into each batch
batch_size = 100

#  allow the model runs in overfitting twice, no more than three
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# run the model
model.fit(
    train_inputs,
    train_targets,
    epochs=max_epochs,
    batch_size=batch_size,
    validation_data=(validation_inputs, validation_targets),
    verbose=2
)

Epoch 1/100
75/75 - 1s - loss: 11623884800.0000 - mean_squared_error: 11623884800.0000 - val_loss: 10960492544.0000 - val_mean_squared_error: 10960492544.0000 - 1s/epoch - 14ms/step
Epoch 2/100
75/75 - 0s - loss: 8002828800.0000 - mean_squared_error: 8002828800.0000 - val_loss: 10444166144.0000 - val_mean_squared_error: 10444167168.0000 - 160ms/epoch - 2ms/step
Epoch 3/100
75/75 - 0s - loss: 6272423424.0000 - mean_squared_error: 6272423424.0000 - val_loss: 8810565632.0000 - val_mean_squared_error: 8810565632.0000 - 152ms/epoch - 2ms/step
Epoch 4/100
75/75 - 0s - loss: 6079068160.0000 - mean_squared_error: 6079068160.0000 - val_loss: 11097838592.0000 - val_mean_squared_error: 11097838592.0000 - 154ms/epoch - 2ms/step
Epoch 5/100
75/75 - 0s - loss: 5692597248.0000 - mean_squared_error: 5692597248.0000 - val_loss: 12928684032.0000 - val_mean_squared_error: 12928684032.0000 - 150ms/epoch - 2ms/step
Epoch 6/100
75/75 - 0s - loss: 5455290368.0000 - mean_squared_error: 5455290368.0000 - val_l

Epoch 47/100
75/75 - 0s - loss: 3869220352.0000 - mean_squared_error: 3869220352.0000 - val_loss: 5138880512.0000 - val_mean_squared_error: 5138880512.0000 - 167ms/epoch - 2ms/step
Epoch 48/100
75/75 - 0s - loss: 3764952064.0000 - mean_squared_error: 3764952064.0000 - val_loss: 6599880704.0000 - val_mean_squared_error: 6599880704.0000 - 180ms/epoch - 2ms/step
Epoch 49/100
75/75 - 0s - loss: 3713096448.0000 - mean_squared_error: 3713095936.0000 - val_loss: 5067849216.0000 - val_mean_squared_error: 5067849216.0000 - 184ms/epoch - 2ms/step
Epoch 50/100
75/75 - 0s - loss: 3613609472.0000 - mean_squared_error: 3613609472.0000 - val_loss: 6712853504.0000 - val_mean_squared_error: 6712853504.0000 - 180ms/epoch - 2ms/step
Epoch 51/100
75/75 - 0s - loss: 3769623040.0000 - mean_squared_error: 3769623040.0000 - val_loss: 6264707072.0000 - val_mean_squared_error: 6264707072.0000 - 180ms/epoch - 2ms/step
Epoch 52/100
75/75 - 0s - loss: 4202514176.0000 - mean_squared_error: 4202514176.0000 - val_los

Epoch 93/100
75/75 - 0s - loss: 3340406016.0000 - mean_squared_error: 3340406016.0000 - val_loss: 5304987648.0000 - val_mean_squared_error: 5304987648.0000 - 180ms/epoch - 2ms/step
Epoch 94/100
75/75 - 0s - loss: 3334019328.0000 - mean_squared_error: 3334019328.0000 - val_loss: 4458900992.0000 - val_mean_squared_error: 4458900992.0000 - 175ms/epoch - 2ms/step
Epoch 95/100
75/75 - 0s - loss: 3310940160.0000 - mean_squared_error: 3310940160.0000 - val_loss: 4995812864.0000 - val_mean_squared_error: 4995812864.0000 - 170ms/epoch - 2ms/step
Epoch 96/100
75/75 - 0s - loss: 3418933760.0000 - mean_squared_error: 3418933760.0000 - val_loss: 7044173312.0000 - val_mean_squared_error: 7044173312.0000 - 175ms/epoch - 2ms/step
Epoch 97/100
75/75 - 0s - loss: 3297005824.0000 - mean_squared_error: 3297005824.0000 - val_loss: 4496911872.0000 - val_mean_squared_error: 4496911872.0000 - 175ms/epoch - 2ms/step
Epoch 98/100
75/75 - 0s - loss: 3549739520.0000 - mean_squared_error: 3549739520.0000 - val_los

<keras.callbacks.History at 0x1f10aa2bdc0>

In [6]:
test_loss, test_error = model.evaluate(test_inputs, test_targets)
print('The test loss is ' + str(test_loss))
print('The test error is ' + str(test_error))

The test loss is 3605077504.0
The test error is 3605077504.0


## Using scikit-learn to build the model

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

### Train data

In [8]:
reg = LinearRegression()
reg.fit(train_inputs, train_targets)
outputs = reg.predict(train_inputs)

In [9]:
rmse = metrics.mean_squared_error(outputs, train_targets, squared=False)
print('The mean squared error in train dataset: ' + str(rmse))

The mean squared error in train dataset: 58942.6721785672


## Validation data

In [10]:
reg = LinearRegression()
reg.fit(validation_inputs, validation_targets)
outputs = reg.predict(validation_inputs)

rmse = metrics.mean_squared_error(outputs, validation_targets, squared=False)
print('The mean squared error in validation dataset: ' + str(rmse))

The mean squared error in validation dataset: 56713.241371586315


### Test Data

In [11]:
reg = LinearRegression()
reg.fit(test_inputs, test_targets)
outputs = reg.predict(test_inputs)

rmse = metrics.mean_squared_error(outputs, test_targets, squared=False)
print('The mean squared error in test dataset: ' + str(rmse))

The mean squared error in test dataset: 57744.47645453902


#### The variables are not quite linear so the mean squared error cannot be lowered to close to 0 and the prediction is not accurate at all