In [17]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
np.set_printoptions(threshold=np.inf)

In [18]:
#Load simple encoded data
X_simple_test = np.load('../../training_sets/Xtest_simple_set_data.npy', allow_pickle=True)
X_simple_train = np.load('../../training_sets/Xtrain_simple_set_data.npy', allow_pickle=True)
X_simple_val = np.load('../../training_sets/Xvalidate_simple_set_data.npy', allow_pickle=True)

Y_simple_test = np.load('../../training_sets/Ytest_simple_set_data.npy', allow_pickle=True)
Y_simple_train = np.load('../../training_sets/Ytrain_simple_set_data.npy', allow_pickle=True)
Y_simple_val = np.load('../../training_sets/Yvalidate_simple_set_data.npy', allow_pickle=True)

In [19]:
# Train simple encoded features
reg = LinearRegression().fit(X_simple_train, Y_simple_train)
print(reg.score(X_simple_train, Y_simple_train))


0.159990826667878


In [20]:
# Make predictions using the training set
y_pred = reg.predict(X_simple_train)
print("Mean squared error train: %.2f" % mean_squared_error(Y_simple_train, y_pred))
print("Mean absolute error train: %.2f" % mean_absolute_error(Y_simple_train, y_pred))

# Make predictions using the testing set
y_pred = reg.predict(X_simple_test)
print("Mean squared error test: %.2f" % mean_squared_error(Y_simple_test, y_pred))
print("Mean absolute error test: %.2f" % mean_absolute_error(Y_simple_test, y_pred))

Mean squared error train: 39858.23
Mean absolute error train: 142.39
Mean squared error test: 29902.91
Mean absolute error test: 125.67


In [21]:
#Load binary encoded data
X_binary_test = np.load('../../training_sets/Xtest_binary_set_data.npy', allow_pickle=True)
X_binary_train = np.load('../../training_sets/Xtrain_binary_set_data.npy', allow_pickle=True)
X_binary_val = np.load('../../training_sets/Xvalidate_binary_set_data.npy', allow_pickle=True)

Y_binary_test = np.load('../../training_sets/Ytest_binary_set_data.npy', allow_pickle=True)
Y_binary_train = np.load('../../training_sets/Ytrain_binary_set_data.npy', allow_pickle=True)
Y_binary_val = np.load('../../training_sets/Yvalidate_binary_set_data.npy', allow_pickle=True)

In [22]:
# Train binary encoded features
reg = LinearRegression().fit(X_binary_train, Y_binary_train)
binary_reg = reg
print(reg.score(X_binary_train, Y_binary_train))

0.22326590169842675


In [23]:
# Make predictions using the training set
y_pred = reg.predict(X_binary_train)
print("Mean squared error train: %.2f" % mean_squared_error(Y_binary_train, y_pred))
print("Mean absolute error train: %.2f" % mean_absolute_error(Y_binary_train, y_pred))

# Make predictions using the testing set
y_pred = reg.predict(X_binary_test)
print("Mean squared error test: %.2f" % mean_squared_error(Y_binary_test, y_pred))
print("Mean absolute error test: %.2f" % mean_absolute_error(Y_binary_test, y_pred))

Mean squared error train: 36855.84
Mean absolute error train: 138.06
Mean squared error test: 30849.09
Mean absolute error test: 129.88


In [24]:
# Load positionally encoded data
# This shouldn't be much different from the binary regression, just for fun though
X_positional_test = np.load('../../training_sets/Xtest_positional_set_data.npy', allow_pickle=True)
X_positional_train = np.load('../../training_sets/Xtrain_positional_set_data.npy', allow_pickle=True)
X_positional_val = np.load('../../training_sets/Xvalidate_positional_set_data.npy', allow_pickle=True)

Y_positional_test = np.load('../../training_sets/Ytest_positional_set_data.npy', allow_pickle=True)
Y_positional_train = np.load('../../training_sets/Ytrain_positional_set_data.npy', allow_pickle=True)
Y_positional_val = np.load('../../training_sets/Yvalidate_positional_set_data.npy', allow_pickle=True)

In [25]:
# Train positional encoded features
reg = LinearRegression().fit(X_positional_train, Y_positional_train)
print(reg.score(X_positional_train, Y_positional_train))

0.22326625773956232


In [26]:
# Make predictions using the training set
y_pred = reg.predict(X_positional_train)
print("Mean squared error train: %.2f" % mean_squared_error(Y_positional_train, y_pred))
print("Mean absolute error train: %.2f" % mean_absolute_error(Y_positional_train, y_pred))

# Make predictions using the testing set
y_pred = reg.predict(X_positional_test)
print("Mean squared error test: %.2f" % mean_squared_error(Y_positional_test, y_pred))
print("Mean absolute error test: %.2f" % mean_absolute_error(Y_positional_test, y_pred))

Mean squared error train: 36855.82
Mean absolute error train: 138.05
Mean squared error test: 30850.44
Mean absolute error test: 129.87


In [28]:
# Analyze the weights
# https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
weights = binary_reg.coef_
ind = np.argpartition(weights, -20)[-20:]
top_20 = weights[ind]
print(ind)
print(top_20)

[160 165 145 147 157 153 142 143 158   5  15  16  17  20  18  19  21  22
  23  24]
[2.36953732e+12 2.36953732e+12 2.36953732e+12 2.36953732e+12
 2.36953732e+12 2.36953733e+12 2.36953733e+12 2.36953733e+12
 2.36953733e+12 4.79727607e+12 4.84304721e+12 4.84304721e+12
 4.84304721e+12 4.84304721e+12 4.84304721e+12 4.84304721e+12
 4.84304721e+12 4.84304721e+12 4.84304721e+12 4.84304721e+12]
