In [None]:
# imports
import matplotlib.pyplot as plt
import numpy
import pandas
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from matplotlib import pyplot
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# load data
!curl -L -o table.csv "https://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv"
df = pandas.read_csv("table.csv")
df.describe()

In [None]:
class data_preproccessing:
  """Data preprocessing class: includes different methods for data cleaning,
  feature engineering and feature scaling.

  The instance works on its own copy of the frame, held in ``self.dataFrame``;
  every method reads and updates that copy, so the frame passed to the
  constructor is never modified.

  Several methods require specific columns ('Price', 'Date', 'YearBuilt',
  'Postcode', 'Lattitude', 'Longtitude', 'Landsize') and raise ``KeyError``
  when the column they need is absent.
  """

  def __init__(self, data):
    """Keep a private copy of ``data`` (a pandas DataFrame)."""
    # Copy so that in-place column assignments below never leak into the caller's frame.
    self.dataFrame = data.copy()
    # Labels removed by seprate_labels(); cached so repeated splits keep working.
    self._labels = None

  def seprate_labels(self):
    """Remove the 'Price' column from the frame and return it as the labels.

    The labels are cached: calling this again after 'Price' has already been
    removed returns the same Series instead of failing.

    Raises:
      KeyError: if the frame has no 'Price' column and none was separated before.
    """
    if 'Price' in self.dataFrame.columns:
      self._labels = self.dataFrame['Price']
      self.dataFrame = self.dataFrame.drop('Price', axis=1)
    elif self._labels is None:
      raise KeyError('Price')
    return self._labels

  def redundant_feature(self, features):
    """Drop the column(s) named by ``features`` (a label or list of labels)."""
    self.dataFrame = self.dataFrame.drop(features, axis=1)

  def split_data(self, test_size=0.20, random_state=None):
    """Split the data into training and test sets.

    Args:
      test_size: fraction of rows held out for the test set.
      random_state: seed forwarded to ``train_test_split``; pass an int for a
        reproducible split.

    Returns:
      ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Separate the labels FIRST: the features must be read only after 'Price'
    # has been dropped, otherwise the target leaks into the feature matrix.
    labels = self.seprate_labels()
    X_train, X_test, Y_train, Y_test = train_test_split(
        self.dataFrame, labels, test_size=test_size, random_state=random_state)
    return X_train, X_test, Y_train, Y_test

  def missing_values(self):
    """Impute missing values and print the number of missing values per column.

    Object-dtype columns get the new level 'U'; every other column is filled
    with its own mean.
    """
    strings = self.dataFrame.select_dtypes(include=['object']).columns
    nonstrings = self.dataFrame.select_dtypes(exclude=['object']).columns
    for feature in strings:
      print(feature, ': # of missing values:', self.dataFrame[feature].isnull().sum())
      # fillna returns a new Series; it has to be assigned back to take effect.
      self.dataFrame[feature] = self.dataFrame[feature].fillna('U')
    for feature in nonstrings:
      print(feature, ': # of missing values:', self.dataFrame[feature].isnull().sum())
      self.dataFrame[feature] = self.dataFrame[feature].fillna(self.dataFrame[feature].mean())

  def one_hot_encoding(self, nomial_features):
    """Replace each nominal feature with one-hot columns named '<feature>_<level>'."""
    for feature in nomial_features:
      one_hot = pandas.get_dummies(self.dataFrame[feature], prefix=feature)
      self.dataFrame = self.dataFrame.drop(feature, axis=1).join(one_hot)

  def conver_to_num(self, nomial_features):
    """Convert each listed feature to integer category codes (missing values become -1)."""
    for feature in nomial_features:
      self.dataFrame[feature] = self.dataFrame[feature].astype('category').cat.codes

  def hanlde_Date(self):
    """Parse 'Date', derive 'Age', and encode both as category codes.

    'Age' is the year of 'Date' minus 'YearBuilt'. 'YearBuilt' is dropped
    afterwards, and both 'Date' and 'Age' are stored as category codes, which
    keep the ordering of the values but not their magnitudes.
    """
    sold = pandas.to_datetime(self.dataFrame['Date'])
    age = sold.dt.year - self.dataFrame['YearBuilt']
    frame = self.dataFrame.drop('YearBuilt', axis=1)
    frame['Date'] = sold.astype('category').cat.codes
    frame['Age'] = age.astype('category').cat.codes
    self.dataFrame = frame

  def hanlde_Postcode(self):
    """Replace each 'Postcode' value with the mean 'Price' of that postcode.

    Must be called while 'Price' is still in the frame, i.e. before
    seprate_labels() / split_data(). The means are computed over every row
    currently in the frame, so they include rows that later end up in the
    test set.
    """
    # One grouped pass: each row receives the mean price of its own postcode.
    # This cannot confuse a computed mean with a not-yet-processed postcode value.
    self.dataFrame['Postcode'] = (
        self.dataFrame.groupby('Postcode')['Price'].transform('mean'))

  def handle_Lat_Long(self):
    """Represent latitude/longitude (in degrees) as 3 unit-sphere coordinates.

    x = cos(lat) * cos(lon), y = cos(lat) * sin(lon), z = sin(lat).
    Drops 'Lattitude', 'Longtitude' and also 'Landsize'.
    """
    lat = numpy.deg2rad(self.dataFrame['Lattitude'])
    lon = numpy.deg2rad(self.dataFrame['Longtitude'])
    self.dataFrame["x"] = numpy.cos(lat) * numpy.cos(lon)
    self.dataFrame["y"] = numpy.cos(lat) * numpy.sin(lon)
    self.dataFrame["z"] = numpy.sin(lat)
    # NOTE(review): 'Landsize' is unrelated to the coordinates; the drop is kept
    # only because existing callers may rely on it - confirm it is intended.
    self.dataFrame = self.dataFrame.drop(['Lattitude', 'Longtitude', 'Landsize'], axis=1)

  def feature_importance(self):
    """Fit a decision tree on the training split and print its feature importances.

    Calls split_data(), so 'Price' is removed from the frame as a side effect.
    """
    regressor = DecisionTreeRegressor(random_state=0)
    X_train, X_test, Y_train, Y_test = self.split_data()
    regressor.fit(X_train, Y_train)
    print(regressor.feature_importances_)

  def _split_and_scale(self, scaler, test_size, random_state):
    """Split the data, fit ``scaler`` on the training features only, transform both sets."""
    X_train, X_test, Y_train, Y_test = self.split_data(
        test_size=test_size, random_state=random_state)
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), Y_train, Y_test

  def normalization(self, test_size=0.20, random_state=None):
    """Split the data and min-max scale the features.

    Returns:
      ``(X_train_norm, X_test_norm, Y_train, Y_test)``; the feature sets are
      the arrays returned by the scaler's ``transform``.
    """
    return self._split_and_scale(MinMaxScaler(), test_size, random_state)

  def standization(self, test_size=0.20, random_state=None):
    """Split the data and standardize the features with StandardScaler.

    Returns:
      ``(X_train_stand, X_test_stand, Y_train, Y_test)``; the feature sets are
      the arrays returned by the scaler's ``transform``.
    """
    return self._split_and_scale(StandardScaler(), test_size, random_state)

In [None]:
# build the model
# NOTE(review): X_train, X_test, Y_train and Y_test are not defined in the cells
# shown here; presumably they come from data_preproccessing.normalization() or
# .standization() - confirm before running on a fresh kernel.
# The input width is taken from the training matrix instead of a hard-coded
# column count, so the network keeps matching the data when features change.
model = Sequential()
model.add(Dense(30, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(20, activation="relu"))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="linear"))

# compile the model
# `lr` and `decay` are legacy Adam arguments that current Keras optimizers reject.
# InverseTimeDecay with decay_steps=1 reproduces the legacy decay rule:
#   learning_rate = 1e-3 / (1 + (1e-3 / 100) * step)
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-3, decay_steps=1, decay_rate=1e-3 / 100)
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=lr_schedule))

# fit the model
# The test split doubles as the validation set, so 'val_loss' in the history is the test loss.
history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=50, batch_size=500, verbose=2)

In [None]:
# calculate predictions
# Run the fitted model on the test features; pred_test_set is reused by the
# predicted-vs-actual plot and the R-Square computation further down.
pred_test_set = model.predict(X_test)

# save predictions
# (left disabled) uncomment to write the predictions to test_results.csv
# numpy.savetxt("test_results.csv", pred_test_set, delimiter=",")

In [None]:
# plot the training history
# One explicit Axes is used instead of mixing the `pyplot` and `plt` aliases.
fig, ax = plt.subplots()
ax.plot(history.history['val_loss'], label='test')
ax.plot(history.history['loss'], label='train')
ax.set_title('model loss')
ax.set_xlabel('epoch number')
# The model is compiled with loss='mean_squared_error', so the plotted value
# is a mean over samples, not a total over all samples.
ax.set_ylabel('mean squared error')
ax.legend()
plt.show()

In [None]:
# scatter of predicted vs. actual target values on the test set (blue circles, no connecting line)
# (left disabled) reload saved predictions instead of using pred_test_set:
# test_results = numpy.genfromtxt("test_results.csv", delimiter=",")
plt.plot(Y_test, pred_test_set, color='b', marker='o', linestyle='None')
plt.gca().set(title='Test Set', xlabel='Actual', ylabel='Predicted')

# coefficient of determination (R-Square) of the predictions on the test set
TestR2Value = r2_score(y_true=Y_test, y_pred=pred_test_set)
print("Test Set R-Square=", round(TestR2Value, 4))