## Import lib

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [2]:
feature_nbr = 2
columns= ['LotArea', 'TotalBsmtSF']
# n_classes = 1

In [8]:
def load_data(mode='train', columns= ['LotArea', 'TotalBsmtSF']):
    """
    Function to (download and) load the MNIST data
    :param mode: train or test
    :return: images and the corresponding labels
    """
    
    if mode == "train":
        data = pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')
        return data[columns].to_numpy(), data['SalePrice'].to_numpy()
    else:
        data = pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')
        return data[columns].to_numpy()
        


def randomize(x, y):
    """ Randomizes the order of data samples and their corresponding labels"""
    permutation = np.random.permutation(y.shape[0])
    shuffled_x = x[permutation, :]
    shuffled_y = y[permutation]
    return shuffled_x, shuffled_y


def get_next_batch(x, y, start, end):
    x_batch = x[start:end]
    y_batch = y[start:end]
    return x_batch, y_batch


In [9]:
# Load MNIST data
x_train, y_train = load_data(mode='train',columns=columns)
x_test = load_data(mode='test',columns=columns)
print("Size of:")
print("- Training-set:\t\t{}".format(len(y_train)))

Size of:
- Training-set:		1460


In [10]:
print('x_train:\t{}'.format(x_train.shape))
print('y_train:\t{}'.format(y_train.shape))
print('x_test:\t{}'.format(x_test.shape))

x_train:	(1460, 2)
y_train:	(1460,)
x_test:	(1459, 2)


In [11]:
# Hyper-parameters
epochs = 100             # Total number of training epochs
batch_size = 100        # Training batch size
display_freq = 100      # Frequency of displaying the training results
learning_rate = 0.001   # The optimization initial learning rate

h1 = 200                # number of nodes in the 1st hidden layer

In [12]:
# weight and bais wrappers
def weight_variable(name, shape):
    """
    Create a weight variable with appropriate initialization
    :param name: weight name
    :param shape: weight shape
    :return: initialized weight variable
    """
    return tf.get_variable(
        'W_' + name,
        dtype=tf.float32,
        shape=shape,
        initializer=tf.truncated_normal_initializer(stddev=0.01)
    )


def bias_variable(name, shape):
    """
    Create a bias variable with appropriate initialization
    :param name: bias variable name
    :param shape: bias variable shape
    :return: initialized bias variable
    """
    return tf.get_variable(
        'b_' + name,
        dtype=tf.float32,
        initializer=tf.constant(0., shape=shape, dtype=tf.float32)
    )

In [13]:
def fc_layer(x, num_units, name, use_relu=True):
    """
    Create a fully-connected layer
    :param x: input from previous layer
    :param num_units: number of hidden units in the fully-connected layer
    :param name: layer name
    :param use_relu: boolean to add ReLU non-linearity (or not)
    :return: The output array
    """
    in_dim = x.get_shape()[1]
    W = weight_variable(name, shape=[in_dim, num_units])
    b = bias_variable(name, [num_units])
    layer = tf.matmul(x, W)
    layer += b
    if use_relu:
        layer = tf.nn.relu(layer)
    return layer

In [14]:
# Create the graph for the linear model
# Placeholders for inputs (x) and outputs(y)
x = tf.placeholder(tf.float32, shape=[None, feature_nbr], name='X')
y = tf.placeholder(tf.float32, shape=[None], name='Y')

In [15]:
# Create a fully-connected layer with h1 nodes as hidden layer
fc1 = fc_layer(x, h1, 'FC1', use_relu=True)
# Create a fully-connected layer with n_classes nodes as output layer
y_pred = fc_layer(fc1, 1, 'OUT', use_relu=False)

In [16]:
# Define the loss function, optimizer, and accuracy
loss = tf.reduce_mean(tf.squared_difference(y, y_pred))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, name='Adam-op').minimize(loss)
#correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y, 1), name='correct_pred')
#accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

# Network predictions
#cls_prediction = tf.argmax(y_pred, axis=1, name='predictions')
# TODO : metric RMSLE : root mean squarreds log  error 

### Initialisation des variables

In [17]:
# Create the op for initializing all variables
init = tf.global_variables_initializer()

## Phase entrainement du modèle

In [18]:
# Create an interactive session (to keep the session in the other cells)
sess = tf.InteractiveSession()
# Initialize all variables
sess.run(init)
# Number of training iterations in each epoch
num_tr_iter = int(len(y_train) / batch_size)
for epoch in range(epochs):
    print('Training epoch: {}'.format(epoch + 1))
    # Randomly shuffle the training data at the beginning of each epoch 
    x_train, y_train = randomize(x_train, y_train)
    for iteration in range(num_tr_iter):
        start = iteration * batch_size
        end = (iteration + 1) * batch_size
        x_batch, y_batch = get_next_batch(x_train, y_train, start, end)

        # Run optimization op (backprop)
        feed_dict_batch = {x: x_batch, y: y_batch}
        sess.run(optimizer, feed_dict=feed_dict_batch)


Training epoch: 1
Training epoch: 2
Training epoch: 3
Training epoch: 4
Training epoch: 5
Training epoch: 6
Training epoch: 7
Training epoch: 8
Training epoch: 9
Training epoch: 10
Training epoch: 11
Training epoch: 12
Training epoch: 13
Training epoch: 14
Training epoch: 15
Training epoch: 16
Training epoch: 17
Training epoch: 18
Training epoch: 19
Training epoch: 20
Training epoch: 21
Training epoch: 22
Training epoch: 23
Training epoch: 24
Training epoch: 25
Training epoch: 26
Training epoch: 27
Training epoch: 28
Training epoch: 29
Training epoch: 30
Training epoch: 31
Training epoch: 32
Training epoch: 33
Training epoch: 34
Training epoch: 35
Training epoch: 36
Training epoch: 37
Training epoch: 38
Training epoch: 39
Training epoch: 40
Training epoch: 41
Training epoch: 42
Training epoch: 43
Training epoch: 44
Training epoch: 45
Training epoch: 46
Training epoch: 47
Training epoch: 48
Training epoch: 49
Training epoch: 50
Training epoch: 51
Training epoch: 52
Training epoch: 53
Tr

# Prédiction

## Predict sur le train 



In [19]:
y_train_pred = sess.run(y_pred, feed_dict={x: x_train})

### Metric sur le train

In [20]:
from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error( y_train, y_train_pred ))

0.5503599219084664

## Predict sur le test 



In [21]:
y_test_pred = sess.run(y_pred, feed_dict={x: x_test})

### Metric sur le train

In [22]:
from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error( y_train, y_train_pred ))

0.5503599219084664

# Resultat kaggle

In [None]:
# 0.56624

###  Préparation pour kaggle

In [23]:
y_test_pred[660:665]

array([[     nan],
       [75210.31],
       [43802.62],
       [87577.75],
       [91536.6 ]], dtype=float32)

In [24]:
len(y_test_pred)

1459

In [26]:
data_test = pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')

In [27]:
data_result = data_test["Id"].astype(int)
SalePrice = pd.Series(y_test_pred.flatten())

In [28]:
df_result = pd.DataFrame([data_result,SalePrice]).transpose()

In [29]:
df_result.columns = ['Id', 'SalePrice']
df_result["Id"] = df_result["Id"].astype(int)

In [30]:
df_result.to_csv("predict.csv", index=False)