## Data Cleaning

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

In [2]:
def readdata(data_path, user_id):
    """Load the consumption CSV and return the rows belonging to one user.

    Parameters
    ----------
    data_path : str
        Path of the CSV file; it must contain a ``user_id`` column.
    user_id : scalar
        Value compared for equality against the ``user_id`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original row index kept) without the
        ``user_id`` column.
    """
    all_rows = pd.read_csv(data_path)
    belongs_to_user = all_rows["user_id"] == user_id
    return all_rows[belongs_to_user].drop("user_id", axis=1)

In [3]:
def splitting_data(data):
    """Split a frame into model inputs and the ``power_consumption`` target.

    Returns
    -------
    (features, targets)
        ``features`` is ``data`` without the target column; ``targets`` is a
        one-column DataFrame holding only ``power_consumption``.
    """
    label_columns = ['power_consumption']
    features = data.drop(label_columns, axis=1)
    targets = data[label_columns]
    return features, targets

In [4]:
def add_predict(feature, pre):
    """Append the rows of ``pre`` below ``feature`` and renumber the index.

    ``DataFrame.append`` was removed in pandas 2.0, so the rows are stacked
    with ``pd.concat``, which gives the same result on older versions too.

    Parameters
    ----------
    feature : pandas.DataFrame
        Existing feature rows.
    pre : pandas.DataFrame
        Rows to add after the existing ones.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index; neither input is modified.
    """
    return pd.concat([feature, pre], ignore_index=True)

In [5]:
def dummy_variables(data):
    """One-hot encode the calendar and weather columns.

    Each of ``year``, ``month``, ``day``, ``weekday``, ``season``,
    ``climate`` and ``windspeed`` is expanded into indicator columns named
    ``<field>_<value>`` which are appended to the right of the frame; the
    original columns are then removed. The input frame is not modified.
    """
    categorical = ('year', 'month', 'day', 'weekday', 'season', 'climate', 'windspeed')

    # Build every indicator block first, then attach them in one concat.
    indicator_blocks = [
        pd.get_dummies(data[column], prefix=column, drop_first=False)
        for column in categorical
    ]
    widened = pd.concat([data] + indicator_blocks, axis=1)

    return widened.drop(list(categorical), axis=1)

In [6]:
def scaling_variable(data, scaled_features, quant_features):
    """Min-max scale the listed columns of ``data`` to the range [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are rescaled; it is modified in place and also
        returned.
    scaled_features : dict
        Receives ``column -> [max, min]`` for every scaled column so the
        transformation can be inverted later
        (``scaled * (max - min) + min``).
    quant_features : iterable of str
        Names of the columns to scale.

    Returns
    -------
    (data, scaled_features)
        The same two objects that were passed in, after updating.
    """
    # Store scalings in a dictionary so we can convert back later
    for each in quant_features:
        max_, min_ = data[each].max(), data[each].min()
        scaled_features[each] = [max_, min_]
        span = max_ - min_
        if span == 0:
            # A constant column would otherwise become 0/0 = NaN everywhere.
            # Mapping it to 0.0 keeps the inverse transform exact
            # (0 * span + min_ == min_).
            data[each] = 0.0
        else:
            data[each] = (data[each] - min_) / span
    return data, scaled_features

In [7]:
def splitting_validation_test(feature, target):
    """Split the data into training, validation and test parts.

    The last 30 rows of ``feature`` are taken as the test set. The
    remaining rows are paired with ``target`` and split 90% / 10% into
    training and validation sets with a fixed random seed.

    The previous version ignored its arguments and read the module-level
    names ``features`` / ``targets``; it now uses the values passed in.

    Parameters
    ----------
    feature : pandas.DataFrame
        All feature rows, test rows last.
    target : pandas.DataFrame
        Targets for the non-test rows; it must have as many rows as
        ``feature[:-30]``.

    Returns
    -------
    (train_features, train_targets, val_features, val_targets, test_features)
    """
    # splitting test set
    train_features, test_features, train_targets = feature[:-30], feature[-30:], target

    # splitting validation set
    from sklearn.model_selection import train_test_split
    train_features, val_features, train_targets, val_targets = train_test_split(
        train_features, train_targets, test_size=0.1, random_state=42)

    return train_features, train_targets, val_features, val_targets, test_features

In [8]:
def saving_answer(predicton, user_id):
    """Store one user's predictions as a new column of the answer file.

    ``prediction/answer.csv`` is read, a column named ``id<user_id>`` is
    set to the flattened prediction values, and the file is written back
    without an index column.

    Parameters
    ----------
    predicton : numpy.ndarray
        Prediction values; reshaped to one dimension of length
        ``len(predicton)``.
    user_id : int
        Used to build the column name.
    """
    answer_path = "prediction/answer.csv"
    answer = pd.read_csv(answer_path)

    column_name = "id" + str(user_id)
    flat_values = predicton.reshape(len(predicton)).tolist()
    answer[column_name] = flat_values

    answer.to_csv(answer_path, index = None)

## Building Model

In [9]:
from distutils.version import LooseVersion
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer.  You are using {}'.format(tf.__version__)
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU (query the device name once and reuse it)
gpu_device = tf.test.gpu_device_name()
if not gpu_device:
    # The setup cell installs a blanket warnings.filterwarnings("ignore"),
    # which would silently drop this notice. Re-enable warnings just for
    # this call so the missing GPU is actually reported.
    with warnings.catch_warnings():
        warnings.simplefilter('always')
        warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(gpu_device))

TensorFlow Version: 1.0.0


In [10]:
def leaky_relu(x, alpha=0.2, name='leaky_relu'):
    """Leaky ReLU activation: the element-wise maximum of ``x`` and ``alpha * x``.

    Parameters
    ----------
    x : tensor
        Input tensor.
    alpha : float
        Factor applied to ``x`` for the second argument of the maximum.
    name : str
        Name given to the resulting ``tf.maximum`` op.
    """
    scaled = alpha * x
    return tf.maximum(x, scaled, name=name)

In [11]:
def get_batches(features, targets, batch_size):
    """Yield consecutive ``(features, targets)`` mini-batches.

    Every batch holds ``batch_size`` items except possibly the last one,
    which holds the remainder. No empty batch is produced: the previous
    version yielded a trailing empty slice whenever ``len(features)`` was
    an exact multiple of ``batch_size`` (and one empty batch for empty
    input).

    Parameters
    ----------
    features, targets : sliceable sequences of equal length
        For example lists or numpy arrays.
    batch_size : int
        Maximum number of items per batch; must be positive.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive (raised on first iteration).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    for start in range(0, len(features), batch_size):
        end = start + batch_size
        # Slicing past the end is safe, so the final short batch needs no special case.
        yield features[start:end], targets[start:end]


In [12]:
def Create_variable(n_features=65):
    """Create the placeholders that feed the network.

    Parameters
    ----------
    n_features : int, optional
        Width of the input placeholder. Defaults to 65, the value that was
        previously hard-coded, so existing calls without arguments behave
        as before.

    Returns
    -------
    (input_, label_, keep_prob, lr)
        float32 placeholders named ``inputs`` (shape ``[None, n_features]``),
        ``outputs`` (shape ``[None, 1]``), ``keep_prob`` and
        ``learning_rate``.
    """
    input_ = tf.placeholder(tf.float32, [None, n_features], name = "inputs")  # input
    label_ = tf.placeholder(tf.float32, [None, 1], name = "outputs")          # output
    keep_prob = tf.placeholder(tf.float32, name = "keep_prob")                # probability to keep units
    lr = tf.placeholder(tf.float32, name = "learning_rate")                   # learning rate

    return input_, label_, keep_prob, lr

In [13]:
def model_loss(ac_fn, keep_prob, input_, label_):
    """Build the feed-forward network and its mean-squared-error cost.

    The network is two dense hidden layers (128 and 256 units, activation
    ``ac_fn``), each followed by dropout with ``keep_prob``, and a single
    output unit whose activation is ``tf.abs``. Every dense layer uses a
    Xavier kernel initializer.

    Returns
    -------
    (logits, cost)
        The output tensor and the result of
        ``tf.losses.mean_squared_error(logits, label_)``.
    """
    hidden = input_
    for units in (128, 256):
        hidden = tf.layers.dense(
            hidden, units,
            activation=ac_fn,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        hidden = tf.nn.dropout(hidden, keep_prob)

    logits = tf.layers.dense(
        hidden, 1,
        activation=tf.abs,
        kernel_initializer=tf.contrib.layers.xavier_initializer())

    cost = tf.losses.mean_squared_error(logits, label_)

    return logits, cost

In [14]:
def model_opt(cost, learning_rate, beta1 = 0.5):
    """Return the Adam training op that minimizes ``cost``.

    ``learning_rate`` and ``beta1`` are passed positionally, in that order,
    to ``tf.train.AdamOptimizer``.
    """
    adam = tf.train.AdamOptimizer(learning_rate, beta1)
    train_step = adam.minimize(cost)
    return train_step

In [15]:
def build_neural_network(losses, epoch_count, batch_size, learning_rate, dropout, train_features, train_targets,
                         val_features, val_targets, test_features, ac_fn):
    """Train one network and return its validation and test predictions.

    Parameters
    ----------
    losses : dict or None
        Loss history. After every epoch the last mini-batch training loss
        and the validation loss are appended to ``losses['train']`` and
        ``losses['validation']`` (the lists are created if missing).
        Pass ``None`` to skip recording.
    epoch_count : int
        Number of passes over the training set.
    batch_size : int
        Mini-batch size handed to ``get_batches``.
    learning_rate : float
        Learning rate used to build the optimizer.
    dropout : float
        Fed to the ``keep_prob`` placeholder during training, i.e. it is
        the probability of keeping a unit. Evaluation uses 1.
    train_features, train_targets, val_features, val_targets, test_features
        Objects exposing ``.values`` (e.g. pandas DataFrames).
    ac_fn : callable
        Activation function for the hidden layers.

    Returns
    -------
    (losses, validation, prediction)
        The loss history, the network output on the validation features and
        the network output on the test features.
    """
    # Build each model in its own graph. Using the process-wide default
    # graph would keep every previously built network alive, so memory and
    # graph size would grow with each call when training many users in a loop.
    with tf.Graph().as_default():
        input_, label_, keep_prob, lr = Create_variable()
        logits, cost = model_loss(ac_fn, keep_prob, input_, label_)
        optimizer = model_opt(cost, learning_rate)

        val_feed = {input_: val_features.values, label_: val_targets.values, keep_prob: 1}

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            train_loss, val_loss, epoch_i = None, None, None
            for epoch_i in range(epoch_count):
                for x, y in get_batches(train_features.values, train_targets.values, batch_size):
                    if len(x) == 0:
                        # Never run an optimizer step on an empty batch.
                        continue
                    # `lr` is fed for completeness; the optimizer above was
                    # built from the Python float `learning_rate`.
                    feed = {input_: x, label_: y, keep_prob: dropout, lr: learning_rate}
                    train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

                val_loss = sess.run(cost, feed_dict=val_feed)

                # Record the history so plot_loss has something to draw.
                if losses is not None:
                    losses.setdefault('train', []).append(train_loss)
                    losses.setdefault('validation', []).append(val_loss)

            # Report only the final epoch; skip when nothing was trained
            # (epoch_count == 0 or no training batches).
            if train_loss is not None and val_loss is not None:
                print('Epoch {:>3}/{}   train_loss = {:.5f}   validation_loss = {:.5f}'.format(
                    epoch_i,
                    epoch_count,
                    train_loss,
                    val_loss))

            # validation result
            feed = {input_: val_features.values, keep_prob: 1}
            validation = sess.run(logits, feed_dict=feed)

            # prediction result
            feed = {input_: test_features.values, keep_prob: 1}
            prediction = sess.run(logits, feed_dict=feed)

            return losses, validation, prediction

## Check Point

In [16]:
# Hyperparameters
epoch_count = 1000
batch_size = 64
learning_rate = 0.001
dropout = 0.4

In [17]:
def plot_loss(losses, user_id):
    """Plot the training and validation loss curves on the current figure.

    Parameters
    ----------
    losses : dict
        Must provide the sequences ``losses['train']`` and
        ``losses['validation']``.
    user_id : int
        Only used in the printed header line.
    """
    print("losses for user_id " + str(user_id))

    curves = (('train', 'Training loss'), ('validation', 'Validation loss'))
    for key, caption in curves:
        plt.plot(losses[key], label=caption)

    plt.legend()
    # Fixed y-range of 0 to 0.02
    plt.ylim(ymin=0, ymax=0.02)

In [18]:
def draw_validation(validation, val_targets, scaled_features, user_id):
    """Plot predicted against actual validation values in original units.

    Both series are converted back from the min-max scale using the
    ``[max, min]`` pair stored under ``scaled_features['power_consumption']``
    and drawn as a line plus scatter markers on a new 8x4 figure.

    Parameters
    ----------
    validation : array-like
        Scaled network output for the validation rows.
    val_targets : pandas.DataFrame
        Scaled true targets for the same rows.
    scaled_features : dict
        Must contain ``'power_consumption' -> [max, min]``.
    user_id : int
        Only used in the printed header line.
    """
    print("validation test for user_id " + str(user_id))

    fig, ax = plt.subplots(figsize=(8,4))
    upper, lower = scaled_features['power_consumption']
    positions = range(len(val_targets))

    # Predictions, mapped back to the original scale
    predicted = validation*(upper - lower) + lower
    ax.plot(predicted, label='Prediction')
    ax.scatter(positions, predicted)

    # Ground truth, mapped back the same way
    actual = (val_targets*(upper - lower) + lower).values
    ax.plot(actual, label='Data')
    ax.scatter(positions, actual)

    ax.legend()

# Test

In [None]:
# Single-user dry run of the full pipeline (load -> encode -> scale -> split -> train -> plot).

# Read the prediction set (rows to predict for; appended to the features below)
add_data = pd.read_csv("prediction/predict.csv")
data_path = "EN_Tianchi_power_v2.csv"    
user_id = 300
data = readdata(data_path, user_id)    
# Split the dataset into features and targets
features, targets = splitting_data(data)

# Add the prediction (test) set to the feature set so it is encoded and scaled together with the history
features = add_predict(features, add_data)

# One-hot encode the categorical variables
features = dummy_variables(features)

# Use a dictionary to save the [max, min] scaling values
scaled_features = {}

# Min-max scaling for the features
quant_features = ["temp"]
features, scaled_features = scaling_variable(features, scaled_features, quant_features)

# Min-max scaling for the targets
quant_features = ["power_consumption"]
targets, scaled_features = scaling_variable(targets, scaled_features, quant_features)

# Split into the test set (last 30 feature rows), validation set and training set
train_features, train_targets, val_features, val_targets, test_features = splitting_validation_test(features, targets)

# Use a dictionary to collect the losses so they can be plotted for analysis
losses = {'train':[], 'validation':[]}

print("User_id " + str(user_id) + " training:")
# Train the model for this user
losses, validation, prediction = build_neural_network(losses, epoch_count, batch_size, learning_rate, dropout, 
                                                    train_features, train_targets, val_features, val_targets, test_features, leaky_relu)
# Plot the loss curves
plot_loss(losses, user_id)

# Plot predictions against the validation targets
draw_validation(validation, val_targets, scaled_features, user_id)

In [None]:
# Ad-hoc inspection: load a single user's data without training anything.

# Read the prediction set
add_data = pd.read_csv("prediction/predict.csv")
data_path = "EN_Tianchi_power_v2.csv"    
user_id = 259
data = readdata(data_path, user_id)    
# Split the dataset into features and targets
features, targets = splitting_data(data)


In [None]:
# Show the first rows of the current `targets` frame
targets.head()

# Saving the prediction

In [19]:
# Read the prediction set
add_data = pd.read_csv("prediction/predict.csv")
data_path = "EN_Tianchi_power_v2.csv"

# Load the full history once. Re-reading the whole CSV for every user
# repeats the same disk read on each iteration of the loop.
power_all = pd.read_csv(data_path)

for user_id in range(1300, 1455):
    # Select this user's rows and drop the id column
    # (same result as readdata(data_path, user_id))
    data = power_all.loc[power_all.user_id == user_id, :].drop(["user_id"], axis=1)
    if data.empty:
        # Without rows there is nothing to train on; skip this id instead
        # of aborting the whole run.
        print("User_id " + str(user_id) + " has no rows, skipped")
        continue

    # Split the dataset into features and targets
    features, targets = splitting_data(data)

    # Add the prediction (test) set to the feature set
    features = add_predict(features, add_data)

    # One-hot encode the categorical variables
    features = dummy_variables(features)

    # Use a dictionary to save the [max, min] scaling values
    scaled_features = {}

    # Min-max scaling for the features
    quant_features = ["temp"]
    features, scaled_features = scaling_variable(features, scaled_features, quant_features)

    # Min-max scaling for the targets
    quant_features = ["power_consumption"]
    targets, scaled_features = scaling_variable(targets, scaled_features, quant_features)

    # Split into the test set, validation set and training set
    train_features, train_targets, val_features, val_targets, test_features = splitting_validation_test(features, targets)

    # Use a dictionary to collect the losses so they can be plotted for analysis
    losses = {'train':[], 'validation':[]}

    print("User_id " + str(user_id) + " training:")
    # Train the model for this user
    losses, validation, prediction = build_neural_network(losses, epoch_count, batch_size, learning_rate, dropout,
                                                        train_features, train_targets, val_features, val_targets, test_features, leaky_relu)
    # plot the loss
    #plot_loss(losses, user_id)

    # plot the validation test
    #draw_validation(validation, val_targets, scaled_features, user_id)

    # Read the max and min values to convert the prediction back to real units
    max_, min_ = scaled_features['power_consumption']

    # Save the prediction
    saving_answer(prediction*(max_ - min_) + min_, user_id)

User_id 1300 training:
Epoch 999/1000   train_loss = 0.00328   validation_loss = 0.01457
User_id 1301 training:
Epoch 999/1000   train_loss = 0.00388   validation_loss = 0.02340
User_id 1302 training:
Epoch 999/1000   train_loss = 0.00628   validation_loss = 0.01797
User_id 1303 training:
Epoch 999/1000   train_loss = 0.01189   validation_loss = 0.12375
User_id 1304 training:
Epoch 999/1000   train_loss = 0.00157   validation_loss = 0.01328
User_id 1305 training:
Epoch 999/1000   train_loss = 0.00335   validation_loss = 0.00432
User_id 1306 training:
Epoch 999/1000   train_loss = 0.00116   validation_loss = 0.00459
User_id 1307 training:
Epoch 999/1000   train_loss = 0.01026   validation_loss = 0.05053
User_id 1308 training:
Epoch 999/1000   train_loss = 0.00236   validation_loss = 0.02064
User_id 1309 training:
Epoch 999/1000   train_loss = 0.00375   validation_loss = 0.01918
User_id 1310 training:
Epoch 999/1000   train_loss = 0.01019   validation_loss = 0.04750
User_id 1311 training

Epoch 999/1000   train_loss = 0.00561   validation_loss = 0.02523
User_id 1393 training:
Epoch 999/1000   train_loss = 0.00193   validation_loss = 0.03424
User_id 1394 training:
Epoch 999/1000   train_loss = 0.00458   validation_loss = 0.01986
User_id 1395 training:
Epoch 999/1000   train_loss = 0.00194   validation_loss = 0.00458
User_id 1396 training:
Epoch 999/1000   train_loss = 0.00182   validation_loss = 0.00908
User_id 1397 training:
Epoch 999/1000   train_loss = 0.00518   validation_loss = 0.03629
User_id 1398 training:
Epoch 999/1000   train_loss = 0.01088   validation_loss = 0.04048
User_id 1399 training:
Epoch 999/1000   train_loss = 0.00261   validation_loss = 0.02413
User_id 1400 training:
Epoch 999/1000   train_loss = 0.00261   validation_loss = 0.01229
User_id 1401 training:
Epoch 999/1000   train_loss = 0.00384   validation_loss = 0.01698
User_id 1402 training:
Epoch 999/1000   train_loss = 0.00239   validation_loss = 0.01518
User_id 1403 training:
Epoch 999/1000   trai