In [None]:
# imports
import json
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn.model_selection
import sklearn.preprocessing
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support

import keras
import keras.backend as K
from keras import optimizers
from keras.layers import Dense, Activation
from keras.models import Model, Sequential
from keras.models import load_model

In [None]:
# Reset the Keras backend state before any model is built in this notebook.
K.clear_session()

In [None]:
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records training loss and validation accuracy.

    Attributes:
        loss: value of ``logs['loss']`` appended after every training batch.
        val_acc: validation accuracy appended after every epoch; ``None`` is
            stored when the logs carry neither ``'val_acc'`` nor
            ``'val_accuracy'``.
    """

    def on_train_begin(self, logs=None):
        """Start both histories from empty lists when training begins."""
        self.loss = []
        self.val_acc = []

    def on_batch_end(self, batch, logs=None):
        """Append the loss reported for the batch that just finished."""
        logs = logs or {}
        self.loss.append(logs.get('loss'))

    def on_epoch_end(self, epoch, logs=None):
        """Append the validation accuracy reported for the finished epoch."""
        logs = logs or {}
        # The metric key depends on the Keras release ('val_acc' in older
        # versions, 'val_accuracy' in newer ones); accept either so the
        # history is not silently filled with None.
        val_acc = logs.get('val_acc')
        if val_acc is None:
            val_acc = logs.get('val_accuracy')
        self.val_acc.append(val_acc)


In [None]:
# def ldf(directory, rows, skip=0):
#     df = pd.read_csv("../input/train_v2.csv",
#                             nrows = rows,
#                             skiprows = skip
#                     )
#     df = df.drop(["date", "socialEngagementType", "visitStartTime", "visitId", "fullVisitorId","hits"], axis=1)
#     devices_df = pd.DataFrame(df.device.apply(json.loads).tolist())[["browser", "operatingSystem", "deviceCategory", "isMobile"]]
#     geo_df = pd.DataFrame(df.geoNetwork.apply(json.loads).tolist())[["continent", "subContinent", "country", "city"]]
#     traffic_source_df = pd.DataFrame(df.trafficSource.apply(json.loads).tolist())[["keyword", "medium", "source"]]
#     totals_df = pd.DataFrame(df.totals.apply(json.loads).tolist())[["transactionRevenue", "newVisits", "bounces", "pageviews", "hits"]]
#     df = pd.concat([df, devices_df, geo_df, traffic_source_df, totals_df], axis=1)
#     df = df.drop(["device", "geoNetwork", "trafficSource", "totals"], axis=1)
#     df["transactionRevenue"] = df["transactionRevenue"].fillna(0)
#     df["bounces"] = df["bounces"].fillna(0)
#     df["pageviews"] = df["pageviews"].fillna(0)
#     df["hits"] = df["hits"].fillna(0)
#     df["newVisits"] = df["newVisits"].fillna(0)
    
#     df["transactionRevenue"] = df["transactionRevenue"].astype(np.float)
    
#     cat_features = ['channelGrouping', 'browser', 'operatingSystem', 'deviceCategory', 'isMobile',
#                         'continent', 'subContinent', 'country', 'city', 'keyword', 'medium', 
#                         'source', 'customDimensions'] #strings -> need to map to floats?
    
#     for c in cat_features:
#         le = preprocessing.LabelEncoder()
#         le.fit(list(df[c].values.astype("str")))
#         df[c] = le.transform(list(df[c].values.astype("str")))
    
#     numerical_features = ['visitNumber', 'newVisits', 'bounces', 'pageviews', "hits"]
#     for c in numerical_features:
#         df[c] = df[c].astype(np.float)
    
# #     df = df.loc[df['transactionRevenue'] != 0.0]
    
#     return df


def _expand_json_column(df, column, fields):
    """Parse a column of JSON strings and return only `fields` as a frame.

    Fields that are absent from every parsed record come back as all-NaN
    columns. If the column cannot be parsed at all, a warning is emitted and
    an all-zero frame with one row per row of `df` is returned instead.
    """
    try:
        parsed = pd.DataFrame(df[column].apply(json.loads).tolist(), index=df.index)
    except (KeyError, TypeError, ValueError) as exc:
        warnings.warn(
            "could not parse JSON column {!r} ({}); using zeros for {}".format(column, exc, fields),
            RuntimeWarning,
        )
        # Size the fallback by the rows actually read, not the rows requested,
        # so the later column-wise concat stays aligned.
        return pd.DataFrame(np.zeros((len(df), len(fields))), columns=fields, index=df.index)
    return parsed.reindex(columns=fields)


def ldf(directory, rows, skip=0, random_state=None):
    """Load a slice of a sessions CSV and turn it into a numeric feature frame.

    Parameters
    ----------
    directory : str
        Path of the CSV file to read (the parameter name is kept for
        backward compatibility; callers pass a file path).
    rows : int
        Maximum number of data rows to read (``nrows`` of ``pd.read_csv``).
    skip : int or list of int, default 0
        Forwarded to ``skiprows`` of ``pd.read_csv``.
    random_state : int or None, default None
        Seed for the down-sampling of zero-revenue rows; ``None`` keeps the
        sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        All rows with non-zero ``transactionRevenue`` followed by a 75%
        sample of the zero-revenue rows, with a fresh integer index.
        Categorical columns are label-encoded and numeric columns are floats.
    """
    df = pd.read_csv(directory, nrows=rows, skiprows=skip)
    df = df.drop(["date", "socialEngagementType", "visitStartTime", "visitId", "fullVisitorId", "hits"], axis=1)

    devices_df = _expand_json_column(df, "device", ["browser", "operatingSystem", "deviceCategory", "isMobile"])
    geo_df = _expand_json_column(df, "geoNetwork", ["continent", "subContinent", "country", "city"])
    traffic_source_df = _expand_json_column(df, "trafficSource", ["keyword", "medium", "source"])
    totals_df = _expand_json_column(df, "totals", ["transactionRevenue", "newVisits", "bounces", "pageviews", "hits"])

    df = pd.concat([df, devices_df, geo_df, traffic_source_df, totals_df], axis=1)
    df = df.drop(["device", "geoNetwork", "trafficSource", "totals"], axis=1)

    # Missing totals mean "none recorded": treat them as 0 and store as floats.
    for c in ["transactionRevenue", "bounces", "pageviews", "hits", "newVisits"]:
        df[c] = pd.to_numeric(df[c]).fillna(0).astype(float)
    df["visitNumber"] = df["visitNumber"].astype(float)

    cat_features = ['channelGrouping', 'browser', 'operatingSystem', 'deviceCategory', 'isMobile',
                    'continent', 'subContinent', 'country', 'city', 'keyword', 'medium',
                    'source', 'customDimensions']
    # Encode every categorical column as integer codes of its string values.
    # NOTE(review): the encoder is refitted on every call, so the same code can
    # stand for different categories in different frames.
    for c in cat_features:
        values = df[c].values.astype("str")
        df[c] = preprocessing.LabelEncoder().fit_transform(values)

    # Keep every revenue row but only 75% of the zero-revenue rows.
    has_revenue = df["transactionRevenue"] != 0.0
    zero_sample = df.loc[~has_revenue].sample(frac=0.75, random_state=random_state)
    return pd.concat([df.loc[has_revenue], zero_sample], ignore_index=True)

<h2>First Attempt</h2>

In [None]:
# Clear any Keras state left over from previously run cells before the first experiment.
K.clear_session()

In [None]:
%%time
K.clear_session()
number_rows_read = 0
rows = 30000
scores = []
accuracy = []

while (number_rows_read < 90000):
    dftrain = ldf('../input/train_v2.csv', rows, list(range(1,number_rows_read)))
    dftest = ldf('../input/test_v2.csv', rows, list(range(1,number_rows_read)))
    
    y_tr = np.log1p(np.array(dftrain["transactionRevenue"]))
    y_tr = y_tr.reshape(y_tr.shape[0],1)
    dftrain = dftrain.drop(["transactionRevenue"], axis=1)
    x_tr = np.array(dftrain)
    
    y_test = np.log1p(np.array(dftest["transactionRevenue"]))
    y_test = y_test.reshape(y_test.shape[0],1)
    dftest = dftest.drop(["transactionRevenue"], axis=1)
    x_test = np.array(dftest)
    
    # scale the training/test data
    Xtr_scale = (x_tr - np.mean(x_tr, axis = 0)) / np.std(x_tr, axis = 0)
    Xts_scale = (x_test - np.mean(x_test, axis = 0)) / np.std(x_test, axis = 0)
    
    nin = x_tr.shape[1]
    nh = 25
    nout = int(np.max(y_tr)+1)
    model = Sequential()
    model.add(Dense(nh, input_shape=(nin,), activation='sigmoid', name='hidden'))
    model.add(Dense(1, name='output')) 

    history_cb = LossHistory()
    opt = optimizers.Adam(lr = 0.001)
    model.compile(optimizer = opt, loss = 'mean_squared_error', metrics = ['accuracy'])


    batch_size = 10
    model.fit(Xtr_scale, y_tr, epochs = 10, batch_size = batch_size, validation_data = (Xts_scale, y_test), callbacks = [history_cb])
    
    number_rows_read += rows
    
    score, acc = model.evaluate(Xts_scale, y_test)
    scores.append(score)
    accuracy.append(acc)

print("scores", scores)
print("accuracy", accuracy)

In [None]:
# Spot-check the most recently trained model on rows that have revenue:
# first a single example, then the mean squared error over all of them.
test_rows = 2500
number_rows_read = 15000
df_eval = ldf('../input/test_v2.csv', test_rows, list(range(1, number_rows_read)))
df = df_eval.loc[df_eval['transactionRevenue'] != 0.0]

y_test = np.log1p(np.array(df["transactionRevenue"])).reshape(-1, 1)
x_test = np.array(df.drop(["transactionRevenue"], axis=1))

# The model was fitted on standardised features, so the inputs must be put on
# the same scale before predicting. `x_tr` holds the training features of the
# last fitted model; constant columns are divided by 1 to avoid NaN.
feat_mean = np.mean(x_tr, axis = 0)
feat_std = np.std(x_tr, axis = 0)
feat_std[feat_std == 0] = 1.0
x_test_scaled = (x_test - feat_mean) / feat_std

if len(y_test) == 0:
    print("no rows with revenue in this sample")
else:
    # Row 14 is the example inspected by hand; fall back to the last row when
    # the sample holds fewer revenue rows than that.
    sample_idx = min(14, len(y_test) - 1)
    yhat = model.predict(x_test_scaled[sample_idx:sample_idx + 1])
    print(yhat)
    print(y_test[sample_idx:sample_idx + 1])

    print("~"*20)
    yhat = model.predict(x_test_scaled)
    print(np.mean((yhat - y_test) ** 2))

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

<h2>Using different activation functions</h2>

In [None]:
# Fresh Keras state and empty result containers for the activation sweep.
K.clear_session()
activations = ['sigmoid', 'tanh', 'relu']
scores, accuracy = [], []
loss_hist, val_acc_hist = [], []

In [None]:
# Compare hidden-layer activation functions with everything else held fixed.
rows = 90000
nh = 25
batch_size = 10

# Loading and scaling do not depend on the activation, so they are done once;
# every activation is then trained and scored on the same sample.
dftrain = ldf('../input/train_v2.csv', rows)
dftest = ldf('../input/test_v2.csv', rows)

# Targets are log1p(revenue), shaped (n, 1) to match the single output unit.
y_tr = np.log1p(np.array(dftrain["transactionRevenue"])).reshape(-1, 1)
dftrain = dftrain.drop(["transactionRevenue"], axis=1)
x_tr = np.array(dftrain)

y_test = np.log1p(np.array(dftest["transactionRevenue"])).reshape(-1, 1)
dftest = dftest.drop(["transactionRevenue"], axis=1)
x_test = np.array(dftest)

# Standardise both splits with the TRAINING statistics; constant columns have
# a zero std and are divided by 1 instead to avoid NaN features.
feat_mean = np.mean(x_tr, axis = 0)
feat_std = np.std(x_tr, axis = 0)
feat_std[feat_std == 0] = 1.0
Xtr_scale = (x_tr - feat_mean) / feat_std
Xts_scale = (x_test - feat_mean) / feat_std

nin = x_tr.shape[1]

for a in activations:
    K.clear_session()

    model = Sequential()
    model.add(Dense(nh, input_shape=(nin,), activation=a, name='hidden'))
    model.add(Dense(1, name='output'))

    history_cb = LossHistory()
    opt = optimizers.Adam(lr = 0.001)
    model.compile(optimizer = opt, loss = 'mean_squared_error', metrics = ['accuracy'])

    model.fit(Xtr_scale, y_tr, epochs = 10, batch_size = batch_size, validation_data = (Xts_scale, y_test), callbacks = [history_cb])

    score, acc = model.evaluate(Xts_scale, y_test)
    scores.append(score)
    accuracy.append(acc)

    # One per-batch loss curve and one per-epoch validation curve per activation.
    loss_hist.append(history_cb.loss)
    val_acc_hist.append(history_cb.val_acc)

print("scores", scores)
print("accuracy", accuracy)

In [None]:
# Per-batch training loss for each activation, on a log scale, with the x-axis
# expressed in epochs (batches seen * batch size / number of training rows).
ntr = Xtr_scale.shape[0]
fig, ax = plt.subplots()
for name, losses, colour in zip(activations, loss_hist, ["r", "b", "g"]):
    # Build the x-axis from the series being drawn so its length always matches.
    epochs = (np.arange(len(losses)) + 1) * batch_size / ntr
    ax.semilogy(epochs, losses, c=colour, label=name)
ax.set_ylabel("loss")
ax.set_xlabel("epochs")
ax.legend(title="activation")
ax.grid()
plt.show()

In [None]:
# Final test loss reached by each activation function.
plt.scatter(activations, scores)
current_axes = plt.gca()
current_axes.set_xlabel("activation")
current_axes.set_ylabel("score")
plt.show()

<h2>Using different number of hidden nodes</h2>

In [None]:
# Fresh Keras state and empty result containers for the hidden-node sweep.
K.clear_session()
hidden_nodes = [10, 25, 100]
scores, accuracy = [], []
loss_hist_nodes, val_acc_hist_nodes = [], []

In [None]:
for n in hidden_nodes:
    rows = 90000
    number_rows_read = 0

    K.clear_session()

    dftrain = ldf('../input/train_v2.csv', rows, list(range(1,number_rows_read)))
    dftest = ldf('../input/test_v2.csv', rows, list(range(1,number_rows_read)))

    y_tr = np.log1p(np.array(dftrain["transactionRevenue"]))
    y_tr = y_tr.reshape(y_tr.shape[0],1)
    dftrain = dftrain.drop(["transactionRevenue"], axis=1)
    x_tr = np.array(dftrain)

    y_test = np.log1p(np.array(dftest["transactionRevenue"]))
    y_test = y_test.reshape(y_test.shape[0],1)
    dftest = dftest.drop(["transactionRevenue"], axis=1)
    x_test = np.array(dftest)

    # scale the training/test data
    Xtr_scale = (x_tr - np.mean(x_tr, axis = 0)) / np.std(x_tr, axis = 0)
    Xts_scale = (x_test - np.mean(x_test, axis = 0)) / np.std(x_test, axis = 0)

    nin = x_tr.shape[1]
    nh = 25
    model = Sequential()
    model.add(Dense(n, input_shape=(nin,), activation='relu', name='hidden'))
    model.add(Dense(1, name='output')) 

    history_cb = LossHistory()
    opt = optimizers.Adam(lr = 0.001)
    model.compile(optimizer = opt, loss = 'mean_squared_error', metrics = ['accuracy'])
    
    batch_size = 10
    model.fit(Xtr_scale, y_tr, epochs = 10, batch_size = batch_size, validation_data = (Xts_scale, y_test), callbacks = [history_cb])

    number_rows_read += rows

    score, acc = model.evaluate(Xts_scale, y_test)
    scores.append(score)
    accuracy.append(acc)

    loss_hist.append(history_cb.loss)
    val_acc_hist.append(history_cb.val_acc)


print("scores", scores)
print("accuracy", accuracy)

In [None]:
epochs = []
ntr = Xtr_scale.shape[0]
for i in range(len(history_cb.loss)):
    epochs.append(((i+1)*batch_size) / ntr)
plt.semilogy(epochs, loss_hist[0], c="r")
plt.semilogy(epochs, loss_hist[1], c="b")
plt.semilogy(epochs, loss_hist[2], c="g")
plt.ylabel("loss")
plt.xlabel("epochs")
plt.grid()

In [None]:
# Final test loss reached by each hidden-layer width.
plt.scatter(hidden_nodes, scores)
current_axes = plt.gca()
current_axes.set_xlabel("number nodes")
current_axes.set_ylabel("score")
plt.show()

<h2>Using different learning rates</h2>

In [None]:
# Compare Adam learning rates with everything else held fixed.
learning_rates = [0.01, 0.001, 0.0001]
loss_hist_learn = []
val_acc_hist_learn = []
scores = []
accuracy = []

rows = 90000
nh = 25
batch_size = 10

# Loading and scaling do not depend on the learning rate, so they are done
# once; every rate is then trained and scored on the same sample.
dftrain = ldf('../input/train_v2.csv', rows)
dftest = ldf('../input/test_v2.csv', rows)

# Targets are log1p(revenue), shaped (n, 1) to match the single output unit.
y_tr = np.log1p(np.array(dftrain["transactionRevenue"])).reshape(-1, 1)
dftrain = dftrain.drop(["transactionRevenue"], axis=1)
x_tr = np.array(dftrain)

y_test = np.log1p(np.array(dftest["transactionRevenue"])).reshape(-1, 1)
dftest = dftest.drop(["transactionRevenue"], axis=1)
x_test = np.array(dftest)

# Standardise both splits with the TRAINING statistics; constant columns have
# a zero std and are divided by 1 instead to avoid NaN features.
feat_mean = np.mean(x_tr, axis = 0)
feat_std = np.std(x_tr, axis = 0)
feat_std[feat_std == 0] = 1.0
Xtr_scale = (x_tr - feat_mean) / feat_std
Xts_scale = (x_test - feat_mean) / feat_std

nin = x_tr.shape[1]

for lr in learning_rates:
    K.clear_session()

    model = Sequential()
    model.add(Dense(nh, input_shape=(nin,), activation='relu', name='hidden'))
    model.add(Dense(1, name='output'))

    history_cb = LossHistory()
    opt = optimizers.Adam(lr = lr)
    model.compile(optimizer = opt, loss = 'mean_squared_error', metrics = ['accuracy'])

    model.fit(Xtr_scale, y_tr, epochs = 10, batch_size = batch_size, validation_data = (Xts_scale, y_test), callbacks = [history_cb])

    score, acc = model.evaluate(Xts_scale, y_test)
    scores.append(score)
    accuracy.append(acc)

    # Store the curves in this experiment's own lists rather than in the
    # lists that belong to the activation experiment.
    loss_hist_learn.append(history_cb.loss)
    val_acc_hist_learn.append(history_cb.val_acc)


print("scores", scores)
print("accuracy", accuracy)

In [None]:
# Final test loss against learning rate, with the rates on a log axis.
plt.semilogx(learning_rates, scores)
current_axes = plt.gca()
current_axes.set_xlabel("learning rate")
current_axes.set_ylabel("score")
plt.show()