In [1]:
import sys

import warnings
warnings.filterwarnings("ignore")

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Dropout
from keras.optimizers import SGD, RMSprop, Adam
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.models import load_model

import numpy as np
import pandas as pd
import time

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from HW5_functions import plot_data, plot_decision_boundary, plot_learning_curve

Using TensorFlow backend.


In [2]:
#--------  generate_fractals
# A function to generate 'n' Sierpinski fractal data points at scale 'm'
# Input: 
#      n (number of data points)
#      m (scale to which the Sierpinski is generated)
# Output:
#      n x 3 data set (covariates, labels)

def generate_fractals(n, m):
    """Build a shuffled, labelled sample of Sierpinski-triangle data.

    ceil(n/2) rows are drawn from the fractal with ``pick_point`` and
    labelled 1; floor(n/2) rows are drawn with ``np.random.rand`` over the
    unit square and labelled 0.

    Parameters
    ----------
    n : total number of rows to generate.
    m : scale of the fractal, forwarded to ``pick_point``.

    Returns
    -------
    ``n x 3`` array: columns 0-1 are the coordinates, column 2 is the
    0/1 label. Rows are returned in a random order.
    """
    # Corners of the outermost triangle
    corner_a, corner_b, apex = [0, 0], [1, 0], [.5, 0.99]

    half = n / 2.
    n_pos, n_neg = int(np.ceil(half)), int(np.floor(half))

    # Positive class: label column starts at 1, coordinates are filled in
    # row by row with one fractal draw each
    positives = np.ones((n_pos, 3))
    for row in positives:
        point = pick_point(corner_a, corner_b, apex, m)
        row[0], row[1] = point[0], point[1]

    # Negative class: uniform draws over the whole square, label column 0
    negatives = np.hstack((np.random.rand(n_neg, 2), np.zeros((n_neg, 1))))

    # Stack both classes and hand the rows back in shuffled order
    combined = np.vstack((positives, negatives))
    shuffle = np.random.permutation(n)
    return combined[shuffle, :]


# Function to compute the midpoint of two points
def midpoint(point1, point2):
    """Return the point halfway between two 2-D points as an ``[x, y]`` list."""
    x_mid = (point1[0] + point2[0]) / 2.
    y_mid = (point1[1] + point2[1]) / 2.
    return [x_mid, y_mid]


# Function to compute the center of a triangle based on proportions alpha1, alpha2 and alpha3
def center(vertex1, vertex2, vertex3, alpha1, alpha2, alpha3):
    """Return the alpha-weighted combination of three 2-D vertices.

    Each coordinate of the result is
    ``alpha1*vertex1 + alpha2*vertex2 + alpha3*vertex3`` for that axis;
    the result is an ``[x, y]`` list.
    """
    def weighted(axis):
        # Same left-to-right accumulation for both axes
        return alpha1*vertex1[axis] + alpha2*vertex2[axis] + alpha3*vertex3[axis]

    return [weighted(0), weighted(1)]


# Function to draw a random point from a Sierpinski triangle at scale 'n'
def pick_point(vertex1, vertex2, vertex3, n):
    """Draw one random point from a Sierpinski triangle at scale ``n``.

    While ``n`` is non-zero, one of the three corners is chosen at random
    and the call recurses (with ``n - 1``) into the sub-triangle formed by
    that corner and the midpoints of its two adjacent edges. At ``n == 0``
    a point inside the current triangle is returned as a combination of
    the vertices with three random weights normalised to sum to 1.

    Returns an ``[x, y]`` list.
    """
    if n != 0:
        corners = (vertex1, vertex2, vertex3)
        keep = np.random.randint(0, 3)
        anchor = corners[keep]
        # The kept corner stays in its slot; the other two slots move to
        # the midpoint between the kept corner and the original vertex.
        shrunk = [anchor if idx == keep else midpoint(anchor, other)
                  for idx, other in enumerate(corners)]
        return pick_point(shrunk[0], shrunk[1], shrunk[2], n - 1)

    # Base case: three random weights, normalised
    w1 = np.random.rand()
    w2 = np.random.rand()
    w3 = np.random.rand()
    total = w1 + w2 + w3
    return center(vertex1, vertex2, vertex3, w1/total, w2/total, w3/total)

In [None]:
# np.save('kaggle_data/data_train', data_train)

In [None]:
# data_train = np.load('kaggle_data/data_train.npy')

# df_train = pd.DataFrame(data_train)
# df_train.head()

# X_train = df_train.iloc[:, :2].values
# Y_train = df_train.iloc[:, 2].values

# dev_mask = np.random.choice(len(X_train), size=10000, replace=False)
# X_dev = X_train[dev_mask, :]
# Y_dev = Y_train[dev_mask]

# dev_vali_mask = np.random.choice(10000, size=1000, replace=False)
# X_dev_vali = X_dev[dev_vali_mask, :]
# Y_dev_vali = Y_dev[dev_vali_mask]

In [None]:
# # visualize data
# fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# plot_data(X_train, Y_train, axes[0], 'Train Data')
# plot_data(X_dev, Y_dev, axes[1], 'Dev Train Data')

In [None]:
# Training data: load the cached array from disk rather than regenerating it.
# The commented-out call is the generator invocation kept for reference;
# presumably it produced the cached file -- TODO confirm.
#data_train = generate_fractals(100000000, 10)
data_train = np.load('kaggle_data/data_train_large.npy')

In [3]:
# Fresh scale-10 sample of 100000 points; it is sliced into X_vali / Y_vali
# further down and used as the validation set.
data_test_10 = generate_fractals(100000, 10)

In [None]:
# np.save('kaggle_data/data_train_large', data_train)

In [None]:
# Pick a random subset of rows (without replacement) to train on.
# The subset size is capped at the number of rows available: with
# replace=False, np.random.choice raises ValueError when asked for more
# samples than the population holds.
mask = np.random.choice(len(data_train), size=min(10000000, len(data_train)), replace=False)

# Columns 0-1 are used as the predictors and column 2 as the label
# (the layout produced by generate_fractals; assumes the cached file
# follows it -- TODO confirm).
X_train = data_train[mask, :2]
Y_train = data_train[mask, 2]

In [4]:
# Validation set: first two columns are the predictors, third is the label.
X_vali, Y_vali = data_test_10[:, :2], data_test_10[:, 2]

In [None]:
# # Feedforward network: 10 hidden layers of 32 ReLU units, sigmoid output (no kernel constraint)
# input_dim = 2
# model = Sequential()
# model.add(Dense(32, input_dim=input_dim, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(32, activation='relu'))

# # model.add(Dropout(0.2))

# model.add(Dense(1, activation='sigmoid')) 
# model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# t0 = time.time()
# model_fit = model.fit(X_train, Y_train, batch_size=1000, epochs=200, verbose=1, 
#                                       validation_data=(X_vali, Y_vali))
# t1 = time.time()
# print('fitting time: {} s'.format(t1 - t0))

In [None]:
# score = model.evaluate(X_vali, Y_vali, verbose=0)
# print('validation loss: {}'.format(score[0]))
# print('validation accuracy: {}'.format(score[1]))

# model.save('kaggle_data/models/my_model_seq.h5')

# fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# plot_decision_boundary(X_vali, Y_vali, model, 
#                        "Decision Boundary \nNN - Kaggle", 
#                        axes[0])

# plot_learning_curve(model_fit, axes[1])
# axes[1].set_title("Learning Curve \nNN - Kaggle")
# plt.show()

In [None]:
# fig, ax = plt.subplots(1, 1, figsize=(6, 5))
# plot_data(X_train, Y_train, ax, 'Training Data: {}'.format(len(X_train)))

In [7]:
# Load a saved Keras model from disk. The commented-out lines load
# alternative saved models under different names.
model_seq = load_model('kaggle_data/models/my_model_seq_large.h5')
#model_large = load_model('kaggle_data/models/my_model_n10000000e100.h5')
#model_region = load_model('kaggle_data/models/my_model_seq_region.h5')

In [8]:
# Evaluate the loaded model on the validation set.
# score[0] is reported as the loss and score[1] as the accuracy
# (assumes accuracy is the model's first compiled metric -- TODO confirm).
score = model_seq.evaluate(X_vali, Y_vali, verbose=0)
vali_loss = score[0]
print('loss: {}'.format(vali_loss))
vali_accuracy = score[1]
print('accuracy: {}'.format(vali_accuracy))

loss: 0.27156019155979155
accuracy: 0.89841


In [None]:
# score = model_large.evaluate(X_vali, Y_vali, verbose=0)
# print('loss: {}'.format(score[0]))
# print('accuracy: {}'.format(score[1]))

In [None]:
# score = model_region.evaluate(X_vali, Y_vali, verbose=0)
# print('loss: {}'.format(score[0]))
# print('accuracy: {}'.format(score[1]))

In [None]:
# train_score = model.evaluate(X_train, Y_train, verbose=0)
# print('Train loss: {}'.format(train_score[0]))
# print('Train accuracy: {}'.format(train_score[1]))

# vali_score = model.evaluate(X_vali, Y_vali, verbose=0)
# print('Validation loss: {}'.format(vali_score[0]))
# print('Validation accuracy: {}'.format(vali_score[1]))

In [None]:
# Continue training the model loaded above on the sampled training subset,
# passing (X_vali, Y_vali) as validation data. The value returned by fit()
# is kept in model_fit.
model_fit = model_seq.fit(X_train, Y_train, batch_size=1000, epochs=100, verbose=1,
                     validation_data=(X_vali, Y_vali))

In [None]:
# Persist the model. NOTE: this is the same path model_seq was loaded from,
# so the previously saved file is overwritten.
model_seq.save('kaggle_data/models/my_model_seq_large.h5')

In [None]:
# Test predictors: comma-separated text file with no header row.
X_test = pd.read_csv('kaggle_data/test_predictors.txt', sep=',', header=None)
X_test.head()

In [None]:
# Hard 0/1 predictions for the test set.
# Sequential.predict_classes is deprecated and absent from newer
# TensorFlow/Keras releases; for a single sigmoid output its documented
# replacement is thresholding predict() at 0.5, which is what is done here.
Y_pred_test = (model_seq.predict(X_test) > 0.5).astype('int32')

# Submission table: 1-based row index plus the predicted label.
# predict() returns an (n, 1) array, so it is flattened before being used
# as a single DataFrame column.
test_prediction = pd.DataFrame()
test_prediction['index'] = np.arange(len(Y_pred_test)).astype(int) + 1
test_prediction['label'] = Y_pred_test.ravel().astype(int)
test_prediction.head()

In [None]:
# Write the submission file; index=False leaves out the DataFrame's own
# row index, so only the 'index' and 'label' columns are written.
test_prediction.to_csv('kaggle_data/predictions_seq_6-20.txt', index=False)