diff --git a/Makefile.am b/Makefile.am
index 955720367..d6210fdc6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,6 +65,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
 sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
 sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
 sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h
+sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h
 
 bin_SCRIPTS = $(top_srcdir)/kernel/*.cl
 
diff --git a/algorithm.c b/algorithm.c
index 0a3219936..69994d5aa 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -29,6 +29,7 @@
 #include "algorithm/bitblock.h"
 #include "algorithm/x14.h"
 #include "algorithm/fresh.h"
+#include "algorithm/neoscrypt.h"
 
 #include "compat.h"
 
@@ -92,6 +93,17 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
   strcat(data->binary_filename, buf);
 }
 
+static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
+{
+  char buf[255];
+  sprintf(buf, " -D MAX_GLOBAL_THREADS=%u",
+    (unsigned int)cgpu->thread_concurrency);
+  strcat(data->compiler_options, buf);
+
+  sprintf(buf, "tc%u", (unsigned int)cgpu->thread_concurrency);
+  strcat(data->binary_filename, buf);
+}
+
 static void append_x11_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
 {
   char buf[255];
@@ -140,6 +152,30 @@ static cl_int queue_scrypt_kernel(struct __clState *clState, struct _dev_blk_ctx
   return status;
 }
 
+static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_uint le_target;
+  cl_int status = 0;
+
+  /* This looks like an unnecessary double cast, but to make sure that
+   * the target's most significant entry is addressed as a 32-bit value
+   * and not accidentally by something else, the double cast seems wise.
+   * The compiler will get rid of it anyway.
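+   * Here target[7] is the most significant 32-bit word of the 256-bit
+   * little-endian target, and the kernel compares word 7 of each
+   * candidate hash against le_target the same way.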
+ */ + le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); + memcpy(clState->cldata, blk->work->data, 80); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_ARG(le_target); + + return status; +} + static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel = &clState->kernel; @@ -597,6 +633,11 @@ static algorithm_settings_t algos[] = { A_SCRYPT( "zuikkis" ), #undef A_SCRYPT +#define A_NEOSCRYPT(a) \ + { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options} + A_NEOSCRYPT("neoscrypt"), +#undef A_NEOSCRYPT + // kernels starting from this will have difficulty calculated by using quarkcoin algorithm #define A_QUARK(a, b) \ { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } diff --git a/algorithm.h b/algorithm.h index 21499753f..f9e2af1e2 100644 --- a/algorithm.h +++ b/algorithm.h @@ -23,7 +23,8 @@ typedef enum { ALGO_TWE, ALGO_FUGUE, ALGO_NIST, - ALGO_FRESH + ALGO_FRESH, + ALGO_NEOSCRYPT } algorithm_type_t; extern const char *algorithm_type_str[]; diff --git a/algorithm/neoscrypt.c b/algorithm/neoscrypt.c new file mode 100644 index 000000000..cec6e5a23 --- /dev/null +++ b/algorithm/neoscrypt.c @@ -0,0 +1,1411 @@ +/* + * Copyright (c) 2009 Colin Percival, 2011 ArtForz + * Copyright (c) 2012 Andrew Moon (floodyberry) + * Copyright (c) 2012 Samuel Neves + * Copyright (c) 2014 John Doering + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdint.h>   /* fixed-width integer types */
+#include <stdlib.h>   /* malloc(), free() */
+#include <string.h>   /* memcpy(), memset() */
+
+#include "neoscrypt.h"
+
+#define SCRYPT_BLOCK_SIZE 64
+#define SCRYPT_HASH_BLOCK_SIZE 64
+#define SCRYPT_HASH_DIGEST_SIZE 32
+
+typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
+#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+
+#define U8TO32_BE(p) \
+  (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
+  ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3])))
+
+#define U32TO8_BE(p, v) \
+  (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
+  (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
+
+#define U64TO8_BE(p, v) \
+  U32TO8_BE((p), (uint32_t)((v) >> 32)); \
+  U32TO8_BE((p) + 4, (uint32_t)((v) ));
+
+#if (WINDOWS)
+/* sizeof(unsigned long) = 4 for MinGW64 */
+typedef unsigned long long ulong;
+#else
+typedef unsigned long ulong;
+#endif
+
+typedef unsigned int uint;
+typedef unsigned char uchar;
+typedef unsigned int ubool;
+
+
+//#define MIN(a, b) ((a) < (b) ? a : b)
+//#define MAX(a, b) ((a) > (b) ? a : b)
+
+
+#if (SHA256)
+
+/* SHA-256 */
+
+static const uint32_t sha256_constants[64] = {
+  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) (((x | y) & z) | (x & y))
+#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
+#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
+#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3))
+#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10))
+#define W0(in,i) (U8TO32_BE(&in[i * 4]))
+#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
+#define STEP(i) \
+  t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
+  t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \
+  r[7] = r[6]; \
+  r[6] = r[5]; \
+  r[5] = r[4]; \
+  r[4] = r[3] + t0; \
+  r[3] = r[2]; \
+  r[2] = r[1]; \
+  r[1] = r[0]; \
+  r[0] = t0 + t1;
+
+
+typedef struct sha256_hash_state_t {
+  uint32_t H[8];
+  uint64_t T;
+  uint32_t leftover;
+  uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} sha256_hash_state;
+
+
+static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks) {
+  uint32_t r[8], w[64], t0, t1;
+  size_t i;
+
+  for(i = 0; i < 8; i++)
+    r[i] = S->H[i];
+
+  while(blocks--) {
+    for(i = 0; i < 16; i++) {
+      w[i] = W0(in, i);
+    }
+    for(i = 16; i < 64; i++) {
+      w[i] = W1(i);
+    }
+    for(i = 0; i < 64; i++) {
+      STEP(i);
+    }
+    for(i = 0; i < 8; i++) {
+      r[i] += S->H[i];
+      S->H[i] = r[i];
+    }
+    S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
+    in += SCRYPT_HASH_BLOCK_SIZE;
+  }
+}
+
+static void neoscrypt_hash_init_sha256(sha256_hash_state *S) {
+  S->H[0] = 0x6a09e667;
+  S->H[1] = 0xbb67ae85;
+  S->H[2] = 0x3c6ef372;
+  S->H[3] = 0xa54ff53a;
+  S->H[4] = 0x510e527f;
+  S->H[5] = 0x9b05688c;
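+  /* H[6] and H[7] below complete the standard SHA-256 initial state
+   * defined in FIPS 180-4: the first 32 bits of the fractional parts
+   * of the square roots of the first eight primes. */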
S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if(S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if(blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if(S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) { + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if(S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) { + sha256_hash_state st; + neoscrypt_hash_init_sha256(&st); + neoscrypt_hash_update_sha256(&st, m, mlen); + neoscrypt_hash_finish_sha256(&st, hash); +} + + +/* HMAC for SHA-256 */ + +typedef struct sha256_hmac_state_t { + sha256_hash_state inner, outer; +} sha256_hmac_state; + +static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_sha256(&st->inner); + neoscrypt_hash_init_sha256(&st->outer); + + if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_sha256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) 
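+   * HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m)); the inner state
+   * was seeded with (K ^ ipad) at init time, so an update simply
+   * absorbs more of m.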
*/ + neoscrypt_hash_update_sha256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) { + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_sha256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_sha256(&st->outer, mac); +} + + +/* PBKDF2 for SHA-256 */ + +static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) { + sha256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_sha256(&work, be, 4); + neoscrypt_hmac_finish_sha256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_sha256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + +#endif + + +#if (BLAKE256) + +/* BLAKE-256 */ + +const uint8_t blake256_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint32_t blake256_constants[16] = { + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, + 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 +}; + +typedef struct blake256_hash_state_t { + uint32_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} blake256_hash_state; + +static void blake256_blocks(blake256_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); + uint32_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for(i = 0; i < 8; i++) + h[i] = S->H[i]; + for(i = 0; i < 2; i++) + t[i] = S->T[i]; + + while(blocks--) { + t[0] += 512; + t[1] += (t[0] < 512) ? 
1 : 0; + + for(i = 0; i < 8; i++) + v[i] = h[i]; + for(i = 0; i < 4; i++) + v[i + 8] = blake256_constants[i]; + for(i = 0; i < 2; i++) + v[i + 12] = blake256_constants[i+4] ^ t[0]; + for(i = 0; i < 2; i++) + v[i + 14] = blake256_constants[i+6] ^ t[1]; + + for(i = 0; i < 16; i++) + m[i] = U8TO32_BE(&in[i * 4]); + + in += 64; + +#define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c],12); \ + v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); + + for(i = 0, sigma = blake256_sigma; i < 14; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if(sigma == sigma_end) + sigma = blake256_sigma; + } + +#undef G + + for(i = 0; i < 8; i++) + h[i] ^= (v[i] ^ v[i + 8]); + } + + for(i = 0; i < 8; i++) + S->H[i] = h[i]; + for(i = 0; i < 2; i++) + S->T[i] = t[i]; +} + +static void neoscrypt_hash_init_blake256(blake256_hash_state *S) { + S->H[0] = 0x6a09e667ULL; + S->H[1] = 0xbb67ae85ULL; + S->H[2] = 0x3c6ef372ULL; + S->H[3] = 0xa54ff53aULL; + S->H[4] = 0x510e527fULL; + S->H[5] = 0x9b05688cULL; + S->H[6] = 0x1f83d9abULL; + S->H[7] = 0x5be0cd19ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void neoscrypt_hash_update_blake256(blake256_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if(S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if(blocks) { + blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if(S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_blake256(blake256_hash_state *S, uint8_t *hash) { + uint32_t th, tl, bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if(S->leftover == 0) { + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + } else if(S->T[0] == 0) { + S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (512 - bits); + } + + S->buffer[S->leftover] = 0x80; + if(S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + blake256_blocks(S, S->buffer, 1); + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + memset(S->buffer, 0, 56); + } + S->buffer[55] |= 1; + U32TO8_BE(S->buffer + 56, th); + U32TO8_BE(S->buffer + 60, tl); + blake256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_blake256(hash_digest hash, const uint8_t *m, size_t mlen) { + blake256_hash_state st; + 
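+  /* One-shot convenience wrapper: initialise, absorb the whole message,
+   * finalise. (BLAKE-256 starts from the same initial values as SHA-256.) */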
neoscrypt_hash_init_blake256(&st); + neoscrypt_hash_update_blake256(&st, m, mlen); + neoscrypt_hash_finish_blake256(&st, hash); +} + + +/* HMAC for BLAKE-256 */ + +typedef struct blake256_hmac_state_t { + blake256_hash_state inner, outer; +} blake256_hmac_state; + +static void neoscrypt_hmac_init_blake256(blake256_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_blake256(&st->inner); + neoscrypt_hash_init_blake256(&st->outer); + + if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_blake256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_blake256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_blake256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_blake256(blake256_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) */ + neoscrypt_hash_update_blake256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_blake256(blake256_hmac_state *st, hash_digest mac) { + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_blake256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_blake256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_blake256(&st->outer, mac); +} + + +/* PBKDF2 for BLAKE-256 */ + +static void neoscrypt_pbkdf2_blake256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) { + blake256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_blake256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_blake256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_blake256(&work, be, 4); + neoscrypt_hmac_finish_blake256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_blake256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_blake256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + +#endif + + +/* NeoScrypt */ + +#if defined(ASM) + +extern void neoscrypt_salsa(uint *X, uint rounds); +extern void neoscrypt_salsa_tangle(uint *X, uint count); +extern void neoscrypt_chacha(uint *X, uint rounds); + +extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len); +extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len); +extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len); + +#else + +/* Salsa20, rounds must be a multiple of 2 */ +static void neoscrypt_salsa(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a, b, c, d) \ + t = a + d; t = ROTL32(t, 7); b ^= t; \ + t = b + a; t = ROTL32(t, 9); c ^= t; \ + t = c + b; t = ROTL32(t, 13); d ^= t; \ + t = d + c; t = ROTL32(t, 18); a ^= t; + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x5, x9, x13, x1); + quarter(x10, x14, x2, x6); + quarter(x15, x3, x7, x11); + quarter( x0, x1, x2, x3); + quarter( x5, x6, x7, x4); + quarter(x10, x11, x8, x9); + quarter(x15, x12, x13, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + +/* ChaCha20, rounds must be a multiple of 2 */ +static void neoscrypt_chacha(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a,b,c,d) \ + a += b; t = d ^ a; d = ROTL32(t, 16); \ + c += d; t = b ^ c; b = ROTL32(t, 12); \ + a += b; t = d ^ a; d = ROTL32(t, 8); \ + c += d; t = b ^ c; b = ROTL32(t, 7); + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x1, x5, x9, x13); + quarter( x2, x6, x10, x14); + quarter( x3, x7, x11, x15); + quarter( x0, x5, x10, x15); + quarter( x1, x6, x11, x12); + quarter( x2, x7, x8, x13); + quarter( x3, x4, x9, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + + +/* Fast 32-bit / 64-bit memcpy(); + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } +} + +/* Fast 32-bit / 64-bit block swapper; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { + ulong *blkA = (ulong *) blkAp; + ulong *blkB = (ulong *) blkBp; + register ulong t0, t1, t2, t3; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + t0 = blkA[i]; + t1 = blkA[i + 1]; + t2 = blkA[i + 2]; + t3 = blkA[i + 3]; + blkA[i] = blkB[i]; + blkA[i + 1] = blkB[i + 1]; + 
blkA[i + 2] = blkB[i + 2]; + blkA[i + 3] = blkB[i + 3]; + blkB[i] = t0; + blkB[i + 1] = t1; + blkB[i + 2] = t2; + blkB[i + 3] = t3; + } +} + +/* Fast 32-bit / 64-bit block XOR engine; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } +} + +#endif + +/* 32-bit / 64-bit optimised memcpy() */ +static void neoscrypt_copy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] = srcb[i]; + } +} + +/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ +static void neoscrypt_erase(void *dstp, uint len) { + const ulong null = 0; + ulong *dst = (ulong *) dstp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = null; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + + for(i = len - tail; i < len; i++) + dstb[i] = (uchar)null; + } +} + +/* 32-bit / 64-bit optimised XOR engine */ +static void neoscrypt_xor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] ^= src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] ^= srcb[i]; + } +} + + +/* BLAKE2s */ + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +/* Parameter block of 32 bytes */ +typedef struct blake2s_param_t { + uchar digest_length; + uchar key_length; + uchar fanout; + uchar depth; + uint leaf_length; + uchar node_offset[6]; + uchar node_depth; + uchar inner_length; + uchar salt[8]; + uchar personal[8]; +} blake2s_param; + +/* State block of 180 bytes */ +typedef struct blake2s_state_t { + uint h[8]; + uint t[2]; + uint f[2]; + uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uint buflen; +} blake2s_state; + +static const uint blake2s_IV[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static void blake2s_compress(blake2s_state *S, const uint *buf) { + uint i; + uint m[16]; + uint v[16]; + + neoscrypt_copy(m, buf, 64); + neoscrypt_copy(v, S, 32); + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ 
blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ + G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ + G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ + G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ + G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ + } while(0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for(i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND +} + +static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) { + uint left, fill; + + while(input_size > 0) { + left = S->buflen; + fill = 2 * BLAKE2S_BLOCK_SIZE - left; + if(input_size > fill) { + /* Buffer fill */ + neoscrypt_copy(S->buf + left, input, fill); + S->buflen += fill; + /* Counter increment */ + S->t[0] += BLAKE2S_BLOCK_SIZE; + /* Compress */ + blake2s_compress(S, (uint *) S->buf); + /* Shift buffer left */ + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); + S->buflen -= BLAKE2S_BLOCK_SIZE; + input += fill; + input_size -= fill; + } else { + neoscrypt_copy(S->buf + left, input, input_size); + S->buflen += input_size; + /* Do not compress */ + input += input_size; + input_size = 0; + } + } +} + +static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size, + void *output, const uchar output_size) { + uchar block[BLAKE2S_BLOCK_SIZE]; + blake2s_param P[1]; + blake2s_state S[1]; + + /* Initialise */ + neoscrypt_erase(P, 32); + P->digest_length = output_size; + P->key_length = key_size; + P->fanout = 1; + P->depth = 1; + + neoscrypt_erase(S, 180); + neoscrypt_copy(S, blake2s_IV, 32); + neoscrypt_xor(S, P, 32); + + neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_copy(block, key, key_size); + blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); + + /* Update */ + blake2s_update(S, (uchar *) input, input_size); + + /* Finish */ + if(S->buflen > BLAKE2S_BLOCK_SIZE) { + S->t[0] += BLAKE2S_BLOCK_SIZE; + blake2s_compress(S, (uint *) S->buf); + S->buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); + } + S->t[0] += S->buflen; + S->f[0] = ~0U; + neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); + blake2s_compress(S, (uint *) S->buf); + /* Write back */ + neoscrypt_copy(output, S, output_size); +} + + +#define FASTKDF_BUFFER_SIZE 256U + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +static void neoscrypt_fastkdf(const uchar *password, uint password_len, + const uchar *salt, uint salt_len, + uint N, uchar *output, uint output_len) { + const uint stack_align = 0x40, kdf_buf_size = FASTKDF_BUFFER_SIZE, + prf_input_size = BLAKE2S_BLOCK_SIZE, prf_key_size = BLAKE2S_KEY_SIZE, + prf_output_size = BLAKE2S_OUT_SIZE; + uint bufptr, a, b, i, j; + uchar *A, *B, *prf_input, *prf_key, *prf_output; + + /* Align and set up the 
buffers in stack */ + uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align]; + A = &stack[stack_align & ~(stack_align - 1)]; + B = &A[kdf_buf_size + prf_input_size]; + prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; + + /* Initialise the password buffer */ + if(password_len > kdf_buf_size) + password_len = kdf_buf_size; + + a = kdf_buf_size / password_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&A[i * password_len], &password[0], password_len); + b = kdf_buf_size - a * password_len; + if(b) + neoscrypt_copy(&A[a * password_len], &password[0], b); + neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); + + /* Initialise the salt buffer */ + if(salt_len > kdf_buf_size) + salt_len = kdf_buf_size; + + a = kdf_buf_size / salt_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); + b = kdf_buf_size - a * salt_len; + if(b) + neoscrypt_copy(&B[a * salt_len], &salt[0], b); + neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); + + /* The primary iteration */ + for(i = 0, bufptr = 0; i < N; i++) { + + /* Map the PRF input buffer */ + prf_input = &A[bufptr]; + + /* Map the PRF key buffer */ + prf_key = &B[bufptr]; + + /* PRF */ + neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); + + /* Calculate the next buffer pointer */ + for(j = 0, bufptr = 0; j < prf_output_size; j++) + bufptr += prf_output[j]; + bufptr &= (kdf_buf_size - 1); + /* Modify the salt buffer */ + neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); + + /* Head modified, tail updated */ + if(bufptr < prf_key_size) + neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); + + /* Tail modified, head updated */ + if((kdf_buf_size - bufptr) < prf_output_size) + neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); + } + + /* Modify and copy into the output buffer */ + if(output_len > kdf_buf_size) + output_len = kdf_buf_size; + + a = kdf_buf_size - bufptr; + if(a >= output_len) { + neoscrypt_xor(&B[bufptr], &A[0], output_len); + neoscrypt_copy(&output[0], &B[bufptr], output_len); + } else { + neoscrypt_xor(&B[bufptr], &A[0], a); + neoscrypt_xor(&B[0], &A[a], output_len - a); + neoscrypt_copy(&output[0], &B[bufptr], a); + neoscrypt_copy(&output[a], &B[0], output_len - a); + } + +} + + +/* Configurable optimised block mixer */ +static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { + uint i, mixer, rounds; + + mixer = mixmode >> 8; + rounds = mixmode & 0xFF; + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + if(r == 1) { + neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + return; + } + + if(r == 2) { + neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + neoscrypt_blkxor(&X[32], &X[16], 
SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[32], rounds); + else + neoscrypt_salsa(&X[32], rounds); + neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[48], rounds); + else + neoscrypt_salsa(&X[48], rounds); + neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + return; + } + + /* Reference code for any reasonable r */ + for(i = 0; i < 2 * r; i++) { + if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); + else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16 * i], rounds); + else + neoscrypt_salsa(&X[16 * i], rounds); + neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); + } + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE); + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE); +} + +/* NeoScrypt core engine: + * p = 1, salt = password; + * Basic customisation (required): + * profile bit 0: + * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; + * 1 = Scrypt(1024, 1, 1) with Salsa20/8; + * profile bits 4 to 1: + * 0000 = FastKDF-BLAKE2s; + * 0001 = PBKDF2-HMAC-SHA256; + * 0010 = PBKDF2-HMAC-BLAKE256; + * Extended customisation (optional): + * profile bit 31: + * 0 = extended customisation absent; + * 1 = extended customisation present; + * profile bits 7 to 5 (rfactor): + * 000 = r of 1; + * 001 = r of 2; + * 010 = r of 4; + * ... + * 111 = r of 128; + * profile bits 12 to 8 (Nfactor): + * 00000 = N of 2; + * 00001 = N of 4; + * 00010 = N of 8; + * ..... + * 00110 = N of 128; + * ..... + * 01001 = N of 1024; + * ..... + * 11110 = N of 2147483648; + * profile bits 30 to 13 are reserved */ +void neoscrypt(const uchar *password, uchar *output, uint profile) { + uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40; + uint kdf, i, j; + uint *X, *Y, *Z, *V; + + if(profile & 0x1) { + N = 1024; /* N = (1 << (Nfactor + 1)); */ + r = 1; /* r = (1 << rfactor); */ + dblmix = 0; /* Salsa only */ + mixmode = 0x08; /* 8 rounds */ + } + + if(profile >> 31) { + N = (1 << (((profile >> 8) & 0x1F) + 1)); + r = (1 << ((profile >> 5) & 0x7)); + } + + uchar *stack; + stack =(uchar*)malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align); + + /* X = r * 2 * SCRYPT_BLOCK_SIZE */ + X = (uint *) &stack[stack_align & ~(stack_align - 1)]; + /* Z is a copy of X for ChaCha */ + Z = &X[32 * r]; + /* Y is an X sized temporal space */ + Y = &X[64 * r]; + /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + V = &X[96 * r]; + + /* X = KDF(password, salt) */ + kdf = (profile >> 1) & 0xF; + + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, password, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + +#if (SHA256) + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; +#endif + +#if (BLAKE256) + case(0x2): + neoscrypt_pbkdf2_blake256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; +#endif + + } + + /* Process ChaCha 1st, Salsa 2nd and XOR them into PBKDF2; otherwise Salsa only */ + + if(dblmix) { + /* blkcpy(Z, X) */ + neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* Z = SMix(Z) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, Z) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + for(i = 0; i < N; i++) { + /* integerify(Z) mod N */ + j = (32 * 
r) * (Z[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(Z, V) */ + neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + } + +#if (ASM) + /* Must be called before and after SSE2 Salsa */ + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + /* X = SMix(X) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, X) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + for(i = 0; i < N; i++) { + /* integerify(X) mod N */ + j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(X, V) */ + neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + +#if (ASM) + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + if(dblmix) + /* blkxor(X, Z) */ + neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* output = KDF(password, X) */ + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); + break; + +#if (SHA256) + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; +#endif + +#if (BLAKE256) + case(0x2): + neoscrypt_pbkdf2_blake256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; +#endif + + } + + free(stack); +} + +void neoscrypt_regenhash(struct work *work) +{ + neoscrypt(work->data, work->hash, 0x80000620); +} + +#if (NEOSCRYPT_TEST) + +#include + +int main() { + uint prf_input_len = 64, prf_key_len = 32, prf_output_len = 32; + uint kdf_input_len = 80, kdf_output_len = 256, N = 32; + uint neoscrypt_output_len = 32; + uchar input[kdf_input_len], output[kdf_output_len]; + uint i; + ubool fail; + + for(i = 0; i < kdf_input_len; i++) { + input[i] = i; + } + + neoscrypt_blake2s(input, prf_input_len, input, prf_key_len, output, prf_output_len); + + uchar blake2s_ref[32] = { + 0x89, 0x75, 0xB0, 0x57, 0x7F, 0xD3, 0x55, 0x66, + 0xD7, 0x50, 0xB3, 0x62, 0xB0, 0x89, 0x7A, 0x26, + 0xC3, 0x99, 0x13, 0x6D, 0xF0, 0x7B, 0xAB, 0xAB, + 0xBD, 0xE6, 0x20, 0x3F, 0xF2, 0x95, 0x4E, 0xD4 }; + + for(i = 0, fail = 0; i < prf_output_len; i++) { + if(output[i] != blake2s_ref[i]) { + fail = 1; + break; + } + } + + if(fail) { + printf("BLAKE2s integrity test failed!\n"); + return(1); + } else { + printf("BLAKE2s integrity test passed.\n"); + } + + neoscrypt_fastkdf(input, kdf_input_len, input, kdf_input_len, N, output, kdf_output_len); + + uchar fastkdf_ref[256] = { + 0xCC, 0xBC, 0x19, 0x71, 0xEC, 0x44, 0xE3, 0x17, + 0xB3, 0xC9, 0xDE, 0x16, 0x76, 0x02, 0x60, 0xB8, + 0xE2, 0xD4, 0x79, 0xB6, 0x88, 0xCA, 0xB5, 0x4A, + 0xCF, 0x6E, 0x0E, 0x9A, 0xAE, 0x48, 0x78, 0x12, + 0xA1, 0x95, 0x1E, 0xE1, 0xD1, 0x0A, 0xC2, 0x94, + 0x1F, 0x0A, 0x39, 0x73, 0xFE, 0xA4, 0xCD, 0x87, + 0x4B, 0x38, 0x54, 0x72, 0xB5, 0x53, 0xC3, 0xEA, + 0xC1, 0x26, 0x8D, 0xA7, 0xFF, 0x3F, 0xC1, 0x79, + 0xA6, 0xFF, 0x96, 0x54, 0x29, 0x05, 0xC0, 0x22, + 0x90, 0xDB, 0x53, 0x87, 0x2D, 0x29, 0x00, 0xA6, + 0x14, 0x16, 0x38, 0x63, 0xDA, 0xBC, 0x0E, 0x99, + 0x68, 0xB3, 0x98, 0x92, 0x42, 0xE3, 0xF6, 0xB4, + 0x19, 0xE3, 0xE3, 0xF6, 0x8E, 0x67, 0x47, 0x7B, + 0xB6, 0xFB, 0xEA, 0xCE, 0x6D, 0x0F, 0xAF, 0xF6, + 0x19, 0x43, 0x8D, 0xF7, 0x3E, 0xB5, 0xFB, 0xA3, + 0x64, 0x5E, 0xD2, 0x72, 0x80, 0x6B, 0x39, 0x93, + 0xB7, 0x80, 0x04, 0xCB, 0xF5, 0xC2, 0x61, 0xB1, + 0x90, 0x4E, 0x2B, 0x02, 0x57, 0x53, 0x77, 0x16, + 0x6A, 0x52, 0xBD, 0xD1, 0x62, 0xEC, 0xA1, 0xCB, + 0x89, 0x03, 
0x29, 0xA2, 0x02, 0x5C, 0x9A, 0x62, + 0x99, 0x44, 0x54, 0xEA, 0x44, 0x91, 0x27, 0x3A, + 0x50, 0x82, 0x62, 0x03, 0x99, 0xB3, 0xFA, 0xF7, + 0xD4, 0x13, 0x47, 0x61, 0xFB, 0x0A, 0xE7, 0x81, + 0x61, 0x57, 0x58, 0x4C, 0x69, 0x4E, 0x67, 0x0A, + 0xC1, 0x21, 0xA7, 0xD2, 0xF6, 0x6D, 0x2F, 0x10, + 0x01, 0xFB, 0xA5, 0x47, 0x2C, 0xE5, 0x15, 0xD7, + 0x6A, 0xEF, 0xC9, 0xE2, 0xC2, 0x88, 0xA2, 0x3B, + 0x6C, 0x8D, 0xBB, 0x26, 0xE7, 0xC4, 0x15, 0xEC, + 0x5E, 0x5D, 0x74, 0x79, 0xBD, 0x81, 0x35, 0xA1, + 0x42, 0x27, 0xEB, 0x57, 0xCF, 0xF6, 0x2E, 0x51, + 0x90, 0xFD, 0xD9, 0xE4, 0x53, 0x6E, 0x12, 0xA1, + 0x99, 0x79, 0x4D, 0x29, 0x6F, 0x5B, 0x4D, 0x9A }; + + for(i = 0, fail = 0; i < kdf_output_len; i++) { + if(output[i] != fastkdf_ref[i]) { + fail = 1; + break; + } + } + + if(fail) { + printf("FastKDF integrity test failed!\n"); + return(1); + } else { + printf("FastKDF integrity test passed.\n"); + } + + neoscrypt(input, output, 0x80000620); + + uchar neoscrypt_ref[32] = { + 0x72, 0x58, 0x96, 0x1A, 0xFB, 0x33, 0xFD, 0x12, + 0xD0, 0x0C, 0xAC, 0xB8, 0xD6, 0x3F, 0x4F, 0x4F, + 0x52, 0xBB, 0x69, 0x17, 0x04, 0x38, 0x65, 0xDD, + 0x24, 0xA0, 0x8F, 0x57, 0x88, 0x53, 0x12, 0x2D }; + + for(i = 0, fail = 0; i < neoscrypt_output_len; i++) { + if(output[i] != neoscrypt_ref[i]) { + fail = 1; + break; + } + } + + if(fail) { + printf("NeoScrypt integrity test failed!\n"); + return(1); + } else { + printf("NeoScrypt integrity test passed.\n"); + } + + return(0); +} + +#endif \ No newline at end of file diff --git a/algorithm/neoscrypt.h b/algorithm/neoscrypt.h new file mode 100644 index 000000000..5337dac8a --- /dev/null +++ b/algorithm/neoscrypt.h @@ -0,0 +1,13 @@ +#ifndef NEOSCRYPT_H +#define NEOSCRYPT_H + +#include "miner.h" + +/* The neoscrypt scratch buffer needs 32kBytes memory. */ +#define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024) + +/* These routines are always available. */ +extern void neoscrypt_regenhash(struct work *work); +extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); + +#endif /* NEOSCRYPT_H */ \ No newline at end of file diff --git a/kernel/neoscrypt.cl b/kernel/neoscrypt.cl new file mode 100644 index 000000000..7939d7ed2 --- /dev/null +++ b/kernel/neoscrypt.cl @@ -0,0 +1,525 @@ +/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */ +/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */ + +// Stupid AMD compiler ignores the unroll pragma in these two +#define SALSA_SMALL_UNROLL 3 +#define CHACHA_SMALL_UNROLL 3 + +// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted +// as the unroll factor; must divide cleanly into ten. +// Usually a bad idea. 
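+//
+// For reference: the host passes profile 0x80000620 to the C code, which
+// decodes as bit 31 set (extended customisation), Nfactor 6 -> N = 128,
+// rfactor 1 -> r = 2, KDF 0 -> FastKDF-BLAKE2s, and bit 0 clear ->
+// NeoScrypt mixing (Salsa20/20 and ChaCha20/20); this kernel hard-codes
+// the same parameters as CONSTANT_N 128 and CONSTANT_r 2.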
+//#define SMALL_BLAKE2S +//#define BLAKE2S_UNROLL 5 + +#define BLOCK_SIZE 64U +#define FASTKDF_BUFFER_SIZE 256U +#ifndef PASSWORD_LEN +#define PASSWORD_LEN 80U +#endif + +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +// Swaps 128 bytes at a time without using temp vars +void SwapBytes128(void *restrict A, void *restrict B, uint len) +{ + #pragma unroll 2 + for(int i = 0; i < (len >> 7); ++i) + { + ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i]; + ((ulong16 *)B)[i] ^= ((ulong16 *)A)[i]; + ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i]; + } +} + +void CopyBytes128(void *restrict dst, const void *restrict src, uint len) +{ + #pragma unroll 2 + for(int i = 0; i < len; ++i) + ((ulong16 *)dst)[i] = ((ulong16 *)src)[i]; +} + +void CopyBytes(void *restrict dst, const void *restrict src, uint len) +{ + for(int i = 0; i < len; ++i) + ((uchar *)dst)[i] = ((uchar *)src)[i]; +} + +void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len) +{ + for(int i = 0; i < len; ++i) + ((uchar *)dst)[i] ^= ((uchar *)src)[i]; +} + +void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len) +{ + #pragma unroll 1 + for(int i = 0; i < len; ++i) + ((uchar *)dst)[i] = ((uchar *)src1)[i] ^ ((uchar *)src2)[i]; +} + +// Blake2S + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +static const __constant uint BLAKE2S_IV[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const __constant uchar BLAKE2S_SIGMA[10][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) do { \ + a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + +void Blake2S(uint *restrict inout, const uint *restrict inkey) +{ + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #ifdef SMALL_BLAKE2S + #pragma unroll BLAKE2S_UNROLL + #else + #pragma unroll + #endif + for(int x = 0; x < 10; ++x) + { + BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); + BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey); + BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey); + BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey); + BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey); + BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey); + BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey); + BLAKE_G(x, 0x0E, 
V.s3, V.s4, V.s9, V.se, inkey);
+    }
+
+    // XOR low part of state with the high part,
+    // then with the original input block.
+    V.lo ^= V.hi ^ tmpblock;
+
+    // Load constants (IV into V.hi)
+    V.hi = vload8(0U, BLAKE2S_IV);
+
+    // Copy input block for later
+    tmpblock = V.lo;
+
+    // XOR length of message into block again
+    V.sc ^= BLAKE2S_BLOCK_SIZE << 1;
+
+    // Last block compression - XOR final constant into state
+    V.se ^= 0xFFFFFFFFU;
+
+    // Compress block, using the input as the key
+    #ifdef SMALL_BLAKE2S
+    #pragma unroll BLAKE2S_UNROLL
+    #else
+    #pragma unroll
+    #endif
+    for(int x = 0; x < 10; ++x)
+    {
+        BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout);
+        BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout);
+        BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout);
+        BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout);
+        BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout);
+        BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout);
+        BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout);
+        BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);
+    }
+
+    // XOR low part of state with high part, then with input block
+    V.lo ^= V.hi ^ tmpblock;
+
+    // Store result in input/output buffer
+    vstore8(V.lo, 0, inout);
+}
+
+/* FastKDF, a fast buffered key derivation function:
+ * FASTKDF_BUFFER_SIZE must be a power of 2;
+ * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
+ * prf_output_size must be <= prf_key_size; */
+void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len)
+{
+
+    /* WARNING!
+     * This algorithm uses byte-wise addressing for memory blocks.
+     * In other words, copying an unaligned memory region will slow the
+     * algorithm down significantly when the copy uses words or bigger
+     * entities, and it may even corrupt the data if the device does not
+     * support such accesses properly.
+     * Therefore use byte copying, which is not the fastest but at least
+     * gives reliable results. */
+
+    // BLOCK_SIZE 64U
+    // FASTKDF_BUFFER_SIZE 256U
+    // BLAKE2S_BLOCK_SIZE 64U
+    // BLAKE2S_KEY_SIZE 32U
+    // BLAKE2S_OUT_SIZE 32U
+    uchar bufidx = 0;
+    uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };
+    uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;
+
+    // Initialize the password buffer
+    #pragma unroll 1
+    for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10];
+
+    ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0];
+
+    // Initialize the salt buffer
+    if(salt_len == FASTKDF_BUFFER_SIZE)
+    {
+        ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0];
+        ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1];
+    }
+    else
+    {
+        // salt_len is 80 bytes here
+        #pragma unroll 1
+        for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)salt)[i % 10];
+
+        // Initialized the rest to zero earlier
+        #pragma unroll 1
+        for(int i = 0; i < 10; ++i) ((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong *)salt)[i];
+    }
+
+    // The primary iteration
+    #pragma unroll 1
+    for(int i = 0; i < 32; ++i)
+    {
+        // Make the key buffer twice the size of the key so it fits a Blake2S block
+        // This way, we don't need a temp buffer in the Blake2S function.
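+        // 32 rounds of the BLAKE2s PRF here matches the N = 32 that the
+        // reference neoscrypt_fastkdf() is called with on the host side.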
+        uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 };
+
+        // Copy input and key to their buffers
+        CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
+        CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);
+
+        // PRF
+        Blake2S((uint *)input, (uint *)key);
+
+        // Calculate the next buffer pointer
+        bufidx = 0;
+
+        for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
+            bufidx += input[x];
+
+        // bufidx is a uchar now, so it wraps mod 256 by itself
+        //bufidx &= (FASTKDF_BUFFER_SIZE - 1);
+
+        // Modify the salt buffer
+        XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);
+
+        if(bufidx < BLAKE2S_KEY_SIZE)
+        {
+            // Head modified, tail updated
+            // Unlike the reference code, this copies the whole remaining tail:
+            //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));
+            CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
+        }
+        else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
+        {
+            // Tail modified, head updated
+            CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
+        }
+    }
+
+    // Modify and copy into the output buffer
+
+    // The straightforward byte loop below crashes the AMD compiler,
+    // so the copy is done with the explicit head/tail spans further down.
+
+    //for(uint i = 0; i < output_len; ++i, ++bufidx)
+    // output[i] = B[bufidx] ^ A[i];
+
+    uint left = FASTKDF_BUFFER_SIZE - bufidx;
+    //uint left = (~bufidx) + 1
+
+    if(left < output_len)
+    {
+        XORBytes(output, B + bufidx, A, left);
+        XORBytes(output + left, B, A + left, output_len - left);
+    }
+    else
+    {
+        XORBytes(output, B + bufidx, A, output_len);
+    }
+}
+
+#define SALSA_CORE(state) do { \
+    state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \
+    state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \
+    state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \
+    state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \
+    state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \
+    state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \
+    state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \
+    state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \
+} while(0)
+
+uint16 salsa_small_scalar_rnd(uint16 X)
+{
+    uint16 st = X;
+
+    #if SALSA_SMALL_UNROLL == 1
+
+    for(int i = 0; i < 10; ++i)
+    {
+        SALSA_CORE(st);
+    }
+
+    #elif SALSA_SMALL_UNROLL == 2
+
+    for(int i = 0; i < 5; ++i)
+    {
+        SALSA_CORE(st);
+        SALSA_CORE(st);
+    }
+
+    #elif SALSA_SMALL_UNROLL == 3
+
+    for(int i = 0; i < 4; ++i)
+    {
+        SALSA_CORE(st);
+        if(i == 3) break;
+        SALSA_CORE(st);
+        SALSA_CORE(st);
+    }
+
+    #elif SALSA_SMALL_UNROLL == 4
+
+    for(int i = 0; i < 3; ++i)
+    {
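+        // Iterations 0 and 1 run four double-rounds each; iteration 2
+        // breaks after two, giving Salsa20/20's total of ten.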
SALSA_CORE(st); + SALSA_CORE(st); + if(i == 2) break; + SALSA_CORE(st); + SALSA_CORE(st); + } + + #else + + for(int i = 0; i < 2; ++i) + { + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + } + + #endif + + return(X + st); +} + +#define CHACHA_CORE_PARALLEL(state) do { \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \ + \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \ +} while(0) + +uint16 chacha_small_parallel_rnd(uint16 X) +{ + uint4 t, st[4]; + + ((uint16 *)st)[0] = X; + + #if CHACHA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + CHACHA_CORE_PARALLEL(st); + } + + #elif CHACHA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #elif CHACHA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + CHACHA_CORE_PARALLEL(st); + if(i == 3) break; + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #elif CHACHA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + if(i == 2) break; + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #else + + for(int i = 0; i < 2; ++i) + { + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #endif + + return(X + ((uint16 *)st)[0]); +} + +void neoscrypt_blkmix(uint16 *XV, bool alg) +{ + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + XV[0] ^= XV[3]; + + if(!alg) + { + XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = salsa_small_scalar_rnd(XV[3]); + } + else + { + XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_parallel_rnd(XV[3]); + } + + XV[1] ^= XV[2]; + XV[2] ^= XV[1]; + XV[1] ^= XV[2]; +} + +void ScratchpadStore(__global void *V, void *X, uchar idx) +{ + ((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1]; +} + +void ScratchpadMix(void *X, const __global void *V, uchar idx) +{ + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1]; +} + +void SMix(uint16 *X, __global uint16 *V, bool flag) +{ + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + ScratchpadStore(V, X, i); + neoscrypt_blkmix(X, flag); + } + + #pragma 
+
+	#pragma unroll 1
+	for(int i = 0; i < 128; ++i)
+	{
+		const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);
+		ScratchpadMix(X, V, idx);
+		neoscrypt_blkmix(X, flag);
+	}
+}
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target)
+{
+#define CONSTANT_N 128
+#define CONSTANT_r 2
+	// X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
+	uint16 X[4], Z[4];
+	/* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
+	__global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+	uchar outbuf[32];
+	uchar data[PASSWORD_LEN];
+
+	((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
+	((ulong *)data)[8] = ((__global const ulong *)input)[8];
+	((uint *)data)[18] = ((__global const uint *)input)[18];
+	((uint *)data)[19] = get_global_id(0);
+
+	// X = KDF(password, salt)
+	fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);
+
+	// Mix one copy with Salsa and one with ChaCha, XOR them, and run the result through FastKDF
+	CopyBytes128(Z, X, 2);
+
+	// X = SMix(X); X & Z are swapped, repeat.
+	for(bool flag = false;; ++flag)
+	{
+		SMix(X, V, flag);
+		if(flag) break;
+		SwapBytes128(X, Z, 256);
+	}
+
+	// blkxor(X, Z)
+	((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
+	((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];
+
+	// output = KDF(password, X)
+	fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);
+	if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0);
+}
\ No newline at end of file
diff --git a/miner.h b/miner.h
index c8190bd57..0a366e7f9 100644
--- a/miner.h
+++ b/miner.h
@@ -1100,6 +1100,7 @@ extern pthread_cond_t restart_cond;
 extern void clear_stratum_shares(struct pool *pool);
 extern void clear_pool_work(struct pool *pool);
 extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2);
+extern void set_target_neoscrypt(unsigned char *target, double diff);
 
 extern void kill_work(void);
diff --git a/ocl.c b/ocl.c
index ea458143b..8be317a5c 100644
--- a/ocl.c
+++ b/ocl.c
@@ -34,6 +34,7 @@
 #include "ocl.h"
 #include "ocl/build_kernel.h"
 #include "ocl/binary_kernel.h"
+#include "algorithm/neoscrypt.h"
 
 /* FIXME: only here for global config vars, replace with configuration.h
  * or similar as soon as config is in a struct instead of littered all
@@ -344,19 +345,55 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
     cgpu->lookup_gap = 2;
   }
 
-  if (!cgpu->opt_tc) {
+  // neoscrypt calculates TC differently
+  if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) {
+    int max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
+    size_t glob_thread_count = 1UL << max_int;
+
+    // if TC is entered by the user, use that value... otherwise use the default
+    cgpu->thread_concurrency = ((cgpu->opt_tc) ? cgpu->opt_tc : ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count));
+
+    // if TC * scratchbuf size is too big for memory... reduce to the maximum
+    if (((uint64_t)cgpu->thread_concurrency * NEOSCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
+      /* The selected intensity will not run on this GPU - not enough memory.
+       * Adapt the memory setting. */
+      glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
+
+      /* Find the highest set bit in glob_thread_count; that gives
+       * the intensity. */
+      while (max_int && ((1U << max_int) & glob_thread_count) == 0) {
+        --max_int;
+      }
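+
+      /* Worked example (editorial sketch): with max_alloc = 2 GB and a
+       * 32 KiB scratchpad per thread, glob_thread_count becomes 65536,
+       * whose highest set bit is bit 16, so the intensity is clamped
+       * to 16 and thread_concurrency to 1 << 16 threads. */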
+
+      /* Do not drop below the supported minimum intensity. */
+      if (max_int < MIN_INTENSITY) {
+        applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
+        max_int = MIN_INTENSITY;
+      }
+
+      cgpu->intensity = max_int;
+      cgpu->thread_concurrency = 1U << max_int;
+    }
+
+    applog(LOG_DEBUG, "GPU %d: setting maximum global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
+
+  }
+  else if (!cgpu->opt_tc) {
     unsigned int sixtyfours;
 
     sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
     cgpu->thread_concurrency = sixtyfours * 64;
     if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
       cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
-      if (cgpu->thread_concurrency > cgpu->shaders * 5)
+      if (cgpu->thread_concurrency > cgpu->shaders * 5) {
         cgpu->thread_concurrency = cgpu->shaders * 5;
+      }
     }
     applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
-  } else
+  }
+  else {
     cgpu->thread_concurrency = cgpu->opt_tc;
+  }
 
   cl_uint slot, cpnd;
 
@@ -445,17 +482,36 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
   }
 
   size_t bufsize;
+  size_t readbufsize = 128;
 
   if (algorithm->rw_buffer_size < 0) {
-    size_t ipt = (algorithm->n / cgpu->lookup_gap +
-                  (algorithm->n % cgpu->lookup_gap > 0));
-    bufsize = 128 * ipt * cgpu->thread_concurrency;
-  } else
-    bufsize = (size_t) algorithm->rw_buffer_size;
+    // calc buffer size for neoscrypt
+    if (!safe_cmp(algorithm->name, "neoscrypt")) {
+      /* The scratch/pad buffer needs 32 KiB of memory per thread. */
+      bufsize = NEOSCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
+
+      /* This is the input buffer. For neoscrypt it is guaranteed to be
+       * 80 bytes only. */
+      readbufsize = 80;
+
+      applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+    // scrypt/n-scrypt
+    }
+    else {
+      size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
+      bufsize = 128 * ipt * cgpu->thread_concurrency;
+      applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+    }
+  }
+  else {
+    bufsize = (size_t)algorithm->rw_buffer_size;
+    applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+  }
 
   clState->padbuffer8 = NULL;
 
   if (bufsize > 0) {
+    applog(LOG_DEBUG, "Creating read/write buffer sized %lu", (unsigned long)bufsize);
     /* Use the max alloc value which has been rounded to a power of
      * 2 greater >= required amount earlier */
     if (bufsize > cgpu->max_alloc) {
       applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu",
              gpu, (unsigned long)(cgpu->max_alloc));
       applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
     }
-    applog(LOG_DEBUG, "Creating buffer sized %lu", (unsigned long)bufsize);
 
     /* This buffer is weird and might work to some degree even if
      * the create buffer call has apparently failed, so check if we
@@ -475,11 +530,14 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
     }
   }
 
-  clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
+  applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize);
+  clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, readbufsize, NULL, &status);
+
   if (status != CL_SUCCESS) {
     applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
     return NULL;
   }
+
+  applog(LOG_DEBUG, "Using output buffer sized %lu", (unsigned long)BUFFERSIZE);
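+  /* Editorial note (sketch): the neoscrypt kernel treats the last uint slot
+   * of this buffer, output[0xFF], as an atomic counter of found nonces
+   * (output[atomic_add(output + 0xFF, 1)] = gid), which assumes BUFFERSIZE
+   * covers 0x100 uint entries. */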
 clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
 if (status != CL_SUCCESS) {
diff --git a/sgminer.c b/sgminer.c
index 0ad3fe691..202de0b92 100644
--- a/sgminer.c
+++ b/sgminer.c
@@ -2019,11 +2019,26 @@ static void update_gbt(struct pool *pool)
 /* Return the work coin/network difficulty */
 static double get_work_blockdiff(const struct work *work)
 {
-  uint8_t pow = work->data[72];
-  int powdiff = (8 * (0x1d - 3)) - (8 * (pow - 3));
-  uint32_t diff32 = be32toh(*((uint32_t *)(work->data + 72))) & 0x00FFFFFF;
-  double numerator = work->pool->algorithm.diff_numerator << powdiff;
-  return numerator / (double)diff32;
+  uint64_t diff64;
+  double numerator;
+
+  // Neoscrypt has the data reversed
+  if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    diff64 = bswap_64(((uint64_t)(be32toh(*((uint32_t *)(work->data + 72))) & 0xFFFFFF00)) << 8);
+    numerator = (double)work->pool->algorithm.diff_numerator;
+  }
+  else {
+    uint8_t pow = work->data[72];
+    int powdiff = (8 * (0x1d - 3)) - (8 * (pow - 3));
+    diff64 = be32toh(*((uint32_t *)(work->data + 72))) & 0x0000000000FFFFFF;
+    numerator = work->pool->algorithm.diff_numerator << powdiff;
+  }
+
+  if (unlikely(!diff64)) {
+    diff64 = 1;
+  }
+
+  return numerator / (double)diff64;
 }
 
 static void gen_gbt_work(struct pool *pool, struct work *work)
@@ -2073,7 +2088,10 @@ static void gen_gbt_work(struct pool *pool, struct work *work)
     free(header);
   }
 
-  calc_midstate(work);
+  // Neoscrypt does not use a midstate
+  if (safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    calc_midstate(work);
+  }
   local_work++;
   work->pool = pool;
   work->gbt = true;
@@ -2189,10 +2207,15 @@ static bool getwork_decode(json_t *res_val, struct work *work)
     return false;
   }
 
-  if (!jobj_binary(res_val, "midstate", work->midstate, sizeof(work->midstate), false)) {
-    // Calculate it ourselves
-    applog(LOG_DEBUG, "%s: Calculating midstate locally", isnull(get_pool_name(work->pool), ""));
-    calc_midstate(work);
+  // Neoscrypt does not use a midstate
+  if (safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    if (!jobj_binary(res_val, "midstate", work->midstate, sizeof(work->midstate), false)) {
+      // Calculate it ourselves
+      if (opt_morenotices) {
+        applog(LOG_DEBUG, "%s: Calculating midstate locally", isnull(get_pool_name(work->pool), ""));
+      }
+      calc_midstate(work);
+    }
   }
 
   if (unlikely(!jobj_binary(res_val, "target", work->target, sizeof(work->target), true))) {
@@ -2936,8 +2959,8 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s
 
   endian_flip128(work->data, work->data);
 
-  /* build hex string */
-  hexstr = bin2hex(work->data, sizeof(work->data));
+  /* build hex string - make sure to restrict to 80 bytes for Neoscrypt */
+  hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data)));
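+  /* Editorial note: 80 bytes is the full block header (4-byte version,
+   * 32-byte prev hash, 32-byte merkle root, plus 4 bytes each of time,
+   * nbits and nonce); the remainder of work->data is local SHA-256 style
+   * padding that Neoscrypt pools do not expect. */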
 
   /* build JSON-RPC request */
   if (work->gbt) {
@@ -3304,11 +3327,19 @@ static void calc_diff(struct work *work, double known)
 
     d64 = work->pool->algorithm.diff_multiplier2 * truediffone;
 
-    dcut64 = le256todouble(work->target);
+    applog(LOG_DEBUG, "calc_diff() algorithm = %s", work->pool->algorithm.name);
+
+    // Neoscrypt
+    if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+      dcut64 = (double)*((uint64_t *)(work->target + 22));
+    }
+    else {
+      dcut64 = le256todouble(work->target);
+    }
     if (unlikely(!dcut64))
       dcut64 = 1;
     work->work_difficulty = d64 / dcut64;
   }
+
   difficulty = work->work_difficulty;
 
   pool_stats->last_diff = difficulty;
@@ -5465,8 +5496,21 @@ static void *stratum_sthread(void *userdata)
     sshare->sshare_time = time(NULL);
     /* This work item is freed in parse_stratum_response */
     sshare->work = work;
-    nonce = *((uint32_t *)(work->data + 76));
+
+    applog(LOG_DEBUG, "stratum_sthread() algorithm = %s", pool->algorithm.name);
+
+    // Neoscrypt keeps the nonce in little endian, so byte swap it for submission
+    if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+      nonce = htobe32(*((uint32_t *)(work->data + 76)));
+      //*((uint32_t *)nonce2) = htole32(work->nonce2);
+    }
+    else {
+      nonce = *((uint32_t *)(work->data + 76));
+    }
     __bin2hex(noncehex, (const unsigned char *)&nonce, 4);
+
+    *((uint64_t *)nonce2) = htole64(work->nonce2);
+    __bin2hex(nonce2hex, nonce2, work->nonce2_len);
 
     memset(s, 0, 1024);
 
     mutex_lock(&sshare_lock);
@@ -5474,10 +5518,6 @@ static void *stratum_sthread(void *userdata)
     sshare->id = swork_id++;
     mutex_unlock(&sshare_lock);
 
-    nonce2_64 = (uint64_t *)nonce2;
-    *nonce2_64 = htole64(work->nonce2);
-    __bin2hex(nonce2hex, nonce2, work->nonce2_len);
-
     snprintf(s, sizeof(s),
       "{\"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\": %d, \"method\": \"mining.submit\"}",
       pool->rpc_user, work->job_id, nonce2hex, work->ntime, noncehex, sshare->id);
@@ -5885,6 +5925,50 @@ void set_target(unsigned char *dest_target, double diff, double diff_multiplier2
   memcpy(dest_target, target, 32);
 }
 
+/*****************************************************
+* Special set_target() function for Neoscrypt
+****************************************************/
+void set_target_neoscrypt(unsigned char *target, double diff)
+{
+  uint64_t m;
+  int k;
+
+  diff /= 65536.0;
+  for (k = 6; k > 0 && diff > 1.0; --k) {
+    diff /= 4294967296.0;
+  }
+
+  m = 4294901760.0 / diff;
+
+  if (m == 0 && k == 6) {
+    memset(target, 0xff, 32);
+  }
+  else {
+    memset(target, 0, 32);
+    ((uint32_t *)target)[k] = (uint32_t)m;
+    ((uint32_t *)target)[k + 1] = (uint32_t)(m >> 32);
+  }
+
+  if (opt_debug) {
+    /* The target is computed in this system's endianness and stored
+     * in that endianness on a uint32 level. But because the target is
+     * eight uint32s, they are stored in mixed mode, i.e., each uint32
+     * is stored in the local endianness, while the least significant bit
+     * is stored in target[0] bit 0.
+     *
+     * To print this large number in a natively human readable form the
+     * order of the array entries is swapped, i.e., target[7] <-> target[0],
+     * and each array entry is byte swapped to put the least significant
+     * bit to the right. */
+    uint32_t swapped[8];
+    swab256(swapped, target);
+    char *htarget = bin2hex((unsigned char *)swapped, 32);
+
+    applog(LOG_DEBUG, "Generated neoscrypt target 0x%s", htarget);
+    free(htarget);
+  }
+}
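+
+/* Worked example (editorial sketch): for diff = 1.0 the initial division
+ * leaves diff = 1/65536, the loop body never runs (k stays 6), and
+ * m = 4294901760.0 * 65536 = 0x0000FFFF00000000, so target[7] = 0x0000FFFF
+ * and target[6] = 0 - the 32-bit word the kernel compares hashes against. */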
 
 /* Generates stratum based work based on the most recent notify information
  * from the pool. This will keep generating work while a pool is down so we use
  * other means to detect when the pool has died in stratum_thread */
 static void gen_stratum_work(struct pool *pool, struct work *work)
@@ -5893,12 +5977,12 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
   unsigned char merkle_root[32], merkle_sha[64];
   uint32_t *data32, *swap32;
   uint64_t nonce2le;
-  int i;
+  int i, j;
 
   cg_wlock(&pool->data_lock);
 
   /* Update coinbase. Always use an LE encoded nonce2 to fill in values
-   * from left to right and prevent overflow errors with small n2sizes */
+   * from left to right and prevent overflow errors with small n2sizes */
   nonce2le = htole64(pool->nonce2);
   memcpy(pool->coinbase + pool->nonce2_offset, &nonce2le, pool->n2size);
   work->nonce2 = pool->nonce2++;
@@ -5915,16 +5999,50 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
     gen_hash(merkle_sha, 64, merkle_root);
     memcpy(merkle_sha, merkle_root, 32);
   }
-  data32 = (uint32_t *)merkle_sha;
-  swap32 = (uint32_t *)merkle_root;
-  flip32(swap32, data32);
 
-  /* Copy the data template from header_bin */
-  memcpy(work->data, pool->header_bin, 128);
-  memcpy(work->data + pool->merkle_offset, merkle_root, 32);
+  applog(LOG_DEBUG, "gen_stratum_work() - algorithm = %s", pool->algorithm.name);
+
+  // Neoscrypt works on little endian data, so it gets its own path
+  if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    /* Incoming data is in little endian. */
+    memcpy(merkle_root, merkle_sha, 32);
+
+    uint32_t temp = pool->merkle_offset / sizeof(uint32_t), i;
+    /* Copy the version (4 bytes) and prev_hash (8 * 4 bytes) into the work,
+     * big endian encoded. */
+    for (i = 0; i < temp; ++i) {
+      ((uint32_t *)work->data)[i] = be32toh(((uint32_t *)pool->header_bin)[i]);
+    }
+
+    /* Now add the merkle_root (8 * 4 bytes); it stays little endian encoded. */
+    temp += 8;
+
+    for (j = 0; i < temp; ++i, ++j) {
+      ((uint32_t *)work->data)[i] = le32toh(((uint32_t *)merkle_root)[j]);
+    }
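+
+    /* Editorial sketch of the resulting 80-byte header as uint32 words
+     * (assuming the standard merkle_offset of 36 bytes, i.e. temp = 9):
+     *   [0]      version      (big endian)
+     *   [1..8]   prev_hash    (big endian)
+     *   [9..16]  merkle_root  (little endian)
+     *   [17]     ntime, [18] nbits (big endian, filled in below)
+     *   [19]     nonce        (set per thread by the kernel) */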
+
+    /* Add the time, big endian encoded. */
+    hex2bin((unsigned char *)&temp, pool->swork.ntime, 4);
+    ((uint32_t *)work->data)[17] = be32toh(temp);
+
+    /* Add the nbits, big endian encoded. */
+    hex2bin((unsigned char *)&temp, pool->swork.nbit, 4);
+    ((uint32_t *)work->data)[18] = be32toh(temp);
+
+    /* Conventional SHA-256 style padding: end-of-message bit and the
+     * 640-bit (80-byte) message length. */
+    ((uint32_t *)work->data)[20] = 0x80000000;
+    ((uint32_t *)work->data)[31] = 0x00000280;
+  }
+  else {
+    data32 = (uint32_t *)merkle_sha;
+    swap32 = (uint32_t *)merkle_root;
+    flip32(swap32, data32);
+
+    /* Copy the data template from header_bin */
+    memcpy(work->data, pool->header_bin, 128);
+    memcpy(work->data + pool->merkle_offset, merkle_root, 32);
+  }
 
   /* Store the stratum work diff to check it still matches the pool's
-   * stratum diff when submitting shares */
+   * stratum diff when submitting shares */
   work->sdiff = pool->swork.diff;
 
   /* Copy parameters required for share submission */
@@ -5941,13 +6059,19 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
     applog(LOG_DEBUG, "Generated stratum merkle %s", merkle_hash);
     applog(LOG_DEBUG, "Generated stratum header %s", header);
     applog(LOG_DEBUG, "Work job_id %s nonce2 %"PRIu64" ntime %s", work->job_id,
-      work->nonce2, work->ntime);
+      work->nonce2, work->ntime);
 
     free(header);
     free(merkle_hash);
   }
 
-  calc_midstate(work);
-  set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2);
+  // For Neoscrypt use the special set_target_neoscrypt() function
+  if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    set_target_neoscrypt(work->target, work->sdiff);
+  }
+  else {
+    calc_midstate(work);
+    set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2);
+  }
 
   local_work++;
   work->pool = pool;
@@ -6124,15 +6248,17 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
   unsigned int options = 0;
   const char *opt1, *opt2;
 
-  if(!pool1 || !pool2)
+  applog(LOG_DEBUG, "compare_pool_settings()");
+
+  if (!pool1 || !pool2)
     return 0;
 
   //compare pool devices
-  opt1 = get_pool_setting(pool1->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all"));
-  opt2 = get_pool_setting(pool2->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all"));
+  opt1 = get_pool_setting(pool1->devices, ((!empty_string(default_profile.devices)) ? default_profile.devices : "all"));
+  opt2 = get_pool_setting(pool2->devices, ((!empty_string(default_profile.devices)) ? default_profile.devices : "all"));
 
   //changing devices means a hard reset of mining threads
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
     options |= (SWITCHER_APPLY_DEVICE | SWITCHER_HARD_RESET);
 
   //compare gpu threads
@@ -6140,11 +6266,11 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
   opt1 = get_pool_setting(pool1->gpu_threads, default_profile.gpu_threads);
   opt2 = get_pool_setting(pool2->gpu_threads, default_profile.gpu_threads);
 
   //changing gpu threads means a hard reset of mining threads
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
     options |= (SWITCHER_APPLY_GT | SWITCHER_HARD_RESET);
 
   //compare algorithm
-  if(!cmp_algorithm(&pool1->algorithm, &pool2->algorithm))
+  if (!cmp_algorithm(&pool1->algorithm, &pool2->algorithm))
    options |= (SWITCHER_APPLY_ALGO | SWITCHER_SOFT_RESET);
 
   //lookup gap
@@ -6152,46 +6278,46 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
   opt1 = get_pool_setting(pool1->lookup_gap, default_profile.lookup_gap);
   opt2 = get_pool_setting(pool2->lookup_gap, default_profile.lookup_gap);
 
   //lookup gap means soft reset but only if hard reset isnt set
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
     options |= (SWITCHER_APPLY_LG | SWITCHER_SOFT_RESET);
 
   //intensities
   opt1 = get_pool_setting(pool1->rawintensity, default_profile.rawintensity);
   opt2 = get_pool_setting(pool2->rawintensity, default_profile.rawintensity);
 
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
   {
     //intensity is soft reset
-    if(!empty_string(opt2))
+    if (!empty_string(opt2))
      options |= (SWITCHER_APPLY_RAWINT | SWITCHER_SOFT_RESET);
   }
 
   //xintensity -- only if raw intensity not set
-  if(!opt_isset(options, SWITCHER_APPLY_RAWINT))
+  if (!opt_isset(options, SWITCHER_APPLY_RAWINT))
   {
     opt1 = get_pool_setting(pool1->xintensity, default_profile.xintensity);
     opt2 = get_pool_setting(pool2->xintensity, default_profile.xintensity);
 
     //if different...
-    if(strcasecmp(opt1, opt2) != 0)
+    if (strcasecmp(opt1, opt2) != 0)
     {
       //intensity is soft reset
-      if(!empty_string(opt2))
+      if (!empty_string(opt2))
        options |= (SWITCHER_APPLY_XINT | SWITCHER_SOFT_RESET);
     }
   }
 
   //intensity -- only if raw intensity and xintensity not set
-  if(!opt_isset(options, SWITCHER_APPLY_RAWINT) && !opt_isset(options, SWITCHER_APPLY_XINT))
+  if (!opt_isset(options, SWITCHER_APPLY_RAWINT) && !opt_isset(options, SWITCHER_APPLY_XINT))
   {
     opt1 = get_pool_setting(pool1->intensity, default_profile.intensity);
     opt2 = get_pool_setting(pool2->intensity, default_profile.intensity);
 
     //if different...
-    if(strcasecmp(opt1, opt2) != 0)
+    if (strcasecmp(opt1, opt2) != 0)
     {
       //intensity is soft reset
-      if(!empty_string(opt2))
+      if (!empty_string(opt2))
        options |= (SWITCHER_APPLY_INT | SWITCHER_SOFT_RESET);
       //if blank, set default profile to intensity 8 and apply
       else
@@ -6203,10 +6329,10 @@
   opt1 = get_pool_setting(pool1->shaders, default_profile.shaders);
   opt2 = get_pool_setting(pool2->shaders, default_profile.shaders);
 
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
   {
     //shaders is soft reset
-    if(!empty_string(opt2))
+    if (!empty_string(opt2))
      options |= (SWITCHER_APPLY_SHADER | SWITCHER_SOFT_RESET);
   }
 
   //thread concurrency
   opt1 = get_pool_setting(pool1->thread_concurrency, default_profile.thread_concurrency);
   opt2 = get_pool_setting(pool2->thread_concurrency, default_profile.thread_concurrency);
 
   //thread-concurrency is soft reset
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
    options |= (SWITCHER_APPLY_TC | SWITCHER_SOFT_RESET);
 
   //worksize
@@ -6223,45 +6349,45 @@
   opt1 = get_pool_setting(pool1->worksize, default_profile.worksize);
   opt2 = get_pool_setting(pool2->worksize, default_profile.worksize);
 
   //worksize is soft reset
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-    options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET);
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET);
 
- #ifdef HAVE_ADL
-  //gpu-engine
-  opt1 = get_pool_setting(pool1->gpu_engine, default_profile.gpu_engine);
-  opt2 = get_pool_setting(pool2->gpu_engine, default_profile.gpu_engine);
+#ifdef HAVE_ADL
+  //gpu-engine
+  opt1 = get_pool_setting(pool1->gpu_engine, default_profile.gpu_engine);
+  opt2 = get_pool_setting(pool2->gpu_engine, default_profile.gpu_engine);
 
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-    options |= SWITCHER_APPLY_GPU_ENGINE;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_ENGINE;
 
-  //gpu-memclock
-  opt1 = get_pool_setting(pool1->gpu_memclock, default_profile.gpu_memclock);
-  opt2 = get_pool_setting(pool2->gpu_memclock, default_profile.gpu_memclock);
+  //gpu-memclock
+  opt1 = get_pool_setting(pool1->gpu_memclock, default_profile.gpu_memclock);
+  opt2 = get_pool_setting(pool2->gpu_memclock, default_profile.gpu_memclock);
 
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-    options |= SWITCHER_APPLY_GPU_MEMCLOCK;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_MEMCLOCK;
 
-  //GPU fans
-  opt1 = get_pool_setting(pool1->gpu_fan, default_profile.gpu_fan);
-  opt2 = get_pool_setting(pool2->gpu_fan, default_profile.gpu_fan);
+  //GPU fans
+  opt1 = get_pool_setting(pool1->gpu_fan, default_profile.gpu_fan);
+  opt2 = get_pool_setting(pool2->gpu_fan, default_profile.gpu_fan);
 
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-    options |= SWITCHER_APPLY_GPU_FAN;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_FAN;
 
-  //GPU powertune
-  opt1 = get_pool_setting(pool1->gpu_powertune, default_profile.gpu_powertune);
-  opt2 = get_pool_setting(pool2->gpu_powertune, default_profile.gpu_powertune);
+  //GPU powertune
+  opt1 = get_pool_setting(pool1->gpu_powertune, default_profile.gpu_powertune);
+  opt2 = get_pool_setting(pool2->gpu_powertune, default_profile.gpu_powertune);
 
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-    options |= SWITCHER_APPLY_GPU_POWERTUNE;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_POWERTUNE;
 
-  //GPU vddc
-  opt1 = get_pool_setting(pool1->gpu_vddc, default_profile.gpu_vddc);
-  opt2 = get_pool_setting(pool2->gpu_vddc, default_profile.gpu_vddc);
+  //GPU vddc
+  opt1 = get_pool_setting(pool1->gpu_vddc, default_profile.gpu_vddc);
+  opt2 = get_pool_setting(pool2->gpu_vddc, default_profile.gpu_vddc);
 
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-    options |= SWITCHER_APPLY_GPU_VDDC;
- #endif
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_VDDC;
+#endif
 
   // Remove soft reset if hard reset is set
   if (opt_isset(options, SWITCHER_HARD_RESET) &&
@@ -6281,6 +6407,8 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
 {
   int i;
 
+  applog(LOG_DEBUG, "get_work_prepare_thread()");
+
   //if switcher is disabled
   if(opt_switchmode == SWITCH_OFF)
     return;
@@ -6608,6 +6736,7 @@ struct work *get_work(struct thr_info *thr, const int thr_id)
     }
   }
 
+  applog(LOG_DEBUG, "preparing thread...");
   get_work_prepare_thread(thr, work);
 
   diff_t = time(NULL) - diff_t;
@@ -6700,7 +6829,16 @@ bool test_nonce(struct work *work, uint32_t nonce)
   uint32_t diff1targ;
 
   rebuild_nonce(work, nonce);
-  diff1targ = work->pool->algorithm.diff1targ;
+
+  applog(LOG_DEBUG, "test_nonce() algorithm = %s", work->pool->algorithm.name);
+
+  // for Neoscrypt, the diff1targ value is in work->target
+  if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    diff1targ = ((uint32_t *)work->target)[7];
+  }
+  else {
+    diff1targ = work->pool->algorithm.diff1targ;
+  }
 
   return (le32toh(*hash_32) <= diff1targ);
 }
diff --git a/winbuild/sgminer.vcxproj b/winbuild/sgminer.vcxproj
index e6b6ab221..0728c4164 100644
--- a/winbuild/sgminer.vcxproj
+++ b/winbuild/sgminer.vcxproj
@@ -263,6 +263,7 @@
+    <ClCompile Include="..\algorithm\neoscrypt.c" />
@@ -321,6 +322,7 @@
+    <ClInclude Include="..\algorithm\neoscrypt.h" />
diff --git a/winbuild/sgminer.vcxproj.filters b/winbuild/sgminer.vcxproj.filters
index 68d737807..0ba37eab8 100644
--- a/winbuild/sgminer.vcxproj.filters
+++ b/winbuild/sgminer.vcxproj.filters
@@ -197,6 +197,9 @@
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\algorithm\neoscrypt.c">
+      <Filter>Source Files\algorithm</Filter>
+    </ClCompile>
@@ -373,6 +376,9 @@
       <Filter>Header Files\sph</Filter>
     </ClInclude>
+    <ClInclude Include="..\algorithm\neoscrypt.h">
+      <Filter>Header Files\algorithm</Filter>
+    </ClInclude>