|
332 | 332 | } |
333 | 333 | ], |
334 | 334 | "source": [ |
335 | | - "\n", |
336 | | - "\n", |
337 | | - "pd.value_counts(df['Class'], sort = True) #class comparison 0=Normal 1=Fraud\n", |
338 | | - "\n" |
| 335 | + "pd.value_counts(df['Class'], sort = True) #class comparison 0=Normal 1=Fraud" |
339 | 336 | ] |
340 | 337 | }, |
341 | 338 | { |
|
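For reference, the pd.value_counts call kept in this hunk still works, but newer pandas releases deprecate the top-level function in favor of the Series.value_counts method. A minimal sketch of the equivalent idiom, with a toy frame standing in for the notebook's df:

    # Series-method form of the class count; the toy df here is a stand-in
    # for the notebook's credit-card DataFrame (0 = Normal, 1 = Fraud).
    import pandas as pd

    df = pd.DataFrame({'Class': [0, 0, 0, 0, 1]})
    counts = df['Class'].value_counts(sort=True)   # same result as pd.value_counts(df['Class'])
    print(counts)
    print('imbalance ratio (normal:fraud):', counts.loc[0] / counts.loc[1])
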
357 | 354 | } |
358 | 355 | ], |
359 | 356 | "source": [ |
360 | | - "\n", |
361 | | - "\n", |
362 | 357 | "#if you don't have an intuitive sense of how imbalanced these two classes are, let's go visual\n", |
363 | 358 | "count_classes = pd.value_counts(df['Class'], sort = True)\n", |
364 | 359 | "count_classes.plot(kind = 'bar', rot=0)\n", |
365 | 360 | "plt.xticks(range(2), LABELS)\n", |
366 | 361 | "plt.title(\"Frequency by observation number\")\n", |
367 | 362 | "plt.xlabel(\"Class\")\n", |
368 | | - "plt.ylabel(\"Number of Observations\");\n", |
369 | | - "\n" |
| 363 | + "plt.ylabel(\"Number of Observations\");" |
370 | 364 | ] |
371 | 365 | }, |
372 | 366 | { |
|
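Because fraud is such a tiny fraction of this data, the fraud bar can be nearly invisible on a linear axis. A hedged variant of the plot in this hunk with a logarithmic y-axis; the counts below are illustrative stand-ins, not values computed from df:

    import matplotlib.pyplot as plt
    import pandas as pd

    count_classes = pd.Series({0: 284315, 1: 492})   # illustrative counts only
    LABELS = ['Normal', 'Fraud']

    ax = count_classes.plot(kind='bar', rot=0, logy=True)  # log scale keeps the small fraud bar visible
    ax.set_xticklabels(LABELS)
    ax.set_title('Frequency by observation number (log scale)')
    ax.set_xlabel('Class')
    ax.set_ylabel('Number of Observations')
    plt.show()
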
466 | 460 | } |
467 | 461 | ], |
468 | 462 | "source": [ |
469 | | - "\n", |
470 | | - "\n", |
471 | 463 | "#plot of high value transactions\n", |
472 | 464 | "bins = np.linspace(200, 2500, 100)\n", |
473 | 465 | "plt.hist(normal_df.Amount, bins, alpha=1, normed=True, label='Normal')\n", |
|
476 | 468 | "plt.title(\"Amount by percentage of transactions (transactions \\$200+)\")\n", |
477 | 469 | "plt.xlabel(\"Transaction amount (USD)\")\n", |
478 | 470 | "plt.ylabel(\"Percentage of transactions (%)\");\n", |
479 | | - "plt.show()\n", |
480 | | - "\n" |
| 471 | + "plt.show()" |
481 | 472 | ] |
482 | 473 | }, |
483 | 474 | { |
|
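One caveat on the histogram cell in this hunk: the normed argument to plt.hist was deprecated and later removed in newer matplotlib releases; density=True is the drop-in replacement. A self-contained sketch with placeholder amounts standing in for normal_df.Amount:

    import numpy as np
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    amounts = rng.uniform(200, 2500, 5000)   # placeholder for normal_df.Amount

    bins = np.linspace(200, 2500, 100)
    plt.hist(amounts, bins, alpha=1, density=True, label='Normal')  # density=True replaces normed=True
    plt.legend(loc='upper right')
    plt.title(r'Amount by percentage of transactions (transactions \$200+)')
    plt.xlabel('Transaction amount (USD)')
    plt.ylabel('Percentage of transactions (%)')
    plt.show()
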
556 | 547 | "metadata": {}, |
557 | 548 | "outputs": [], |
558 | 549 | "source": [ |
559 | | - "\n", |
560 | | - "\n", |
561 | 550 | "#data = df.drop(['Time'], axis=1) #if you think the var is unimportant\n", |
562 | 551 | "df_norm = df\n", |
563 | 552 | "df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))\n", |
564 | | - "df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))\n", |
565 | | - "\n" |
| 553 | + "df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))" |
566 | 554 | ] |
567 | 555 | }, |
568 | 556 | { |
|
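Note that df_norm = df in this cell binds a second name to the same DataFrame rather than copying it, so the scaling also mutates the original df. A sketch of a safer variant using df.copy(), keeping the notebook's reshape idiom; the toy frame stands in for the real 31-column data:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({'Time': [0.0, 10.0, 20.0],
                       'Amount': [1.0, 50.0, 200.0],
                       'Class': [0, 0, 1]})          # stand-in for the real data

    df_norm = df.copy()                              # .copy() leaves the original df untouched
    for col in ('Time', 'Amount'):
        df_norm[col] = StandardScaler().fit_transform(df_norm[col].values.reshape(-1, 1))
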
573 | 561 | "source": [ |
574 | 562 | "train_x, test_x = train_test_split(df_norm, test_size=TEST_PCT, random_state=RANDOM_SEED)\n", |
575 | 563 | "train_x = train_x[train_x.Class == 0] #where normal transactions\n", |
576 | | - "train_x = train_x.drop(['Class'], axis=1) #drop the class column\n", |
577 | | - "\n", |
| 564 | + "train_x = train_x.drop(['Class'], axis=1) #drop the class column (as Autoencoder is unsupervised and does not need / use labels for training)\n", |
578 | 565 | "\n", |
| 566 | + "# test_x (without class) for validation; test_y (with Class) for prediction + calculating MSE / reconstruction error\n", |
579 | 567 | "test_y = test_x['Class'] #save the class column for the test set\n", |
580 | 568 | "test_x = test_x.drop(['Class'], axis=1) #drop the class column\n", |
581 | 569 | "\n", |
|
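The split in this hunk is the core of the semi-supervised setup: the autoencoder is fit only on normal transactions, and the held-out labels are used purely for scoring afterwards. A self-contained sketch under assumed values of TEST_PCT and RANDOM_SEED (the notebook sets its own constants):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    RANDOM_SEED = 42     # assumed value
    TEST_PCT = 0.2       # assumed value

    rng = np.random.default_rng(RANDOM_SEED)
    df_norm = pd.DataFrame(rng.normal(size=(100, 2)), columns=['V1', 'V2'])
    df_norm['Class'] = (rng.random(100) < 0.05).astype(int)   # toy labels, ~5% "fraud"

    train_x, test_x = train_test_split(df_norm, test_size=TEST_PCT, random_state=RANDOM_SEED)
    train_x = train_x[train_x.Class == 0].drop(['Class'], axis=1)  # normal rows only, labels dropped
    test_y = test_x['Class']                                       # labels kept for scoring later
    test_x = test_x.drop(['Class'], axis=1)
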
615 | 603 | "nb_epoch = 5\n", |
616 | 604 | "batch_size = 32\n", |
617 | 605 | "\n", |
| 606 | + "\n", |
| 607 | + "# Autoencoder: 30 => 14 => 7 => 7 => 14 => 30 dimensions\n", |
618 | 608 | "input_dim = train_x.shape[1] #num of columns, 30\n", |
619 | 609 | "encoding_dim = 14\n", |
620 | 610 | "hidden_dim = int(encoding_dim / 2) #i.e. 7\n", |
621 | 611 | "learning_rate = 1e-7\n", |
622 | 612 | "\n", |
| 613 | + "# Dense = fully connected layer \n", |
623 | 614 | "input_layer = Input(shape=(input_dim, ))\n", |
| 615 | + "# First parameter is output units (14 then 7 then 7 then 30) : \n", |
624 | 616 | "encoder = Dense(encoding_dim, activation=\"tanh\", activity_regularizer=regularizers.l1(learning_rate))(input_layer)\n", |
625 | 617 | "encoder = Dense(hidden_dim, activation=\"relu\")(encoder)\n", |
626 | 618 | "decoder = Dense(hidden_dim, activation='tanh')(encoder)\n", |
|
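Assembled as one runnable sketch of the architecture these comments describe. The tensorflow.keras namespace, the final Dense(input_dim) layer with its relu activation, and the compile settings are assumptions inferred from the "14 then 7 then 7 then 30" comment, not shown in this hunk:

    from tensorflow.keras.layers import Input, Dense
    from tensorflow.keras.models import Model
    from tensorflow.keras import regularizers

    input_dim, encoding_dim, hidden_dim = 30, 14, 7
    learning_rate = 1e-7   # used as the L1 activity-regularization strength, as above

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation='tanh',
                    activity_regularizer=regularizers.l1(learning_rate))(input_layer)
    encoder = Dense(hidden_dim, activation='relu')(encoder)
    decoder = Dense(hidden_dim, activation='tanh')(encoder)
    decoder = Dense(input_dim, activation='relu')(decoder)   # assumed final layer back to 30 dims

    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')  # assumed compile settings
    autoencoder.summary()
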
665 | 657 | " write_graph=True,\n", |
666 | 658 | " write_images=True)\n", |
667 | 659 | "\n", |
668 | | - "history = autoencoder.fit(train_x, train_x,\n", |
| 660 | + "history = autoencoder.fit(train_x, train_x, # Autoencoder => Input == Output dimensions!\n", |
669 | 661 | " epochs=nb_epoch,\n", |
670 | 662 | " batch_size=batch_size,\n", |
671 | 663 | " shuffle=True,\n", |
672 | | - " validation_data=(test_x, test_x),\n", |
| 664 | + " validation_data=(test_x, test_x), # Autoencoder => Input == Output dimensions!\n", |
673 | 665 | " verbose=1,\n", |
674 | 666 | " callbacks=[cp, tb]).history" |
675 | 667 | ] |
|