|
332 | 332 | } |
333 | 333 | ], |
334 | 334 | "source": [ |
335 | | - "\n", |
336 | | - "\n", |
337 | | - "pd.value_counts(df['Class'], sort = True) #class comparison 0=Normal 1=Fraud\n", |
338 | | - "\n" |
| 335 | + "pd.value_counts(df['Class'], sort = True) #class comparison 0=Normal 1=Fraud" |
339 | 336 | ] |
340 | 337 | }, |
341 | 338 | { |
|
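For reference, the pd.value_counts call kept in this hunk still works, but newer pandas releases deprecate the top-level function in favor of the Series.value_counts method. A minimal sketch of the equivalent idiom, with a toy frame standing in for the notebook's df:

    # Series-method form of the class count; the toy df here is a stand-in
    # for the notebook's credit-card DataFrame (0 = Normal, 1 = Fraud).
    import pandas as pd

    df = pd.DataFrame({'Class': [0, 0, 0, 0, 1]})
    counts = df['Class'].value_counts(sort=True)   # same result as pd.value_counts(df['Class'])
    print(counts)
    print('imbalance ratio (normal:fraud):', counts.loc[0] / counts.loc[1])
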
357 | 354 | } |
358 | 355 | ], |
359 | 356 | "source": [ |
360 | | - "\n", |
361 | | - "\n", |
362 | 357 | "#if you don't have an intuitive sense of how imbalanced these two classes are, let's go visual\n", |
363 | 358 | "count_classes = pd.value_counts(df['Class'], sort = True)\n", |
364 | 359 | "count_classes.plot(kind = 'bar', rot=0)\n", |
365 | 360 | "plt.xticks(range(2), LABELS)\n", |
366 | 361 | "plt.title(\"Frequency by observation number\")\n", |
367 | 362 | "plt.xlabel(\"Class\")\n", |
368 | | - "plt.ylabel(\"Number of Observations\");\n", |
369 | | - "\n" |
| 363 | + "plt.ylabel(\"Number of Observations\");" |
370 | 364 | ] |
371 | 365 | }, |
372 | 366 | { |
|
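Because fraud is such a tiny fraction of this data, the fraud bar can be nearly invisible on a linear axis. A hedged variant of the plot in this hunk with a logarithmic y-axis; the counts below are illustrative stand-ins, not values computed from df:

    import matplotlib.pyplot as plt
    import pandas as pd

    count_classes = pd.Series({0: 284315, 1: 492})   # illustrative counts only
    LABELS = ['Normal', 'Fraud']

    ax = count_classes.plot(kind='bar', rot=0, logy=True)  # log scale keeps the small fraud bar visible
    ax.set_xticklabels(LABELS)
    ax.set_title('Frequency by observation number (log scale)')
    ax.set_xlabel('Class')
    ax.set_ylabel('Number of Observations')
    plt.show()
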
466 | 460 | } |
467 | 461 | ], |
468 | 462 | "source": [ |
469 | | - "\n", |
470 | | - "\n", |
471 | 463 | "#plot of high value transactions\n", |
472 | 464 | "bins = np.linspace(200, 2500, 100)\n", |
473 | 465 | "plt.hist(normal_df.Amount, bins, alpha=1, normed=True, label='Normal')\n", |
|
476 | 468 | "plt.title(\"Amount by percentage of transactions (transactions \\$200+)\")\n", |
477 | 469 | "plt.xlabel(\"Transaction amount (USD)\")\n", |
478 | 470 | "plt.ylabel(\"Percentage of transactions (%)\");\n", |
479 | | - "plt.show()\n", |
480 | | - "\n" |
| 471 | + "plt.show()" |
481 | 472 | ] |
482 | 473 | }, |
483 | 474 | { |
|
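One caveat on the histogram cell in this hunk: the normed argument to plt.hist was deprecated and later removed in newer matplotlib releases; density=True is the drop-in replacement. A self-contained sketch with placeholder amounts standing in for normal_df.Amount:

    import numpy as np
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    amounts = rng.uniform(200, 2500, 5000)   # placeholder for normal_df.Amount

    bins = np.linspace(200, 2500, 100)
    plt.hist(amounts, bins, alpha=1, density=True, label='Normal')  # density=True replaces normed=True
    plt.legend(loc='upper right')
    plt.title(r'Amount by percentage of transactions (transactions \$200+)')
    plt.xlabel('Transaction amount (USD)')
    plt.ylabel('Percentage of transactions (%)')
    plt.show()
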
556 | 547 | "metadata": {}, |
557 | 548 | "outputs": [], |
558 | 549 | "source": [ |
559 | | - "\n", |
560 | | - "\n", |
561 | 550 | "#data = df.drop(['Time'], axis=1) #if you think the var is unimportant\n", |
562 | 551 | "df_norm = df\n", |
563 | 552 | "df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))\n", |
564 | | - "df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))\n", |
565 | | - "\n" |
| 553 | + "df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))" |
566 | 554 | ] |
567 | 555 | }, |
568 | 556 | { |
|
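Note that df_norm = df in this cell binds a second name to the same DataFrame rather than copying it, so the scaling also mutates the original df. A sketch of a safer variant using df.copy(), keeping the notebook's reshape idiom; the toy frame stands in for the real 31-column data:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({'Time': [0.0, 10.0, 20.0],
                       'Amount': [1.0, 50.0, 200.0],
                       'Class': [0, 0, 1]})          # stand-in for the real data

    df_norm = df.copy()                              # .copy() leaves the original df untouched
    for col in ('Time', 'Amount'):
        df_norm[col] = StandardScaler().fit_transform(df_norm[col].values.reshape(-1, 1))
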
573 | 561 | "source": [ |
574 | 562 | "train_x, test_x = train_test_split(df_norm, test_size=TEST_PCT, random_state=RANDOM_SEED)\n", |
575 | 563 | "train_x = train_x[train_x.Class == 0] #where normal transactions\n", |
576 | | - "train_x = train_x.drop(['Class'], axis=1) #drop the class column\n", |
577 | | - "\n", |
| 564 | + "train_x = train_x.drop(['Class'], axis=1) #drop the class column (as Autoencoder is unsupervised and does not need / use labels for training)\n", |
578 | 565 | "\n", |
| 566 | + "# test_x (without class) for validation; test_y (with Class) for prediction + calculating MSE / reconstruction error\n", |
579 | 567 | "test_y = test_x['Class'] #save the class column for the test set\n", |
580 | 568 | "test_x = test_x.drop(['Class'], axis=1) #drop the class column\n", |
581 | 569 | "\n", |
|
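The split in this hunk is the core of the semi-supervised setup: the autoencoder is fit only on normal transactions, and the held-out labels are used purely for scoring afterwards. A self-contained sketch under assumed values of TEST_PCT and RANDOM_SEED (the notebook sets its own constants):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    RANDOM_SEED = 42     # assumed value
    TEST_PCT = 0.2       # assumed value

    rng = np.random.default_rng(RANDOM_SEED)
    df_norm = pd.DataFrame(rng.normal(size=(100, 2)), columns=['V1', 'V2'])
    df_norm['Class'] = (rng.random(100) < 0.05).astype(int)   # toy labels, ~5% "fraud"

    train_x, test_x = train_test_split(df_norm, test_size=TEST_PCT, random_state=RANDOM_SEED)
    train_x = train_x[train_x.Class == 0].drop(['Class'], axis=1)  # normal rows only, labels dropped
    test_y = test_x['Class']                                       # labels kept for scoring later
    test_x = test_x.drop(['Class'], axis=1)
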
615 | 603 | "nb_epoch = 5\n", |
616 | 604 | "batch_size = 32\n", |
617 | 605 | "\n", |
| 606 | + "\n", |
| 607 | + "# Autoencoder: 30 => 14 => 7 => 7 => 14 => 30 dimensions\n", |
618 | 608 | "input_dim = train_x.shape[1] #num of columns, 30\n", |
619 | 609 | "encoding_dim = 14\n", |
620 | 610 | "hidden_dim = int(encoding_dim / 2) #i.e. 7\n", |
621 | 611 | "learning_rate = 1e-7\n", |
622 | 612 | "\n", |
| 613 | + "# Dense = fully connected layer \n", |
623 | 614 | "input_layer = Input(shape=(input_dim, ))\n", |
| 615 | + "# First parameter is output units (14 then 7 then 7 then 30) : \n", |
624 | 616 | "encoder = Dense(encoding_dim, activation=\"tanh\", activity_regularizer=regularizers.l1(learning_rate))(input_layer)\n", |
625 | 617 | "encoder = Dense(hidden_dim, activation=\"relu\")(encoder)\n", |
626 | 618 | "decoder = Dense(hidden_dim, activation='tanh')(encoder)\n", |
|
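Assembled as one runnable sketch of the architecture these comments describe. The tensorflow.keras namespace, the final Dense(input_dim) layer with its relu activation, and the compile settings are assumptions inferred from the "14 then 7 then 7 then 30" comment, not shown in this hunk:

    from tensorflow.keras.layers import Input, Dense
    from tensorflow.keras.models import Model
    from tensorflow.keras import regularizers

    input_dim, encoding_dim, hidden_dim = 30, 14, 7
    learning_rate = 1e-7   # used as the L1 activity-regularization strength, as above

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation='tanh',
                    activity_regularizer=regularizers.l1(learning_rate))(input_layer)
    encoder = Dense(hidden_dim, activation='relu')(encoder)
    decoder = Dense(hidden_dim, activation='tanh')(encoder)
    decoder = Dense(input_dim, activation='relu')(decoder)   # assumed final layer back to 30 dims

    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')  # assumed compile settings
    autoencoder.summary()
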
665 | 657 | " write_graph=True,\n", |
666 | 658 | " write_images=True)\n", |
667 | 659 | "\n", |
668 | | - "history = autoencoder.fit(train_x, train_x,\n", |
| 660 | + "history = autoencoder.fit(train_x, train_x, # Autoencoder => Input == Output dimensions!\n", |
669 | 661 | " epochs=nb_epoch,\n", |
670 | 662 | " batch_size=batch_size,\n", |
671 | 663 | " shuffle=True,\n", |
672 | | - " validation_data=(test_x, test_x),\n", |
| 664 | + " validation_data=(test_x, test_x), # Autoencoder => Input == Output dimensions!\n", |
673 | 665 | " verbose=1,\n", |
674 | 666 | " callbacks=[cp, tb]).history" |
675 | 667 | ] |
|