In [55]:
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

In [2]:
# Column names passed as `names=` when the features CSV is read: the
# transaction id, the time step, then feature_0 .. feature_164.
columns = ['txId', 'timestep'] + ['feature_' + str(idx) for idx in range(165)]


In [3]:
# Features table: read with the explicit column names in `columns`.
features_data = pd.read_csv(
    "../input/elliptic-data-set/elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    names=columns,
)
# Class labels, one row per transaction id.
classes_data = pd.read_csv(
    "../input/elliptic-data-set/elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
)
# Edge list of the transaction graph (loaded and inspected, not used for modelling below).
edgelist_data = pd.read_csv(
    "../input/elliptic-data-set/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv"
)

In [4]:
# Preview the first rows of the features table.
features_data.head()

In [5]:
# Preview the first rows of the class-label table.
classes_data.head()

In [6]:
# Preview the first rows of the edge list.
edgelist_data.head()

In [7]:
# (rows, columns) of the three raw tables: features, classes, edge list.
tuple(table.shape for table in (features_data, classes_data, edgelist_data))

In [8]:
# Row count per distinct class label in the raw label table.
classes_data['class'].value_counts()

In [9]:
# Row count per distinct timestep value.
features_data['timestep'].value_counts()

In [10]:
# Left-join the class labels onto the feature rows by transaction id, so
# every feature row is kept even when it has no matching label row.
df = features_data.merge(classes_data, on='txId', how='left')

In [11]:
# Preview the merged features + label table.
df.head()

In [12]:
# True if any cell of the merged table is missing.
df.isna().to_numpy().any()

In [13]:
# Keep only the rows whose class label is something other than "unknown".
df = df.loc[df['class'].ne("unknown")]

In [14]:
# Class balance after removing the "unknown" rows.
df['class'].value_counts()

In [15]:
# Pie chart of the class shares (smallest class first), each wedge labelled
# with its whole-number percentage.
df['class'].value_counts().sort_values().plot(kind = 'pie',autopct='%1.0f%%')

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

print(plt.get_backend())

# close any existing plots
plt.close("all")

# Pairwise correlations of every numeric column except the first one (txId).
# Non-numeric columns are excluded explicitly: the 'class' label is still
# text at this point, and pandas >= 2.0 raises on such columns instead of
# silently skipping them.
corr = df.iloc[:, 1:].select_dtypes(include='number').corr()

# Work on a private copy so the masking below never writes into `corr`.
arr_corr = corr.to_numpy(copy=True)
# mask out the upper triangle (diagonal included); NaN cells are left blank
arr_corr[np.triu_indices_from(arr_corr)] = np.nan

fig, ax = plt.subplots(figsize=(24, 18))

# Draw on the axes created above rather than on whatever is "current".
hm = sns.heatmap(arr_corr, cbar=True, vmin=-0.5, vmax=0.5,
                 fmt='.2f', annot_kws={'size': 3}, annot=True,
                 square=True, cmap=plt.cm.Blues, ax=ax)

# One tick per column, centred on its heatmap cell.
ticks = np.arange(corr.shape[0]) + 0.5
ax.set_xticks(ticks)
ax.set_xticklabels(corr.columns, rotation=90, fontsize=8)
ax.set_yticks(ticks)
ax.set_yticklabels(corr.index, rotation=360, fontsize=8)

ax.set_title('correlation matrix')
plt.tight_layout()
plt.savefig("corr_matrix_incl_anno_double.png", dpi=300)

In [17]:
def corrX_orig(df, cut = 0.9) :
    """Name the columns to drop so that no kept pair correlates above `cut`.

    For every pair of numeric columns whose absolute Pearson correlation is
    greater than `cut`, the column with the larger mean absolute correlation
    to all columns is flagged; on a tie the later column is flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Non-numeric columns are ignored.
    cut : float, default 0.9
        Absolute-correlation threshold above which a pair counts as redundant.

    Returns
    -------
    list
        Names of the flagged columns, in their original column order.
    """
    # Restrict to numeric/bool columns up front: pandas >= 2.0 raises on text
    # columns, and older versions skipped them silently, which shifted the
    # positions relative to df.columns.
    corr_mtx = df.select_dtypes(include=['number', 'bool']).corr().abs()
    avg_corr = corr_mtx.mean(axis = 1)
    n_cols = len(corr_mtx)

    drop = set()

    # For every cell (row, col) strictly above the diagonal:
    #   if cell.value > cut:
    #       if mean(row correlations) > mean(col correlations): drop row
    #       else: drop col
    for row in range(n_cols - 1):
        for col in range(row + 1, n_cols):
            if corr_mtx.iloc[row, col] > cut:
                if avg_corr.iloc[row] > avg_corr.iloc[col]:
                    drop.add(row)
                else:
                    drop.add(col)

    # Positions refer to the correlation matrix, so names are read from its
    # own column index (not df.columns); sorting keeps the order stable.
    return [corr_mtx.columns[pos] for pos in sorted(drop)]

In [18]:
# Columns flagged as redundant by corrX_orig at the 0.9 correlation threshold.
drop = corrX_orig(df, cut = 0.9)

In [19]:
# Show the names of the flagged columns.
print(drop)

In [20]:
# Number of flagged columns.
len(drop)

In [21]:
# Keep 'timestep' even when the correlation filter flagged it. Filtering the
# list (instead of list.remove) avoids a ValueError when 'timestep' was not
# flagged in the first place.
drop = [name for name in drop if name != 'timestep']
# Remove the remaining flagged columns from the working table.
df = df.drop(columns=drop)
print(df.columns)

In [22]:
# Re-encode the class labels as integer category codes: cast to str, turn
# into a categorical, then keep only the codes.
df['class'] = df['class'].astype('str').astype('category').cat.codes
# Class balance after encoding.
df['class'].value_counts()

In [23]:
# Target vector: the encoded class codes as a NumPy array.
y = df['class'].to_numpy()

In [24]:
# Feature matrix: every column except the last one (presumably 'class').
# NOTE(review): if the leading txId column survived the drop step it is part
# of X as well — confirm an identifier is meant to be used as a feature.
X = df.iloc[:, :-1].to_numpy()


In [25]:
from sklearn.feature_selection import SelectKBest, chi2, f_regression, f_classif

In [26]:
# Score every feature against y with f_classif and keep the 53 best-scoring
# columns.
# NOTE(review): the selector is fitted on the full X / y before the
# train/test split further down, so held-out rows influence which features
# are kept — confirm this is acceptable.
selector = SelectKBest(score_func=f_classif, k=53)
X_clf_new = selector.fit_transform(X, y)

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report

In [28]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_clf_new, y, test_size=0.3, random_state=42)

In [29]:
# (rows, selected features) of the training split.
X_train.shape

In [30]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply that
# same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Naive Bayes

In [31]:
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian Naive Bayes on the training split and predict the test split.
gnb = GaussianNB().fit(X_train, y_train)
NB_y_pred = gnb.predict(X_test)

In [32]:
# Confusion matrix of the Naive Bayes predictions on the test split.
cm = confusion_matrix(y_test, NB_y_pred)
print(cm)
# Overall accuracy, shown as the cell output.
accuracy_score(y_test, NB_y_pred)

In [33]:
# Per-class precision / recall / F1 for Naive Bayes.
print(classification_report(y_test, NB_y_pred))

In [34]:
from matplotlib import pyplot as plt

# Large square canvas for the Naive Bayes confusion matrix.
fig, ax = plt.subplots(figsize=(20, 20))

# Draw the matrix for gnb on the test split onto `ax`.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 —
# confirm the pinned version still provides it.
plot_confusion_matrix(gnb, X_test, y_test,cmap=plt.cm.Blues, ax=ax)

In [72]:
# ROC analysis for Naive Bayes.
# The curve is built from the predicted probability of the second class
# (column 1 of predict_proba) rather than from hard 0/1 predictions: hard
# labels give roc_curve a single threshold, which collapses the curve to one
# corner point. Assumes y_test is coded 0/1 with 1 as the positive class.
NB_y_score = gnb.predict_proba(X_test)[:, 1]
auc1 = metrics.roc_auc_score(y_test, NB_y_score)

false_positive_rate1, true_positive_rate1, thresolds1 = metrics.roc_curve(y_test, NB_y_score)

plt.figure(figsize=(10, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
# Plot this model's own rates (the *1 variables computed above).
plt.plot(false_positive_rate1, true_positive_rate1, 'g')
plt.fill_between(false_positive_rate1, true_positive_rate1, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc1, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

## RandomForest Classifier

In [57]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create the random forest classifier. The fixed random_state (same seed as
# the train/test split) makes the fitted forest, and every metric derived
# from it, repeatable across runs.
clf=RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split
clf.fit(X_train,y_train)

# Predict labels for the held-out test split
y_pred_RF=clf.predict(X_test)

In [36]:
# Confusion matrix of the random-forest predictions on the test split.
# y_pred_RF is the name the random-forest cell assigns its predictions to.
cm = confusion_matrix(y_test, y_pred_RF)
print(cm)
# Overall accuracy, shown as the cell output.
accuracy_score(y_test, y_pred_RF)

In [37]:
# Per-class precision / recall / F1 for the random forest (y_pred_RF holds
# its test-split predictions).
print(classification_report(y_test, y_pred_RF))

In [38]:
# Large square canvas for the random-forest confusion matrix.
fig, ax = plt.subplots(figsize=(20, 20))

# Draw the matrix for clf on the test split onto `ax`.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 —
# confirm the pinned version still provides it.
plot_confusion_matrix(clf, X_test, y_test,cmap=plt.cm.Blues, ax=ax)

In [73]:
# ROC analysis for the random forest.
# Scores are the predicted probability of the second class (column 1 of
# predict_proba); hard 0/1 predictions would reduce the curve to a single
# corner point. Assumes y_test is coded 0/1 with 1 as the positive class.
RF_y_score = clf.predict_proba(X_test)[:, 1]
auc2 = metrics.roc_auc_score(y_test, RF_y_score)

false_positive_rate2, true_positive_rate2, thresolds2 = metrics.roc_curve(y_test, RF_y_score)

plt.figure(figsize=(10, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
# Plot this model's own rates (the *2 variables computed above).
plt.plot(false_positive_rate2, true_positive_rate2, 'g')
plt.fill_between(false_positive_rate2, true_positive_rate2, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc2, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

## K Neighbors Classifier

In [39]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 3-nearest-neighbours classifier on the training split and predict
# the test split.
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred_knn = neigh.predict(X_test)

In [40]:
# Confusion matrix of the KNN predictions on the test split.
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)
# Overall accuracy, shown as the cell output.
accuracy_score(y_test, y_pred_knn)

In [41]:
# Per-class precision / recall / F1 for KNN.
print(classification_report(y_test, y_pred_knn))

In [42]:
# Large square canvas for the KNN confusion matrix.
fig, ax = plt.subplots(figsize=(20, 20))

# Draw the matrix for neigh on the test split onto `ax`.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 —
# confirm the pinned version still provides it.
plot_confusion_matrix(neigh, X_test, y_test,cmap=plt.cm.Blues, ax=ax)

In [74]:
# ROC analysis for KNN.
# Scores are the predicted probability of the second class (column 1 of
# predict_proba); hard 0/1 predictions would reduce the curve to a single
# corner point. Assumes y_test is coded 0/1 with 1 as the positive class.
KNN_y_score = neigh.predict_proba(X_test)[:, 1]
auc3 = metrics.roc_auc_score(y_test, KNN_y_score)

false_positive_rate3, true_positive_rate3, thresolds3 = metrics.roc_curve(y_test, KNN_y_score)

plt.figure(figsize=(10, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
# Plot this model's own rates (the *3 variables computed above).
plt.plot(false_positive_rate3, true_positive_rate3, 'g')
plt.fill_between(false_positive_rate3, true_positive_rate3, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc3, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# Comparison of RF, KNN and NB using ROC curves

In [75]:
# Overlay the three ROC curves on one set of axes, together with the
# chance diagonal and the ideal top-left corner path for reference.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(false_positive_rate1, true_positive_rate1, label='ROC Curve - NB (AUC = %0.2f)' % (auc1))
roc_ax.plot(false_positive_rate2, true_positive_rate2, label='ROC Curve - RF (AUC = %0.2f)' % (auc2))
roc_ax.plot(false_positive_rate3, true_positive_rate3, label='ROC Curve - KNN (AUC = %0.2f)' % (auc3))
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random Classifier')
roc_ax.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='green', label='Perfect Classifier')
# Small margin around the unit square so curves on the border stay visible.
roc_ax.set_xlim([-0.05, 1.05])
roc_ax.set_ylim([-0.05, 1.05])
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.legend(loc="lower right")
plt.show()

## Deep Learning (dense neural network; labelled "CNN" in the results below)

In [43]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [44]:
# Binary classifier: two ReLU hidden layers (100 and 40 units) feeding a
# single sigmoid output unit. Only Dense layers are used.
# NOTE(review): input_dim on the second Dense layer looks redundant —
# confirm and drop.
model = Sequential([
    Dense(100, input_dim=X_train.shape[1], activation='relu'),
    Dense(40, input_dim=X_train.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [45]:
# Early-stopping callback watching the validation loss: an improvement must
# be at least 1e-3 to count, and training stops after 5 epochs without one.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto')


In [46]:
# Train for at most 10 epochs with the early-stopping callback attached.
# NOTE(review): the test split doubles as the validation data monitored by
# early stopping, so later scores on X_test are not fully held out — confirm.
history = model.fit(X_train,y_train,validation_data=(X_test,y_test),
                    callbacks=[monitor],verbose=True,epochs=10)

In [47]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    `history` must expose a `.history` mapping with the keys 'accuracy',
    'val_accuracy', 'loss' and 'val_loss', each a per-epoch sequence.
    Nothing is returned; the figure is drawn through pyplot.
    """
    logs = history.history
    # (training series, validation series, training label, validation label, title)
    panels = (
        (logs['accuracy'], logs['val_accuracy'],
         'Training acc', 'Validation acc', 'Training and validation accuracy'),
        (logs['loss'], logs['val_loss'],
         'Training loss', 'Validation loss', 'Training and validation loss'),
    )
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(panels[0][0]) + 1)

    plt.figure(figsize=(12, 5))
    for slot, (train_vals, val_vals, train_label, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(epochs, train_vals, 'b', label=train_label)
        plt.plot(epochs, val_vals, 'r', label=val_label)
        plt.title(title)
        plt.legend()

In [48]:
# Draw the accuracy / loss curves recorded during training.
plot_history(history)


In [49]:
# Running eval
import numpy as np
# Round the network's sigmoid outputs to 0.0 / 1.0 to obtain class predictions.
pred_st = np.round(model.predict(X_test))

# True labels as 64-bit integers.
y_eval_st = y_test.astype(np.int64)

score_st = metrics.accuracy_score(y_eval_st, pred_st)
print("accuracy: {}".format(score_st))

# Classification Report
print(classification_report(y_eval_st, pred_st))


In [50]:
# Confusion matrix of the neural network's rounded predictions on the test split.
cm = confusion_matrix(y_eval_st, pred_st)
print(cm)
# Overall accuracy, shown as the cell output.
accuracy_score(y_eval_st, pred_st)

In [51]:
import matplotlib.pyplot as plt
import numpy as np

# Confusion matrix of the neural network's rounded predictions, drawn as an image.
cm = metrics.confusion_matrix(y_eval_st, pred_st)
fig, ax = plt.subplots(figsize=(20, 20))
plt.imshow(cm, cmap=plt.cm.Blues)
plt.xlabel("Predicted labels")
plt.ylabel("True labels")
# Label both axes with the two class codes (the y axis needs yticks, not a
# second xticks call).
plt.xticks([0,1],[0,1])
plt.yticks([0,1],[0,1])
plt.title('Confusion matrix ')
plt.colorbar()
plt.show()

| Classifier | Accuracy | Precision | Recall | F1 score |
| --- | --- | --- | --- | --- |
| Naive Bayes | 0.63 |  0.92 | 0.63 | 0.70 |
| RandomForestClassifier| 0.98 |  0.98 | 0.98 | 0.98 |
| K Neighbors Classifier| 0.96 |  0.96 | 0.96 | 0.96 |
| Deep Learning CNN| 0.98 |  0.97 | 0.98 | 0.97 |


In [52]:
import pandas as pd

# Headline metrics in percent, one entry per classifier. The values are
# hard-coded and mirror the results table shown above; they are not
# recomputed from the fitted models.
data = {'Classifier': ['NB', 'RF', 'KNN', 'CNN'],
        'Accuracy': [63, 98, 96, 98],
        'Precision': [92, 98, 96, 97],
        'Recall': [63, 98, 96, 98],
        'F1 score': [70, 98, 96, 97],
        }

# Stored under its own name so the transaction DataFrame `df` used by the
# earlier cells is not overwritten (re-running those cells keeps working).
summary_df = pd.DataFrame(data)

# Last expression of the cell: rendered as a rich table.
summary_df

In [53]:
# libraries
import numpy as np
import matplotlib.pyplot as plt

# set width of bars
barWidth = 0.20

# set heights of bars (percent); each list is ordered NB, RF, KNN, CNN
bars1 = [63, 98, 96, 98]   # Accuracy
bars2 = [92, 98, 96, 97]   # Precision
bars3 = [63, 98, 96, 98]   # Recall
bars4 = [70, 98, 96, 97]   # F1 score

# Set position of bar on X axis: one group per classifier, each metric
# shifted right by one bar width from the previous one
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]

# Size the figure before drawing; changing rcParams after the bars are drawn
# does not affect the figure that is already on screen.
plt.figure(figsize=(5, 3))

# Make the plot
plt.bar(r1, bars1, color='#7f6d5f', width=barWidth, edgecolor='white', label='Accuracy')
plt.bar(r2, bars2, color='#557f2d', width=barWidth, edgecolor='white', label='Precision')
plt.bar(r3, bars3, color='#3a2f4e', width=barWidth, edgecolor='white', label='Recall')
plt.bar(r4, bars4, color='#6d3f5e', width=barWidth, edgecolor='white', label='F1 score')

# Add xticks on the middle of the group bars: the four bar centres sit at
# r, r+w, r+2w and r+3w, so the group centre is r + 1.5*w
plt.xlabel('CLASSIFIERS', fontweight='bold')
plt.ylabel('Score (%)')
plt.xticks([r + 1.5 * barWidth for r in range(len(bars1))], ['NB', 'RF', 'KNN', 'CNN'])

# Create legend & Show graphic
plt.legend()
plt.show()