Table of Contents
<div id="toc"></div>

In [None]:
%%javascript
// Load and execute an external script from kmahelona.github.io; presumably it fills
// the <div id="toc"> element of the first cell with a table of contents — TODO confirm.
// NOTE(review): this runs remote third-party JavaScript every time the cell executes;
// confirm the source is trusted or vendor a local copy.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

# Analysis
## Data Exploration

In [None]:
# Math stuff
import numpy as np
# For handling dataset
import pandas as pd
# For plotting
import matplotlib.pyplot as plt
# For plotting (sns is used by the count/dist plots below, so this import must be active)
import seaborn as sns
sns.set_style("white")
# For list the feature importance
from sklearn.ensemble import RandomForestClassifier
# For feature transformation
from sklearn.preprocessing import FunctionTransformer
# helping to remove outliers
from scipy.stats import iqr
# For undersampling
from imblearn.under_sampling import RandomUnderSampler
# metrics
from sklearn.metrics import precision_recall_curve

Let's load the data in order to do some analysis

In [None]:
# Read the transactions CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell works from.
df = pd.read_csv('creditcard.csv')

In [None]:
# Preview the first rows to check the column layout.
df.head()

In [None]:
# Show the dtype of every column.
df.dtypes

As we can see, the dataset contains 31 columns, each of which holds numerical data. <br />
The feature Class is categorical: it labels whether a transaction is legit or fraudulent. <br />
One notable characteristic is the high imbalance between legit and fraudulent transactions.

So, let's plot and see it.

In [None]:
# Count plot of the Class column, relabelled as Legit/Fraud, saved to disk and shown.
fig = plt.figure()

class_tick_positions = [0, 1]
class_tick_labels = ['Legit', 'Fraud']

sns.countplot(df.Class, palette="Set2")
plt.xticks(class_tick_positions, class_tick_labels)
plt.title('Ratio between legit x fraudulent transactions')

# NOTE: the imgs/ directory must already exist; savefig does not create it.
fig.savefig('imgs/fig2 ratio-legitxfraud.png')

plt.show()

Now, let's summarize the whole data...

In [None]:
# Summary statistics with one row per column, listed in reverse column order.
df.describe().T.iloc[::-1]

It is hard to see much in this summary, but here are some things to notice:
* The means of the features V1 to V28 are close to zero.
* The difference between the max value and the 75% quantile of the feature Amount is huge.


Let's plot the distributions of the features Vs.

In [None]:
# Keep every column except the first one and the last two; per the notes above these
# are expected to be the V* features — TODO confirm against df.head().
feats = df.columns[1:-2]
print(feats)

In [None]:
# One distribution plot per entry of feats, laid out on a 7 x 4 grid.
fig = plt.figure(figsize=(15, 12))

# subplot positions are 1-based, so start the counter at 1
for position, feat in enumerate(feats, 1):
    plt.subplot(7, 4, position)
    sns.distplot(df[feat])

plt.tight_layout()
plt.show()

# NOTE: the imgs/ directory must already exist; savefig does not create it.
fig.savefig('imgs/Vs distplot.png')

The plot above makes it clear that the means are around zero, which agrees with the summary. <br />
Another thing to notice is that the extent of the x-axes shows that all of them have outliers. <br />

In [None]:
# Distribution of Amount using 1000 bins, with the x-axis clipped to [0, 1000].
fig = plt.figure()

ax = sns.distplot(df.Amount, bins=1000)
ax.set_title('Distribution of the feature "Amount"')
ax.set_xlim(0, 1000)
plt.show()

# NOTE: the imgs/ directory must already exist; savefig does not create it.
fig.savefig('imgs/fig3.png')

The feature amount is positively-skewed. So, it will need a transformation.

In [None]:
# Distribution of the Time column.
fig = plt.figure()

ax = sns.distplot(df.Time)
ax.set_title('Distribution of the feature "Time"')
plt.show()

# NOTE: the imgs/ directory must already exist; savefig does not create it.
fig.savefig('imgs/fig4.png')

This feature has a roughly bimodal distribution. Since the dataset contains transactions from two days, this could be a reflection of that. <br /><br />
Now that we have an overview of the dataset, it might be a good idea to choose the features that best explain the difference between legit and fraud.

I'll be doing this through Random Forest and list the features by importance. <br />
Since Decision Trees aren't sensitive to outliers, it should be fine to just run it.

In [None]:
# Since I won't be working with recurrent model, I'll be ignoring this feature.
# drop(..., errors='ignore') is used instead of `del` so that re-running this cell
# after 'Time' is already gone is a no-op rather than a KeyError.
df = df.drop('Time', axis=1, errors='ignore')

In [None]:
# Work on an independent copy so df itself is left untouched.
X = df.copy()

In [None]:
# Number of rows for each integer value found in the Class column.
np.bincount(X['Class'])

In [None]:
# Split off the last column as the labels and keep the remaining columns as predictors.
# NOTE: this consumes X, so the cell must run exactly once after X is (re)created.
y = X.iloc[:, -1]
X = X.iloc[:, :-1]

In [None]:
# Target sample counts per class for the undersampler: class 0 gets 20x the count of class 1.
# NOTE(review): 492 is presumably the number of Class == 1 rows shown by the bincount above — confirm.
undersample_counts = {0: 492*20, 1: 492}
rus = RandomUnderSampler(ratio=undersample_counts, random_state=0)

In [None]:
# Draw the undersampled training set from the full predictors/labels.
X_resampled, y_resampled = rus.fit_sample(X, y)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Two search grids that share every setting except the bootstrap-related ones:
# oob_score is only varied in the grid where bootstrap is enabled.
parameter_candidates = []
for bootstrap_options in (
        {'bootstrap': [True], 'oob_score': [True, False]},
        {'bootstrap': [False]}):
    # A fresh literal per grid so the two dicts never share list objects.
    grid = {
        'n_estimators': [5, 10, 15, 20],
        'criterion': ['gini', 'entropy'],
        'random_state': [0],
        'class_weight': ['balanced', None],
        'min_samples_split': [2, 4, 6, 8],
        'min_samples_leaf': [1, 2, 4, 6, 8],
    }
    grid.update(bootstrap_options)
    parameter_candidates.append(grid)

In [None]:
# Grid search over parameter_candidates, wrapping a default random forest;
# n_jobs=-1 asks for all available cores.
base_forest = RandomForestClassifier()
clf = GridSearchCV(
    estimator=base_forest,
    param_grid=parameter_candidates,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the undersampled data.
clf.fit(X_resampled, y_resampled)

In [None]:
# Display the estimator selected by the grid search.
clf.best_estimator_

In [None]:
# Fit a random forest with hand-picked settings on the undersampled data.
# NOTE: this rebinds clf, so the GridSearchCV object created earlier is no longer reachable by that name.
forest_settings = dict(
    random_state=0,
    class_weight='balanced',
    criterion='entropy',
    n_estimators=50,
)
clf = RandomForestClassifier(**forest_settings)
clf.fit(X_resampled, y_resampled)

In [None]:
# before we list the feature importance, we'll see if the classifier is working
# NOTE: X contains the rows the forest was trained on (X_resampled was drawn from it),
# so these numbers are not a held-out estimate.
y_pred = clf.predict(X)
# precision_recall_curve takes the ground truth first and the predicted scores second;
# passing them the other way round exchanges the reported precision and recall.
precision, recall, _ = precision_recall_curve(y, y_pred)
# With hard 0/1 predictions used as scores, index 1 is expected to be the operating
# point at threshold 1 — TODO confirm for the installed scikit-learn version.
print('precision %.2f' % precision[1])
print('recall %.2f' % recall[1])

In [None]:
# both precision and recall are next to 1.
# this is excellent, now we'll list the five most important features
#
# clf is the RandomForestClassifier fitted above (not the grid search), so its
# importances are read directly from clf.feature_importances_.
# The names come from X.columns, the predictors the training sample was drawn from.
# df.columns[1:-1] would skip the first predictor once 'Time' has been deleted and
# label every importance with the name of the *next* feature.
feature_names = X.columns
assert len(feature_names) == len(clf.feature_importances_), \
    "feature names and importances are misaligned"
importance = {name: float(score)
              for name, score in zip(feature_names, clf.feature_importances_)}

# Highest importance first; ties are broken by feature name.
ranked = sorted(importance.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
# Each row is [feature name, importance]; numpy stores both as strings.
feat_imp = np.array([np.array(pair) for pair in ranked])
print(feat_imp[:5])

# NOTE(review): later cells hard-code 'V11', 'V13', 'V15'; re-check those names
# against this ranking.

# Share of total importance carried by the top three features.
sum([float(feat[1]) for feat in feat_imp[:3]])

Now that we removed the outliers, we can run the Random Forest

In [None]:
# separating the predictors and the labels
# Derived from df rather than from X: X has already had its label column split off,
# so slicing X again would drop a real predictor and use it as y.
# Reading from df also makes this cell safe to re-run.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Names of the three top-ranked features (first column of feat_imp).
xfeat, yfeat, zfeat = (feat_imp[rank][0] for rank in range(3))

# One panel per pair of features; rows with y == 0 use the default colour,
# rows with y == 1 are drawn in red on the same axes.
panels = [
    (131, xfeat, yfeat),
    (132, xfeat, zfeat),
    (133, yfeat, zfeat),
]
for position, horiz, vert in panels:
    ax = plt.subplot(position)
    X[y==0].plot.scatter(x=horiz, y=vert, ax=ax, alpha=.5)
    X[y==1].plot.scatter(x=horiz, y=vert, ax=ax, alpha=.5, c='r')

plt.tight_layout()
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Switch matplotlib to the nbagg backend for the figures that follow
# (used here for the 3D scatter, presumably so it can be rotated — TODO confirm).
%matplotlib nbagg

In [None]:
# Split the frame by label value: Class == 0 goes to legit, Class == 1 to fraud.
legit = df[df['Class'] == 0]
fraud = df[df['Class'] == 1]

In [None]:
# 1000 row positions in [0, len(legit)), drawn with replacement (np.random.choice
# default) from the global, unseeded NumPy RNG — the sample differs on every run.
idx = np.random.choice(len(legit), size=1000)

In [None]:
# idx holds row *positions*, so the rows must be taken with .iloc.
# .loc would read them as index labels, and the legit index has gaps where the
# fraud rows were removed, so it would pick wrong or missing rows.
# Re-deriving the legit rows from df keeps this cell safe to re-run.
legit = df[df.Class == 0].iloc[idx]

In [None]:
# 3D scatter of the three top-ranked features: fraud rows in red, sampled legit rows in blue.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# fraud is drawn first, then legit.
for frame, colour, tag in ((fraud, 'r', 'fraud'), (legit, 'b', 'legit')):
    ax.scatter(frame[xfeat], frame[yfeat], frame[zfeat], c=colour, s=5, label=tag)

ax.set_xlabel(xfeat)
ax.set_ylabel(yfeat)
ax.set_zlabel(zfeat)

axis_names = (xfeat, yfeat, zfeat)
plt.title('Scatterplot between %s x %s x %s' % axis_names)
plt.legend()

plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fresh independent copy of df for the reduced-feature experiment.
X_new = df.copy()

In [None]:
# Keep three hard-coded features plus the label column.
# NOTE(review): presumably these are meant to be the top-3 features from the
# importance ranking — confirm they match xfeat / yfeat / zfeat.
selected_columns = ['V11', 'V13', 'V15', 'Class']
X_new = X_new.loc[:, selected_columns]

In [None]:
# Last column becomes the labels, the rest the predictors.
# NOTE: this consumes X_new, so the cell must run exactly once after X_new is rebuilt.
y_new = X_new.iloc[:, -1]
X_new = X_new.iloc[:, :-1]

In [None]:
# Undersampler keeping 10x as many class-0 rows as class-1 rows.
# No random_state is given, so every fit_sample call draws a different subset
# and the results are not reproducible between runs.
undersample_counts = {0: 492*10, 1: 492}
rus = RandomUnderSampler(ratio=undersample_counts)

In [None]:
# Incrementally train a Gaussian naive Bayes model on 50 independently drawn
# undersampled subsets of (X_new, y_new).
clf = GaussianNB()
n_rounds = 50
class_labels = [0,1]
for i in range(n_rounds):
    X_resampled, y_resampled = rus.fit_sample(X_new, y_new)
    # classes is passed on every call; partial_fit needs it on the first one
    clf.partial_fit(X_resampled, y_resampled, classes=class_labels)

In [None]:
# Predict a label for every row of X_new and peek at the first five predictions.
y_pred = clf.predict(X_new)
print(y_pred[:5])

In [None]:
# precision_recall_curve takes the ground truth first and the predicted scores second;
# passing them the other way round exchanges the reported precision and recall.
precision, recall, _ = precision_recall_curve(y_new, y_pred)
# With hard 0/1 predictions used as scores, index 1 is expected to be the operating
# point at threshold 1 — TODO confirm for the installed scikit-learn version.
print('precision %.2f' % precision[1])
print('recall %.2f' % recall[1])

In [None]:
# Side-by-side boxplots of the three selected features, grouped by Class, on the full data.
for slot, column in zip((131, 132, 133), ('V11', 'V13', 'V15')):
    ax = plt.subplot(slot)
    df.boxplot(column, 'Class', ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Boolean mask of the rows labelled 1, then split X_new into the two groups.
isFraud = np.asarray(y_new == 1, dtype=bool)
fraud = X_new[isFraud]
legit = X_new[~isFraud]

In [None]:
# removing outliers from legit sample
# For every column, values more than 1.5 * IQR below the 25th percentile or above the
# 75th percentile are replaced by NaN (the affected rows are dropped in a later cell).
#
# Work on an explicit copy: legit was produced by boolean indexing, and writing into
# such a frame triggers pandas' SettingWithCopy warning.
legit = legit.copy()
# The loop does not bind `idx`, so the sample index of the same name defined
# earlier in the notebook is left intact.
for feat in legit.columns:
    q25, q75 = np.percentile(legit[feat], [25, 75])
    whisker = iqr(legit[feat]) * 1.5

    # Both masks are computed from the column before any value is overwritten.
    too_low = legit[feat] < q25 - whisker
    too_high = legit[feat] > q75 + whisker
    legit.loc[too_low | too_high, feat] = np.nan

In [None]:
# Row count before dropping.
print(len(legit))

# Keep only the legit rows in which no feature was marked NaN, then report the new count.
X_new = legit.dropna(how='any')
print(len(X_new))

In [None]:
# Stack the fraud rows underneath the filtered legit rows.
# NOTE: re-running this cell appends the fraud rows a second time.
merged_rows = [X_new, fraud]
X_new = pd.concat(merged_rows, axis=0)
print(len(X_new))
X_new.head()

In [None]:
# Attach the labels as a 'Class' column, aligned on X_new's own index.
# y_new still has one entry per original row; pd.concat(axis=1) outer-joins the
# indexes and would bring every dropped outlier row back as an all-NaN feature row.
# assign() keeps exactly the rows of X_new and is safe to re-run.
X_new = X_new.assign(Class=y_new)

In [None]:
# Same three boxplots as before, now on the outlier-filtered data in X_new.
for slot, column in zip((131, 132, 133), ('V11', 'V13', 'V15')):
    ax = plt.subplot(slot)
    X_new.boxplot(column, 'Class', ax=ax)

plt.tight_layout()
plt.show()