In [207]:
# Imports
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

In [208]:
data = pd.read_csv('creditcard.csv')

In [209]:
# link: https://www.data-blogger.com/2017/06/15/fraud-detection-a-simple-machine-learning-approach/
# Make some make some observations about the dataset. Select one to run.
# Observations: 28 fields. Encoded for privacy reason except Amount, Time, and Class.
#               highly unbalanced
# data.head(100)
# data.shape
# data.info()
# print(data.describe()) # statistical summary of all the columns
# print(data['Class'].value_counts())             

In [210]:
# Select features: V1 ~ V28 and Amount
# Select target: Class
features = ['Amount'] + ['V%d' % index for index in range(1, 29)]
target = 'Class'
X = data[features]
y = data[target]

In [211]:
# Normalize values for each feature, because there are many features with wide range of values
# We need to get them in the equivalent range. Make the distribution of each values on the same scale.
# Normalization procedure: (value - mean)/std
# Normalization has to be done after split individually on both train and test sets.
def nomalization(X):
    for feature in X.columns:
        X[feature] -= X[feature].mean()
        X[feature] /= X[feature].std()
    return X

In [212]:
# 1. Define a model
# 2. Split the data using StratisfiedShuffleSplit to ensure that both train and test have roughly the same distribution
#    of Class variables, because the data is highly unbalanced. 
# 3. Random_state = none ensures the results is replicable.
# Tips: X = np.array([2,1,4,2,6,7,3]) 
#       indices = [0,5]
#       print(X[indices]) it will not work on pandas dataframe, have to use iloc.
# n_splits: how many times we want to split the data.
model = LogisticRegression()
splitter = StratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=None)
for train_indices, test_indices in splitter.split(X, y): 
    
    # train/test split.
    X_train = X.iloc[train_indices] 
    y_train = y.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]
    
    # normalize data.
    X_train = nomalization(X_train)
    X_test = nomalization(X_test)
    
    # fit and predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # test and show the results
    print(classification_report(y_test, y_pred))
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


             precision    recall  f1-score   support

          0       1.00      1.00      1.00    142158
          1       0.87      0.65      0.74       246

avg / total       1.00      1.00      1.00    142404

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    142158
          1       0.88      0.59      0.71       246

avg / total       1.00      1.00      1.00    142404

