### Phase 1: Load Dataset

In [10]:
import pandas.io 
def readCsvToDataFrame(path): 
    theDataFrame= pandas.read_csv(path) 
    return theDataFrame

In [41]:
PIPELINEPATH= "ser_pipeline2.pickle"
DATASETPATH = '../creditcard.csv'
dataset = readCsvToDataFrame(DATASETPATH)

In [14]:
def show_df_info(dataframe):
    # get the data type
    print(type(dataframe))
    print("amount of entries is %s" % dataframe.size)
    print("dimensions= %i" % dataframe.ndim)
    print("shape is ", end="")
    print(dataframe.shape)
    print("axes: ", end="")
    print(dataframe.axes)
    print("data types of columns:")
    print(dataframe.dtypes)
    print("features: %s" % dataframe.columns)

In [15]:
show_df_info(dataset)
print(dataset.head(5))

<class 'pandas.core.frame.DataFrame'>
amount of entries is 8829017
dimensions= 2
shape is (284807, 31)
axes: [RangeIndex(start=0, stop=284807, step=1), Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')]
data types of columns:
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
d

In [16]:
dataset.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

### Phase 2: Slice the Dataset

In [25]:
def sliceDataFrame(df):
    # iloc arguments: range of rows, range of columns
    # class 'pandas.core.series.Series'
    return df.iloc[:, :-1], df.iloc[:, -1]

In [26]:
training_instances, class_labels = sliceDataFrame(dataset)
show_df_info(training_instances)

# preview the data
print(training_instances.head(5))
print()
print(class_labels.head(5))

<class 'pandas.core.frame.DataFrame'>
amount of entries is 8544210
dimensions= 2
shape is (284807, 30)
axes: [RangeIndex(start=0, stop=284807, step=1), Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')]
data types of columns:
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
dtype: object
features: Index(['Tim

### Phase 3: Create the Pipeline

In [34]:
from sklearn import tree, pipeline, preprocessing
from sklearn.feature_selection import SelectKBest

In [35]:
cart_model = tree.DecisionTreeClassifier()

pipe = pipeline.Pipeline(steps=[
    ("scale", preprocessing.StandardScaler()),  
    ("CART", cart_model)])

In [36]:
steps=[
    ("scale", preprocessing.StandardScaler()),  
    ("CART", cart_model)]

In [37]:
preprocessing.StandardScaler()

StandardScaler(copy=True, with_mean=True, with_std=True)

### Phase 4: Train

In [38]:
pipe.fit(training_instances, y=class_labels)

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('CART', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

### Phase 5: Evaluation

In [39]:
# return value is array of scores
from sklearn.model_selection import cross_val_score

scores = cross_val_score(pipe, training_instances, class_labels, cv=5)

# use as quality metric the average CV score
meanCvAccuracy = scores.mean()

print("Mean CV accuracy= %f" % meanCvAccuracy)

Mean CV accuracy= 0.800172


### Phase 6: Save the Pipeline

In [42]:
from sklearn.externals import joblib
joblib.dump(pipe, PIPELINEPATH, compress = 1)

['ser_pipeline2.pickle']

### Phase 7: Load the Pipeline

In [43]:
pipeline_loaded = joblib.load(PIPELINEPATH)

### Phase 8: Classify New Instances

In [46]:
dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [49]:
# create new  problem instance
vector = [2.0, -0.876536, -0.182937, 1.817382, -0.889898, -0.012989, 1.234879, 0.234543, 0.372123, -1.398765,  
          -0.876536, -0.182937, 1.817382, -0.889898, -0.012989, 1.234879, 0.234543, 0.372123, -1.398765, 0.247676,
          0.791461, -0.109876, 0.008765, -0.198765, -1.176543, 0.765432, -0.228765, 0.067854, 0.067543, 126.80]

result = pipeline_loaded.predict([vector,])
print("class label is %i" % result)

class label is 0


### The Whole Program

In [54]:
import numpy
from sklearn import datasets, utils
from sklearn.model_selection import cross_val_score
import pandas.io
from sklearn import tree, pipeline, preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.externals import joblib


#
# DECLARATION PART
#
PIPELINEPATH= 'ser_pipeline.pickle'
DATASETPATH= '../creditcard.csv'



# if there is no header row with the column/attribute names, use the constant None for row_with_column_names
def readCsvToDataFrame(path):
    theDataFrame= pandas.read_csv(path)
    return theDataFrame



def show_df_info(dataframe):
    # get the data type
    print(type(dataframe))
    print("amount of entries is %s" % dataframe.size)
    print("dimensions= %i" % dataframe.ndim)
    print("shape is ", end="")
    print(dataframe.shape)
    print("axes: ", end="")
    print(dataframe.axes)
    print("data types of columns:")
    print(dataframe.dtypes)
    print("features: %s" % dataframe.columns)



def sliceDataFrame(df):
    # iloc arguments: range of rows, range of columns
    # class 'pandas.core.series.Series'
    return df.iloc[:, :-1], df.iloc[:, -1]

#
# PROGRAM BODY
#

## PHASE 1: LOAD DATASET
dataset= readCsvToDataFrame(DATASETPATH)
show_df_info(dataset)
print(dataset.head(5))

## PHASE 2: SLICE DATASET
training_instances, class_labels= sliceDataFrame(dataset)
show_df_info(training_instances)
# preview the data
print(training_instances.head(5))
print()
print(class_labels.head(5))

## PHASE 3: CREATE PIPELINE
cart_model= tree.DecisionTreeClassifier()
pipe= pipeline.Pipeline(steps=[("scale", preprocessing.StandardScaler()),  ("CART", cart_model)])

## PHASE 4: TRAIN
# fit all stages of the pipeline
pipe.fit(training_instances, y=class_labels)

## PHASE 5: EVALUATE
# return value is array of scores
scores = cross_val_score(pipe, training_instances, class_labels, cv=5)
# use as quality metric the average CV score
meanCvAccuracy= scores.mean()
print("Mean CV accuracy= %f" % meanCvAccuracy)

## PHASE 6: SAVE PIPELINE
# the whole pipeline in one single file
joblib.dump(pipe, PIPELINEPATH, compress = 1)

## PHASE 7: LOAD THE PIPELINE
# read the file and deserialize the pipeline
pipeline_loaded = joblib.load(PIPELINEPATH)

## PHASE 8: CLASSIFY NEW INSTANCES
# create new problem instance
vector = [2.0, -0.876536, -0.182937, 1.817382, -0.889898, -0.012989, 1.234879, 0.234543, 0.372123, -1.398765,  
          -0.876536, -0.182937, 1.817382, -0.889898, -0.012989, 1.234879, 0.234543, 0.372123, -1.398765, 0.247676,
          0.791461, -0.109876, 0.008754, -0.198765, -1.176543, 0.765432, -0.228765, 0.067854, 0.067543, 126.80]

result= pipeline_loaded.predict([vector,])
print("class label is %i" % result)

print("--- end of execution ---")

<class 'pandas.core.frame.DataFrame'>
amount of entries is 8829017
dimensions= 2
shape is (284807, 31)
axes: [RangeIndex(start=0, stop=284807, step=1), Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')]
data types of columns:
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
d