In [2]:
import numpy
from sklearn import datasets, utils
from sklearn.model_selection import cross_val_score
import pandas.io
from sklearn import tree, pipeline, preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.externals import joblib


#
# DECLARATION PART
#
# file the fitted pipeline is written to with joblib.dump and read back from with joblib.load
PIPELINEPATH= "ser_pipeline.pickle"
# CSV file the dataset is loaded from; the program reads it with header=None (no header row)
DATASETPATH= "lymphography.data"



# if there is no header row with the column/attribute names, use the constant None for row_with_column_names
def readCsvToDataFrame(path, row_with_column_names):
    """Read the CSV source at ``path`` into a pandas DataFrame.

    Args:
        path: file path (or anything else ``pandas.read_csv`` accepts) of the
            CSV data.
        row_with_column_names: passed through unchanged as the ``header``
            argument of ``pandas.read_csv``; give the row number holding the
            column names, or ``None`` when the file has no header row.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pandas.read_csv(path, header=row_with_column_names)



def show_df_info(dataframe):
    """Print a structural summary of ``dataframe`` to standard output.

    The report lists, in this order: the object's type, its ``size``, its
    ``ndim``, its ``shape``, its ``axes``, the per-column ``dtypes`` and the
    ``columns`` index. Nothing is returned.
    """
    report_lines = [
        str(type(dataframe)),
        "amount of entries is {!s}".format(dataframe.size),
        "dimensions= {:d}".format(dataframe.ndim),
        "shape is {!s}".format(dataframe.shape),
        "axes: {!s}".format(dataframe.axes),
        "data types of columns:",
        str(dataframe.dtypes),
        "features: {!s}".format(dataframe.columns),
    ]
    for report_line in report_lines:
        print(report_line)



def sliceDataFrame(df, excluded_labels=(1, 4)):
    """Remove unwanted classes and split the frame into features and labels.

    The class label is looked up in the column labelled ``0``; the returned
    pieces are cut by position, so that column is expected to also be the
    first column of ``df``. The input frame itself is not modified.

    Args:
        df: DataFrame whose first column (labelled ``0``) holds the class
            label and whose remaining columns hold the features.
        excluded_labels: iterable of class labels whose rows are removed.
            The default ``(1, 4)`` removes the 2 "normal find" instances
            (label 1) and the 4 "fibrosis" instances (label 4).

    Returns:
        A tuple ``(features, labels)``: ``features`` is a DataFrame with every
        column except the first, ``labels`` is a pandas Series with the first
        column, both restricted to the rows that were kept.
    """
    unwanted = df[0].isin(list(excluded_labels))
    # drop by index label, exactly like the two successive drops it replaces
    kept = df.drop(df[unwanted].index)
    # iloc arguments: range of rows, range of columns
    return kept.iloc[:, 1:], kept.iloc[:, 0]



#
# PROGRAM BODY
#

## PHASE 1: LOAD DATASET
# the data file is read without a header row, so the columns get integer labels
dataset= readCsvToDataFrame(DATASETPATH, None)
show_df_info(dataset)
print(dataset.head(5))

## PHASE 2: SLICE DATASET
# split into the feature columns and the class label column
training_instances, class_labels= sliceDataFrame(dataset)
show_df_info(training_instances)
# preview the data
print(training_instances.head(5))
print()
print(class_labels.head(5))

## PHASE 3: CREATE PIPELINE
# stages: keep the 8 best features by chi2 score -> standardize -> decision tree
cart_model= tree.DecisionTreeClassifier()
pipe= pipeline.Pipeline(steps=[
    ("feature_selection", SelectKBest(chi2, k=8)),
    ("scale", preprocessing.StandardScaler()),
    ("CART", cart_model),
])

## PHASE 4: TRAIN
# fit all stages of the pipeline; this fitted object is the one saved in phase 6
pipe.fit(training_instances, y=class_labels)

## PHASE 5: EVALUATE
# return value is array of scores, one per fold
scores = cross_val_score(pipe, training_instances, class_labels, cv=5)
# use as quality metric the average CV score
meanCvAccuracy= scores.mean()
print("Mean CV accuracy= %f" % meanCvAccuracy)

## PHASE 6: SAVE PIPELINE
# the whole pipeline in one single file
joblib.dump(pipe, PIPELINEPATH, compress = 1)

## PHASE 7: LOAD THE PIPELINE
# read the file and deserialize the pipeline
pipeline_loaded = joblib.load(PIPELINEPATH)

## PHASE 8: CLASSIFY NEW INSTANCES
# create new random problem instance: 18 integer feature values drawn from 0..7
vector= numpy.random.randint(0, 8, size=18)
print(vector)
# predict() expects a 2-D input (one row per instance) and returns one label per row
result= pipeline_loaded.predict([vector,])
# format the single predicted label itself; passing the whole 1-element array
# to %i relies on an implicit array-to-scalar conversion that NumPy deprecated
print("class label is %i" % result[0])

print("--- end of execution ---")

<class 'pandas.core.frame.DataFrame'>
amount of entries is 2812
dimensions= 2
shape is (148, 19)
axes: [RangeIndex(start=0, stop=148, step=1), Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], dtype='int64')]
data types of columns:
0     int64
1     int64
2     int64
3     int64
4     int64
5     int64
6     int64
7     int64
8     int64
9     int64
10    int64
11    int64
12    int64
13    int64
14    int64
15    int64
16    int64
17    int64
18    int64
dtype: object
features: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], dtype='int64')
   0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18
0   3   4   2   1   1   1   1   1   2   1   2   2   2   4   8   1   1   2   2
1   2   3   2   1   1   2   2   1   2   1   3   3   2   3   4   2   2   2   2
2   3   3   2   2   2   2   2   2   2   1   4   3   3   4   8   3   2   2   7
3   3   3   1   1   1   1   2   1   2   1   3   3   4   4   4   3   1   2   6
4  