In [2]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the mammographic-mass records. Column names are supplied explicitly,
# and '?' entries are read as missing values.
data = pd.read_csv(
    'mammographic_masses.data',
    names=['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity'],
    na_values='?',
)

In [4]:
# Preview the first five rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [5]:
# Inspect dtypes and non-null counts to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 6 columns):
BI-RADS     959 non-null float64
Age         956 non-null float64
Shape       930 non-null float64
Margin      913 non-null float64
Density     885 non-null float64
Severity    961 non-null int64
dtypes: float64(5), int64(1)
memory usage: 45.2 KB


In [6]:
# Keep only the fully populated records: any row with at least one missing
# value is discarded.
data = data.dropna(axis='index', how='any')

In [7]:
# Re-check the frame after dropping incomplete rows: every column should now
# report the same non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 830 entries, 0 to 960
Data columns (total 6 columns):
BI-RADS     830 non-null float64
Age         830 non-null float64
Shape       830 non-null float64
Margin      830 non-null float64
Density     830 non-null float64
Severity    830 non-null int64
dtypes: float64(5), int64(1)
memory usage: 45.4 KB


In [8]:
# Check the class balance of the target column before modelling.
data['Severity'].value_counts()

0    427
1    403
Name: Severity, dtype: int64

# Replacing the numeric category codes with descriptive names, so that they are easy to use with sklearn's one-hot encoder

In [9]:
# Translate the numeric Shape codes into descriptive labels so the one-hot
# encoded categories are readable.
# The dtype guard makes this cell safe to re-run: once the column holds
# labels, mapping it again would find no matching keys and turn every value
# into NaN.
if pd.api.types.is_numeric_dtype(data['Shape']):
    data['Shape'] = data['Shape'].map({1: 'round', 2: 'oval', 3: 'lobular', 4: 'irregular'})

In [10]:
# Translate the numeric Margin codes into descriptive labels so the one-hot
# encoded categories are readable.
# The dtype guard makes this cell safe to re-run: once the column holds
# labels, mapping it again would find no matching keys and turn every value
# into NaN.
if pd.api.types.is_numeric_dtype(data['Margin']):
    data['Margin'] = data['Margin'].map({
        1: 'circumscribed',
        2: 'microlobulated',
        3: 'obscured',
        4: 'ill-defined',
        5: 'spiculated',
    })

In [11]:
# Confirm that Shape and Margin now show labels instead of numeric codes.
data.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5.0,67.0,lobular,spiculated,3.0,1
2,5.0,58.0,irregular,spiculated,3.0,1
3,4.0,28.0,round,circumscribed,3.0,0
8,5.0,57.0,round,spiculated,3.0,1
10,5.0,76.0,round,ill-defined,3.0,1


In [12]:
# Separate the inputs from the target: X keeps every column except
# 'Severity', Y is the 'Severity' column on its own.
X = data.drop(columns='Severity')
Y = data['Severity']

In [13]:
# Preview the target values; the index still carries the original row labels.
Y.head()

0     1
2     1
3     0
8     1
10    1
Name: Severity, dtype: int64

In [14]:
# Preview the feature frame to confirm 'Severity' is no longer among the columns.
X.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density
0,5.0,67.0,lobular,spiculated,3.0
2,5.0,58.0,irregular,spiculated,3.0
3,4.0,28.0,round,circumscribed,3.0
8,5.0,57.0,round,spiculated,3.0
10,5.0,76.0,round,ill-defined,3.0


# Function to get numerical columns

In [15]:
def select_numeric_columns(frame):
    """Return the numeric feature columns ('BI-RADS', 'Age', 'Density') of ``frame``."""
    return frame[['BI-RADS', 'Age', 'Density']]


# A named function is used instead of a lambda so that the fitted pipeline
# can be pickled (a lambda cannot be pickled).
# validate=False hands the DataFrame to the function as-is, which is required
# for selecting columns by name.
getnumeric = FunctionTransformer(select_numeric_columns, validate=False)

# Function to get categorical columns

In [16]:
def select_categorical_columns(frame):
    """Return the categorical feature columns ('Shape', 'Margin') of ``frame``."""
    return frame[['Shape', 'Margin']]


# A named function is used instead of a lambda so that the fitted pipeline
# can be pickled (a lambda cannot be pickled).
# validate=False hands the DataFrame to the function as-is, which is required
# for selecting columns by name.
getcategory = FunctionTransformer(select_categorical_columns, validate=False)

# Pipeline for numerical processing; no further processing is needed, so it only selects the columns.

In [17]:
# The numeric branch applies no transformation, so its only step is the
# column selector.
numeric_pipeline = Pipeline(steps=[('select_numeric', getnumeric)])

# Pipeline for categorical processing; it includes a one-hot encoder.

In [18]:
# Categorical branch: pick the label columns, then one-hot encode them into a
# dense array. Categories not seen during fitting are ignored at transform
# time rather than raising an error.
# NOTE(review): recent scikit-learn releases rename `sparse` to
# `sparse_output` — confirm against the installed version before upgrading.
categorical_pipeline = Pipeline(steps=[
    ('select_categorical', getcategory),
    ('binarize', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

In [19]:
# Sanity check: run the categorical branch on its own and look at the
# resulting one-hot encoded array.
categorical_pipeline.fit_transform(X)

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.]])

# Using feature union to concatenate both pipeline outputs side by side.

In [20]:
# Full model: the numeric features and the one-hot encoded categorical
# features are concatenated side by side by the FeatureUnion and fed to a
# decision tree limited to depth 4.
# random_state is fixed because the tree permutes the features randomly at
# each split, so without a seed repeated runs can produce different trees
# and therefore different scores.
final_pipeline = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('numeric', numeric_pipeline),
        ('categorical', categorical_pipeline),
    ])),
    ('classifier', DecisionTreeClassifier(max_depth=4, random_state=42)),
])

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Fit the feature transformers and the classifier on the training split only.
final_pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('numeric',
                                                 Pipeline(memory=None,
                                                          steps=[('select_numeric',
                                                                  FunctionTransformer(accept_sparse=False,
                                                                                      check_inverse=True,
                                                                                      func=<function <lambda> at 0x000001D6B45D1BF8>,
                                                                                      inv_kw_args=None,
                                                                                      inverse_func=None,
                                                                                      kw_args=None,
                                      

In [23]:
# Score the fitted pipeline on the held-out test split.
final_pipeline.score(X_test,Y_test)

0.7831325301204819

In [24]:
# Score on the training split as well; a training score well above the test
# score would suggest the tree is overfitting.
final_pipeline.score(X_train,Y_train)

0.8614457831325302