In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder


In [2]:
cancer_data = pd.read_csv('cancer.data')

In [3]:
cancer_data.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


##  Split the data into training and test sets

### Print DataFrame information

In [4]:
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 11 columns):
1000025    698 non-null int64
5          698 non-null int64
1          698 non-null int64
1.1        698 non-null int64
1.2        698 non-null int64
2          698 non-null int64
1.3        698 non-null object
3          698 non-null int64
1.4        698 non-null int64
1.5        698 non-null int64
2.1        698 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [5]:
### Inspect missing values in the dataset

In [6]:
cancer_data.isnull().sum()

1000025    0
5          0
1          0
1.1        0
1.2        0
2          0
1.3        0
3          0
1.4        0
1.5        0
2.1        0
dtype: int64

In [7]:
cancer_data.describe()

Unnamed: 0,1000025,5,1,1.1,1.2,2,3,1.4,1.5,2.1
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,1071807.0,4.416905,3.137536,3.210602,2.809456,3.217765,3.438395,2.869628,1.590258,2.690544
std,617532.3,2.817673,3.052575,2.972867,2.856606,2.215408,2.440056,3.055004,1.716162,0.951596
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870258.2,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238354.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [8]:
cancer_data = cancer_data.replace('?',np.nan)
cancer_data.isnull().sum()

1000025     0
5           0
1           0
1.1         0
1.2         0
2           0
1.3        16
3           0
1.4         0
1.5         0
2.1         0
dtype: int64

In [9]:
cancer_data = cancer_data.fillna(cancer_data['1.3'].value_counts().index[0])

In [10]:
cancer_data.isnull().sum()

1000025    0
5          0
1          0
1.1        0
1.2        0
2          0
1.3        0
3          0
1.4        0
1.5        0
2.1        0
dtype: int64

In [11]:
le = LabelEncoder()

In [12]:
cancer_data['1.3']=le.fit_transform(cancer_data['1.3'])

In [13]:
cancer_data.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,1,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,0,3,1,1,2
4,1017122,8,10,10,8,7,1,9,7,1,4


In [14]:
cancer_data=cancer_data.values

In [15]:
X =cancer_data[:,0:10]
y =cancer_data[:,-1]

In [16]:
X_train, X_test, y_train,y_test  = train_test_split(X,
                                y,
                                test_size=0.30,
                                random_state=1)

In [17]:
print(X_train.shape, y_train.shape)

(488, 10) (488,)


In [18]:
print(X_test.shape, y_test.shape)

(210, 10) (210,)


In [19]:
scaler = MinMaxScaler(feature_range=(0 , 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

print(rescaledX_train.shape)
print(rescaledX_test.shape)

(488, 10)
(210, 10)


In [20]:
logreg = LogisticRegression()

In [21]:
logreg.fit(rescaledX_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
from sklearn.metrics import confusion_matrix

In [23]:
y_pred = logreg.predict(rescaledX_test)

In [24]:
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test))

Accuracy of logistic regression classifier:  0.9666666666666667


In [25]:
matrix = confusion_matrix(y_test,y_pred)
print(matrix)

[[141   1]
 [  6  62]]


In [26]:
logreg.summary()

AttributeError: 'LogisticRegression' object has no attribute 'summary'