### importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### dataset

In [2]:
df = pd.read_csv('breast-cancer-wisconsin.data')
df.columns = ['Sample code number', 'Clump thickness', 'Uniformity of cell size', 'Uniformity of cell shape',
              'Marginal adhesion', 'Single epithelial cell size', 'Bare nuclei', 'Bland chromatin',
              'Normal nucleoli', 'Mitoses', 'Class']

### checking information in dataset

In [3]:
df.head()

Unnamed: 0,Sample code number,Clump thickness,Uniformity of cell size,Uniformity of cell shape,Marginal adhesion,Single epithelial cell size,Bare nuclei,Bland chromatin,Normal nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [4]:
df.describe()

Unnamed: 0,Sample code number,Clump thickness,Uniformity of cell size,Uniformity of cell shape,Marginal adhesion,Single epithelial cell size,Bland chromatin,Normal nucleoli,Mitoses,Class
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,1071807.0,4.416905,3.137536,3.210602,2.809456,3.217765,3.438395,2.869628,1.590258,2.690544
std,617532.3,2.817673,3.052575,2.972867,2.856606,2.215408,2.440056,3.055004,1.716162,0.951596
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870258.2,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238354.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample code number           698 non-null    int64 
 1   Clump thickness              698 non-null    int64 
 2   Uniformity of cell size      698 non-null    int64 
 3   Uniformity of cell shape     698 non-null    int64 
 4   Marginal adhesion            698 non-null    int64 
 5   Single epithelial cell size  698 non-null    int64 
 6   Bare nuclei                  698 non-null    object
 7   Bland chromatin              698 non-null    int64 
 8   Normal nucleoli              698 non-null    int64 
 9   Mitoses                      698 non-null    int64 
 10  Class                        698 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


### missing values in df['Bare nuclei']

In [6]:
df[:].replace('?', np.nan, inplace=True)
df['Bare nuclei'].fillna(df['Bare nuclei'].mode()[0], inplace=True)
df['Bare nuclei'] = pd.to_numeric(df['Bare nuclei'], errors='coerce')
df['Bare nuclei'] = df['Bare nuclei'].fillna(-1).astype('int64')
df.describe()

Unnamed: 0,Sample code number,Clump thickness,Uniformity of cell size,Uniformity of cell shape,Marginal adhesion,Single epithelial cell size,Bare nuclei,Bland chromatin,Normal nucleoli,Mitoses,Class
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,1071807.0,4.416905,3.137536,3.210602,2.809456,3.217765,3.489971,3.438395,2.869628,1.590258,2.690544
std,617532.3,2.817673,3.052575,2.972867,2.856606,2.215408,3.623301,2.440056,3.055004,1.716162,0.951596
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870258.2,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1238354.0,6.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample code number           698 non-null    int64
 1   Clump thickness              698 non-null    int64
 2   Uniformity of cell size      698 non-null    int64
 3   Uniformity of cell shape     698 non-null    int64
 4   Marginal adhesion            698 non-null    int64
 5   Single epithelial cell size  698 non-null    int64
 6   Bare nuclei                  698 non-null    int64
 7   Bland chromatin              698 non-null    int64
 8   Normal nucleoli              698 non-null    int64
 9   Mitoses                      698 non-null    int64
 10  Class                        698 non-null    int64
dtypes: int64(11)
memory usage: 60.1 KB


### X and y

In [8]:
X = df.iloc[:, 1:-1].values
y = df.iloc[:, -1].values

### train test splitting the data

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

### train the logistic regression model

In [10]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

### predict the result

In [11]:
y_pred = classifier.predict(X_test)

### confusion matrix

In [12]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[91  5]
 [ 2 42]]


In [13]:
print((83+53)/(2+2+83+53))

0.9714285714285714


### k-fold cross validation

In [14]:
from sklearn.model_selection import cross_val_score
acc = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print(f'Accuracy = {acc.mean()*100:.2f}%')
print(f'Standard Deviation = {acc.std()*100:.2f}%')

Accuracy = 97.14%
Standard Deviation = 1.82%
