## Logistic Regression

Importing the libraries

In [56]:
import numpy as np
import pandas as pd

Importing the dataset

In [57]:
# The data file has no header row, so the column names are supplied explicitly.
# Reading with header=None keeps the first record as data instead of letting
# pandas consume it as column labels (which previously had to be recovered by
# parsing the de-duplicated header strings such as '1.1' back into integers).
COLUMN_NAMES = [
    'Sample_code_number',
    'Clump_thickness',
    'Uniformity_of_cell_size',
    'Uniformity_of_cell_shape',
    'Marginal_adhesion',
    'Single_epithelial_cell_size',
    'Bare_nuclei',
    'Bland_chromatin',
    'Normal_nucleoli',
    'Mitoses',
    'Class',
]

dataset = pd.read_csv(
    'breast-cancer-wisconsin.data',
    header=None,
    names=COLUMN_NAMES,
)

# Convert the column to nullable integers; any entry that cannot be parsed as a
# number is coerced to a missing value (<NA>) rather than raising.
dataset['Bare_nuclei'] = pd.to_numeric(dataset['Bare_nuclei'], errors='coerce').astype('Int64')

dataset

Unnamed: 0,Sample_code_number,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [58]:
# Print the per-column dtype and non-null count for the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample_code_number           699 non-null    int64
 1   Clump_thickness              699 non-null    int64
 2   Uniformity_of_cell_size      699 non-null    int64
 3   Uniformity_of_cell_shape     699 non-null    int64
 4   Marginal_adhesion            699 non-null    int64
 5   Single_epithelial_cell_size  699 non-null    int64
 6   Bare_nuclei                  683 non-null    Int64
 7   Bland_chromatin              699 non-null    int64
 8   Normal_nucleoli              699 non-null    int64
 9   Mitoses                      699 non-null    int64
 10  Class                        699 non-null    int64
dtypes: Int64(1), int64(10)
memory usage: 60.9 KB


In [59]:
from sklearn.impute import SimpleImputer

# Replace the missing Bare_nuclei entries with the column mean.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed further down - confirm this leakage is acceptable.
imputer = SimpleImputer(strategy='mean')
bare_nuclei_filled = imputer.fit_transform(dataset[['Bare_nuclei']])

# Casting the float result to int64 drops the fractional part of the imputed mean.
dataset['Bare_nuclei'] = bare_nuclei_filled.astype('int64')

# Re-check dtypes and non-null counts after imputation.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample_code_number           699 non-null    int64
 1   Clump_thickness              699 non-null    int64
 2   Uniformity_of_cell_size      699 non-null    int64
 3   Uniformity_of_cell_shape     699 non-null    int64
 4   Marginal_adhesion            699 non-null    int64
 5   Single_epithelial_cell_size  699 non-null    int64
 6   Bare_nuclei                  699 non-null    int64
 7   Bland_chromatin              699 non-null    int64
 8   Normal_nucleoli              699 non-null    int64
 9   Mitoses                      699 non-null    int64
 10  Class                        699 non-null    int64
dtypes: int64(11)
memory usage: 60.2 KB


In [60]:
# Features: every column except the first and the last one.
feature_frame = dataset.iloc[:, 1:-1]
X = feature_frame.to_numpy()

# Target: the last column.
y = dataset.iloc[:, -1].to_numpy()

Splitting the dataset into the Training set and Test set

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Training the Logistic Regression model on the Training set

In [62]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
classifier

Predicting the Test set results

In [63]:
y_pred = classifier.predict(X_test)

# Side-by-side comparison: first column is the prediction, second the true label.
print(np.column_stack((y_pred, y_test)))

[[2 2]
 [2 2]
 [4 2]
 [2 2]
 [4 4]
 [2 2]
 [4 4]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [4 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [2 4]
 [4 2]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [4 4]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [4 4]
 [2 2]
 [2 2]
 [4 4]
 [2 2]
 [2 2]]


Making the Confusion Matrix

In [64]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the test-set predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n', cm)

# Fraction of test samples whose predicted label equals the true label.
print('Accuracy Score is: ', accuracy_score(y_test, y_pred))

Confusion Martox: 
 [[82  3]
 [ 1 54]]
Accuracy Score is:  0.9714285714285714


Computing the accuracy with k-Fold Cross Validation

In [65]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training split only.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# Report mean and spread of the fold scores as percentages with two decimals.
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f'Accuracy: {mean_pct:.2f} %')
print(f'Standard Deviation: {std_pct:.2f} %')

Accuracy: 96.60 %
Standard Deviation: 2.58 %
