# Logistic Regression - Breast Cancer
### Dataset found under https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28original%29

## Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score, precision_recall_curve

## Dataset

In [3]:
df = pd.read_csv('breast_cancer.csv')

## Checking data quality

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample code number           683 non-null    int64
 1   Clump Thickness              683 non-null    int64
 2   Uniformity of Cell Size      683 non-null    int64
 3   Uniformity of Cell Shape     683 non-null    int64
 4   Marginal Adhesion            683 non-null    int64
 5   Single Epithelial Cell Size  683 non-null    int64
 6   Bare Nuclei                  683 non-null    int64
 7   Bland Chromatin              683 non-null    int64
 8   Normal Nucleoli              683 non-null    int64
 9   Mitoses                      683 non-null    int64
 10  Class                        683 non-null    int64
dtypes: int64(11)
memory usage: 58.8 KB


In [5]:
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
df.describe()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,1076720.0,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,620644.0,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,63375.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,877617.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1171795.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1238705.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


## Preparing data for logistic regression

In [7]:
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1:]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Creating and using Classifier

In [9]:
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
y_pred = log_clf.predict(X_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Getting metrics

In [10]:
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[78  1]
 [ 5 53]]


In [11]:
acc_score = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, pos_label = 2)
recall = recall_score(y_test, y_pred, pos_label = 2)
f1 = f1_score(y_test, y_pred, pos_label= 2)
print('Accuracy is {:.2f} %'.format(acc_score * 100))
print('Precision is {:.2f} %'.format(prec * 100))
print('Recall is {:.2f} %'.format(recall * 100))
print('F1-Score is is {:.2f} %'.format(f1 * 100))

Accuracy is 95.62 %
Precision is 93.98 %
Recall is 98.73 %
F1-Score is is 96.30 %


## Using k-Fold Cross Validation for accuracy

In [12]:
accuracies = cross_val_score(estimator = log_clf, X = X_train, y = y_train, cv = 10)
print('Average accuracy is {:.2f} %'.format(accuracies.mean() * 100))
print('Standard deviation is {:.2f} %'.format(accuracies.std() * 100))

Average accuracy is 96.71 %
Standard deviation is 2.13 %
