#### Required Libraries

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import LabelEncoder 

In [2]:
bc = pd.read_csv("data/cancer.data", header=None)
bc.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2


#### Renaming Columns

In [3]:
bc.rename(columns = 
          {0:'ID number',
           1:'Clump Thickness', 
           2:'Cell Size Uniformity',
           3:'Cell Shape Uniformity', 
           4:'Marginal Adhesion',
           5:'Single Epithelial Cell Size',
           6:'Bare Nuclei',
           7:'Bland Chromatin',
           8:'Normal Nucleoli',
           9:'Mitoses', 
           10:'Class'
}, inplace=True)

In [4]:
bc.head(3)

Unnamed: 0,ID number,Clump Thickness,Cell Size Uniformity,Cell Shape Uniformity,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2


#### Inspecting Data

In [5]:
bc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
ID number                      699 non-null int64
Clump Thickness                699 non-null int64
Cell Size Uniformity           699 non-null int64
Cell Shape Uniformity          699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [6]:
bc.describe()

Unnamed: 0,ID number,Clump Thickness,Cell Size Uniformity,Cell Shape Uniformity,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


#### Splitting Data

In [7]:
# bc.info()

In [8]:
bc = bc.replace('?', np.nan)

In [9]:
# Iterate over each column of bc
for col in bc:
    # Check if the column is of object type
    if bc[col].dtypes == 'object':
        # Impute with the most frequent value
        bc = bc.fillna(bc[col].value_counts().index[0])

bc.isnull().sum()

ID number                      0
Clump Thickness                0
Cell Size Uniformity           0
Cell Shape Uniformity          0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder 

le =  LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in bc:
    if bc[col].dtype=='object':
    # Use LabelEncoder to do the numeric transformation
        bc[col]=le.fit_transform(bc[col])

In [11]:
bc = bc.values

In [12]:
# Segregate features and labels into separate variables
X,y = bc[:,0:10] , bc[:,10]

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler and rescale X
mmscaler = MinMaxScaler(feature_range=(0, 1))
X_transformed = mmscaler.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

#### Building Logistic Model

In [20]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
y_pred = model.predict(X_test)

# Get accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(X_test, y_test))

Accuracy of logistic regression classifier:  0.9666666666666667


In [23]:
from sklearn.metrics import confusion_matrix
# confusion matrix of the logreg model
confusion_matrix(y_pred, y_test)

array([[140,   4],
       [  3,  63]])

In [26]:
from sklearn.metrics import classification_report
label= ['Benign', 'Malignant']
print(classification_report(y_test, y_pred, target_names= label))

              precision    recall  f1-score   support

      Benign       0.97      0.98      0.98       143
   Malignant       0.95      0.94      0.95        67

    accuracy                           0.97       210
   macro avg       0.96      0.96      0.96       210
weighted avg       0.97      0.97      0.97       210



#### Overdispersion

verdispersion is the presence of greater variability (statistical dispersion) in a data set than would be expected based on a given statistical model.