In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [88]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Load and process dataset
load breast_cancer.csv, drop columns "id" and "Unnamed: 32", investigate the dataset, and divide into train and test with 80/20 ratio, map values of "diagnosis" from ("B","M") to (0,1)

In [89]:
original_data = pd.read_csv('breast_cancer.csv')
X = original_data.drop(['id', 'Unnamed: 32'], axis=1)
target_col = 'diagnosis'
X.loc[X[target_col] == 'M', 'diagnosis'] = 1
X.loc[X[target_col] == 'B', 'diagnosis'] = 0
X[target_col] = X[target_col].astype(int)

setting number of clusters

In [90]:
K = 2

We have two very strong outliers, which in future will make serious influence on algorithm and sometimes stick one cluster center and dont let to it to be changed
These are rows 212 and 461
We should delete them

In [91]:
X.drop(212, axis=0, inplace=True)
X.drop(461, axis=0, inplace=True)

In [92]:
print('CORRELATION MATRIX:')
for feature in X.columns.difference([target_col]):
    print(f'Correlation between {feature} and target column: ', X[[feature, target_col]].corr().iloc[1,0])

CORRELATION MATRIX:
Correlation between area_mean and target column:  0.7230194885685689
Correlation between area_se and target column:  0.6543415456204112
Correlation between area_worst and target column:  0.7448380011989271
Correlation between compactness_mean and target column:  0.5942507555534166
Correlation between compactness_se and target column:  0.290603028749923
Correlation between compactness_worst and target column:  0.5926000675832344
Correlation between concave points_mean and target column:  0.7775205070637204
Correlation between concave points_se and target column:  0.40507334289059854
Correlation between concave points_worst and target column:  0.7930748115655999
Correlation between concavity_mean and target column:  0.6964521916907342
Correlation between concavity_se and target column:  0.24916850970137372
Correlation between concavity_worst and target column:  0.658814461765053
Correlation between fractal_dimension_mean and target column:  -0.008297389429912686
Corre

In [93]:
y = X[target_col]
X.drop(target_col, axis=1, inplace=True)

In [94]:
print('Number of malignant diagnosis: ', y.value_counts().loc[0])
print('Number of benign diagnosis: ', y.value_counts().loc[1])

Number of malignant diagnosis:  357
Number of benign diagnosis:  210


In [95]:
X.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0
mean,14.079187,19.278783,91.631358,648.380776,0.096308,0.104091,0.087907,0.048513,0.181147,0.062823,0.397042,1.216239,2.804511,38.596116,0.007029,0.025424,0.031751,0.011767,0.0205,0.003792,16.213437,25.679894,106.864074,871.779189,0.132395,0.254144,0.271379,0.114266,0.290342,0.084013
std,3.43553,4.298475,23.664895,334.976006,0.01406,0.052719,0.078416,0.038259,0.027434,0.00706,0.241336,0.552502,1.735771,34.823434,0.002996,0.0179,0.03014,0.006152,0.0082,0.00265,4.743557,6.144992,32.934746,548.180273,0.02286,0.15739,0.208266,0.065526,0.061742,0.018048
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,0.1115,0.3602,0.757,6.802,0.001713,0.002252,0.0,0.0,0.007882,0.000895,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.695,16.17,75.1,420.05,0.08621,0.06471,0.02952,0.02031,0.1619,0.0578,0.23235,0.8324,1.604,17.85,0.005163,0.013015,0.015035,0.007631,0.015095,0.002241,13.01,21.09,84.095,514.65,0.1166,0.1466,0.11445,0.06453,0.25045,0.071465
50%,13.34,18.84,86.18,546.4,0.09586,0.09242,0.06126,0.03341,0.1792,0.06155,0.3237,1.108,2.284,24.44,0.006369,0.02042,0.02586,0.01091,0.01873,0.003136,14.96,25.41,97.65,684.6,0.1313,0.2119,0.2264,0.09975,0.2823,0.08006
75%,15.765,21.79,103.75,781.8,0.1052,0.13035,0.12825,0.07352,0.19565,0.066135,0.47595,1.4735,3.321,44.935,0.008135,0.032295,0.041615,0.01471,0.023425,0.004537,18.655,29.69,125.05,1060.0,0.14605,0.3381,0.3819,0.16135,0.31815,0.092085
max,27.22,39.28,182.1,2250.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,1.509,4.885,11.07,233.0,0.03113,0.1354,0.396,0.05279,0.07895,0.02984,33.13,49.54,229.3,3432.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [96]:
X.mean()

radius_mean                 14.079187
texture_mean                19.278783
perimeter_mean              91.631358
area_mean                  648.380776
smoothness_mean              0.096308
compactness_mean             0.104091
concavity_mean               0.087907
concave points_mean          0.048513
symmetry_mean                0.181147
fractal_dimension_mean       0.062823
radius_se                    0.397042
texture_se                   1.216239
perimeter_se                 2.804511
area_se                     38.596116
smoothness_se                0.007029
compactness_se               0.025424
concavity_se                 0.031751
concave points_se            0.011767
symmetry_se                  0.020500
fractal_dimension_se         0.003792
radius_worst                16.213437
texture_worst               25.679894
perimeter_worst            106.864074
area_worst                 871.779189
smoothness_worst             0.132395
compactness_worst            0.254144
concavity_wo

finding feature which should be rescaled

In [97]:
X_max = X.max()
features_to_rescale = X_max[np.abs(X.max()) > 2].index.tolist()
X[features_to_rescale] = StandardScaler().fit_transform(X[features_to_rescale])


## Implementing KMeans

In [98]:
class KMeans(object):
    def __init__(self, K, metric='L2', max_iter=200, eps=1e-4, center_init='random'):
        self.K = K
        self.max_iter = max_iter
        self.eps = eps
        self.centroids = np.array([])
        self.metric = metric.lower()
        self.center_init = center_init.lower()

        """
        if metric is 'L2' let self.dist be a function that computes euclidian distance between x and y vectors,
        if metric is 'L1' let self.dist be a function that computes manhattan distance between x and y vectors,
        otherwise raise not implemented error
        """
        if self.metric == 'l2':
            self.dist = self.l2_dist
        elif self.metric == 'l1':
            self.dist = self.l1_dist
        else:
            raise NotImplementedError

    def __str__(self):
        return f'KMeans object: metric={self.metric}, center_init={self.center_init}, K={self.K}, max_iter={self.max_iter}, eps={self.eps}'

    def distortion(self, X, r):
        """
        param X: numpy array of shape (M,N)
        param r: numpy array of shape (M), shows to which cluster each row of X belongs
        return: distortion value of the dataset
        """
        sum_ = 0
        for k in range(self.K):
            mask = r[:, k] == 1
            X_k = X[mask]
            sum_ += np.sum(self.dist(X_k, self.centroids[k]))
        print('distortion: ', sum_)
        return sum_

    def init_centroids(self, X):
        """
        :param X: numpy array of shape (M,N)
        """
        """ 
        If centers_init is 'random' initialize self.centroids with random K items from X,
        if it is 'kmeans++' initialize centroids according to the algorithm in 
        http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf page 3,
        otherwise raise not implemented error .
        """
        if self.center_init.lower() == 'random':
            self.centroids = self.random_init(X)
        elif self.center_init.lower() == 'kmeans++':
            self.centroids = self.kmeans_plus_plus_init(X)
        else:
            raise NotImplementedError

    def fit(self, X):
        """
        :param X: numpy array of shape (M,N)
        """
        """ 
        1. Initialize cluster centers using self.init_centroids method
        2. Implement KMeans algorithm and  terminate it when either self.max_iter iterations are performed,
        or the biggest change in cluster centers is smaller than selfk means formula.eps

        The final cluster centers should be saved in self.centroids
        """
        step = 0
        self.init_centroids(X)
        r = self.recalculate_r(X)
        curr_distortion = self.distortion(X, r)

        while step <= self.max_iter:
            r = self.recalculate_r(X)
            self.recalculate_centroids(X, r)

            prev_distortion = curr_distortion
            curr_distortion = self.distortion(X, r)
            if np.abs(prev_distortion - curr_distortion) <= self.eps:
                print(f'Required precision achieved on {step}-th step')
                break

            step += 1
        else:
            print('Maximum iterations run out!')

    def recalculate_centroids(self, X, r):
        for k in range(self.K):
            mask = r[:, k] == 1
            numerator = X[mask].sum(axis=0)
            denominator = r[:, k].sum()
            self.centroids[k] = numerator / denominator

    def recalculate_r(self, X):
        num_rows, num_columns = X.shape
        r = np.zeros(shape=(num_rows, self.K), dtype=int)
        indices = self.find_closest_distances(X, self.centroids)[:, 1].astype('int')
        for i in range(len(indices)):
            r[i, indices[i]] = 1
        return r

    def predict(self, X):
        """
        :param X: numpy array of shape (M,N)
        :return: numpy array of shape (M,)
        """
        """
        using  self.centroids predict to which cluster each datapoint of X belongs, values in returned array
        are integers(id of the cluster). 
        """
        return self.find_closest_distances(X, self.centroids)[:, 1].astype('int')

    def random_init(self, X):
        # for each feature define its boundaries, i.e. minimum and maximum values
        min_boundary = X.min(axis=0)
        max_boundary = X.max(axis=0)

        # return K random vectors of size X.shape[1]
        centroids = np.random.uniform(low=min_boundary, high=max_boundary, size=(self.K, min_boundary.shape[0]))
        return centroids

    def kmeans_plus_plus_init(self, X):
        num_rows, num_columns = X.shape
        # step 1a. Take one center c1, chosen uniformly at random from X
        centroids = np.array(X[np.random.randint(num_rows)])
        centroids = centroids.reshape(-1, len(centroids))

        # step2a.  Take a new center c[i], choosing x ∈ X with probability D(x)**2/sum(D(x)**2)
        for i in range(self.K - 1):
            distances = self.find_closest_distances(X, centroids)[:, 0]
            probabilities = self.get_probabilities(distances)
            max_proba_index = np.argwhere(probabilities == np.amax(probabilities))[0][0]

            # reshape 1d to 2d for appending
            new_centroid = X[max_proba_index].reshape(-1, len(X[max_proba_index]))
            centroids = np.append(centroids, new_centroid, axis=0)
        return centroids

    def get_probabilities(self, distances):
        squared = distances ** 2
        sum_ = np.sum(squared)
        return squared / sum_

    def find_closest_distances(self, X, centroids):
        '''
        :param X:
        :param centroids:
        :return: an array where i-th row is associated with i-th row in X
                 and has two elements: closest distance to centroid and index of that centroid
        '''

        num_rows = X.shape[0]
        closest_distances = np.zeros(shape=(num_rows, 2))

        for i in range(num_rows):
            # array of distances between current point and centroids
            distances = self.dist(centroids, X[i])
            # index of min element in distances assigned to indices array
            min_distance = np.amin(distances)
            closest_distances[i] = min_distance, np.argwhere(distances == min_distance)
        return closest_distances

    def l2_dist(self, X, Y):
        return np.sqrt(np.sum((X - Y) ** 2, axis=1))

    def l1_dist(self, X, Y):
        return np.sum(np.abs(X - Y), axis=1)
    
    

## Cluster the dataset with kmeans, model and predict malignancy of tumors in the test set entries
## 1. Perform clustering using the following hyperparameter pairs
1. metric='L1', center_init='random'
2. metric='L1', center_init='kmeans++'
3. metric='L2', center_init='random'
4. metric='L2', center_init='kmeans++'

## 2. Predict malignancy of tumors in the test set entries using all 4 models trained above, compare their performances.


In [99]:
clf1 = KMeans(K=2, metric='L1', center_init='random')
clf2 = KMeans(K=2, metric='L1', center_init='kmeans++')
clf3 = KMeans(K=2, metric='L2', center_init='random')
clf4 = KMeans(K=2, metric='L2', center_init='kmeans++')

In [100]:
print(clf1)
clf1.fit(X.values)
clusters = clf1.predict(X.values)

labels = np.zeros_like(clusters)
for i in range(2):
    mask = (clusters == i)
    labels[mask] = mode(y[mask])[0]
print(f'accuracy_score: ', accuracy_score(y, labels))

KMeans object: metric=l1, center_init=random, K=2, max_iter=200, eps=0.0001
distortion:  12979.37126814362
distortion:  5071.958889077876
distortion:  4114.00841895424
distortion:  3718.8185222248403
distortion:  3644.6172601562207
distortion:  3615.0103298062722
distortion:  3600.471961444086
distortion:  3596.1051262594005
distortion:  3596.1051262594005
Required precision achieved on 7-th step
accuracy_score:  0.8589065255731922


In [101]:
print(clf2)
clf2.fit(X.values)
clusters = clf2.predict(X.values)

labels = np.zeros_like(clusters)
for i in range(2):
    mask = (clusters == i)
    labels[mask] = mode(y[mask])[0]
print(f'accuracy score: ', accuracy_score(y, labels))

KMeans object: metric=l1, center_init=kmeans++, K=2, max_iter=200, eps=0.0001
distortion:  6034.62366609218
distortion:  4634.597479175499
distortion:  3932.464147812023
distortion:  3693.5453965169745
distortion:  3633.6072439600593
distortion:  3612.413435569223
distortion:  3598.13723774101
distortion:  3596.1051262594005
distortion:  3596.1051262594005
Required precision achieved on 7-th step
accuracy score:  0.8589065255731922


In [102]:
print(clf3)
clf3.fit(X.values)
clusters = clf3.predict(X.values)

labels = np.zeros_like(clusters)
for i in range(2):
    mask = (clusters == i)
    labels[mask] = mode(y[mask])[0]
print(f'accuracy score: ', accuracy_score(y, labels))

KMeans object: metric=l2, center_init=random, K=2, max_iter=200, eps=0.0001
distortion:  5578.647412985363
distortion:  1612.8061125461174
distortion:  1512.2280524348457
distortion:  1376.4038364461246
distortion:  1266.0673805249844
distortion:  1216.8786817007417
distortion:  1205.6770393131221
distortion:  1204.1622571071634
distortion:  1204.7129275867403
distortion:  1205.0710085939786
distortion:  1205.6852991491523
distortion:  1205.6852991491523
Required precision achieved on 10-th step
accuracy score:  0.8659611992945326


In [103]:
print(clf4)
clf4.fit(X.values)
clusters = clf4.predict(X.values)

labels = np.zeros_like(clusters)
for i in range(2):
    mask = (clusters == i)
    labels[mask] = mode(y[mask])[0]
print(f'accuracy score: ', accuracy_score(y, labels))

KMeans object: metric=l2, center_init=kmeans++, K=2, max_iter=200, eps=0.0001
distortion:  1818.38232508054
distortion:  1499.4790851885618
distortion:  1301.4349756481806
distortion:  1232.4626658569848
distortion:  1212.8598854927616
distortion:  1209.2607644235163
distortion:  1207.112040091141
distortion:  1207.112040091141
Required precision achieved on 6-th step
accuracy score:  0.8624338624338624


## Fit your implementation of Logistic Regression on the dataset, predict on test set and compare the results with kmeans approach

## Analyze the coefficients of fitted logistic regression model, drop 2 most unimportant features and train again Logistic regression and Kmeans with best metric, center_init hyperparameters, evaluate and compare results

## Analyze the coefficients of fitted initial logistic regression model(using all features), select two most important features and train again Logistic regression and Kmeans with best metric, center_init hyperparameters, evaluate and compare results, make the following plot using the test set:

datapoints with cluster centers and decision boundary, color the datapoints according to Kmeans predictions
color the datapoints on which predictions of logistic regression and Kmeans disagree with separate color


## Compare performance of best Kmeans model with the performance of Kmeans in sklearn library, using the same hyperparameters.