# Homework

* Write your own implementation of the k-means algorithm with random centroid initialization and 2 stopping conditions: max iterations and centroid convergence (if no attribute of any centroid changes by more than some epsilon, the algorithm should stop). DONE, NOT TESTED
* Use your implementation to cluster data about cereal products with their dietary characteristics (cereals.csv, 16 attributes). 
* The dataset contains some nominal attributes (name, mfr, type). You can omit the first two. The type attribute is binary, so you can replace its values with 0 and 1. DONE
* Perform the clustering of the cereals into 3 groups using k-means algorithm. 
* Remember to preprocess the data: normalization/standardization, attribute selection. DONE
* Try to describe the obtained groups based on the obtained centroids. What do all cereals within a certain group have in common?
* Write a report containing information about preprocessing methods that you used, number of cereals within each cluster, and your conclusions about the clustering results.

**Deadline +2 weeks**


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [None]:
# Global plotting defaults: seaborn "darkgrid" theme and large, high-DPI figures.
sns.set_style('darkgrid')
plt.rc('figure', figsize=(12, 8), dpi=200)
# Convergence tolerance read by k_means: iteration stops once every centroid
# coordinate changes by less than this between two consecutive iterations.
eps = 1e-5

In [None]:
def dist_eu(a, b):
    """Return the Euclidean distance between each row of ``a`` and ``b``.

    ``a`` is a 2-D table (one point per row) and ``b`` is anything that
    broadcasts against it, e.g. a single point with one value per column.
    The squared differences are summed along axis 1, so the result holds one
    distance per row of ``a``.
    """
    delta = a - b
    squared = delta ** 2
    return np.sqrt(np.sum(squared, axis=1))

def k_means(data_x: pd.DataFrame, groups: int = 3, max_iter: int = 10,
            tol: "float | None" = None, verbose: bool = True):
    """Cluster the rows of ``data_x`` into ``groups`` clusters with k-means.

    Centroids are drawn uniformly at random inside the per-column [min, max]
    range of the data, using NumPy's global random state (call
    ``np.random.seed`` first for repeatable results).  Each iteration moves
    every centroid to the mean of the rows currently assigned to it; a
    centroid with no assigned rows keeps its previous position.  Iteration
    stops after ``max_iter`` updates, or earlier once no centroid coordinate
    changed by ``tol`` or more in an update.

    Args:
        data_x: 2-D table with one sample per row and one numeric feature
            per column.
        groups: Number of clusters; must be at least 1.
        max_iter: Maximum number of centroid updates.  With ``0`` the rows
            are simply assigned to the initial random centroids.
        tol: Convergence tolerance.  ``None`` (the default) uses the
            module-level ``eps`` value.
        verbose: When true (the default) print the initial centroids and a
            message when the run stops early because of convergence.

    Returns:
        A tuple ``(affiliation, centroids)``: ``affiliation`` is an integer
        array with the index of the nearest returned centroid for every row
        of ``data_x``; ``centroids`` is an array of shape
        ``(groups, n_features)``.

    Raises:
        ValueError: If ``groups`` is smaller than 1.
    """
    if groups < 1:
        raise ValueError(f"groups must be at least 1, got {groups}")
    threshold = eps if tol is None else tol

    x_min = np.min(data_x, axis=0)
    x_max = np.max(data_x, axis=0)
    centroids = np.random.uniform(x_min, x_max, size=(groups, data_x.shape[1]))
    if verbose:
        print(centroids)

    def assign(current):
        # Index of the nearest centroid for every row of data_x.
        return np.argmin([dist_eu(data_x, cent) for cent in current], axis=0)

    # Assign before the loop so the result is defined even when max_iter == 0.
    affiliation = assign(centroids)
    for it in range(max_iter):
        members = [affiliation == i for i in range(groups)]
        new_centroids = np.array([
            # An empty cluster has no mean; keep its centroid where it was.
            np.mean(data_x[mask], axis=0) if mask.any() else centroids[i]
            for i, mask in enumerate(members)
        ])
        converged = np.all(np.abs(centroids - new_centroids) < threshold)
        centroids = new_centroids
        # Re-assign against the updated centroids so the returned labels
        # always match the returned centroids.
        affiliation = assign(centroids)
        if converged:
            if verbose:
                print(f"finished after {it} iterations due to lack of improvement")
            break
    return affiliation, centroids


def correlated_columns(data:pd.DataFrame, threshold:float):
    """List column pairs whose absolute correlation exceeds ``threshold``.

    Every unordered pair of columns is examined once, in column order.  The
    correlation coefficient comes from ``np.corrcoef`` and its sign is
    discarded, so strongly negative correlations are reported as well.

    Returns:
        A list of ``(first_column, second_column, abs_correlation)`` tuples,
        where ``first_column`` precedes ``second_column`` in ``data.columns``.
    """
    names = list(data.columns)
    found = []
    for pos, first in enumerate(names):
        # Only look at columns to the right, so each pair is visited once.
        for second in names[pos + 1:]:
            strength = abs(np.corrcoef(data[first], data[second])[0, 1])
            if strength > threshold:
                found.append((first, second, strength))
    return found

def drop_features(data:pd.DataFrame, threshold:float):
    """Remove one column from every strongly correlated pair, in place.

    The correlated pairs are computed once, up front, with
    ``correlated_columns``.  A pair is skipped when one of its columns has
    already been dropped while handling an earlier pair.  Of the two columns,
    the one with the larger variance is dropped; on ties the second one goes.

    ``data`` is modified in place and nothing is returned.
    """
    # NOTE(review): this discards the higher-variance column of each pair;
    # confirm that is intended rather than keeping it.
    for first, second, _ in correlated_columns(data, threshold):
        if first not in data.columns or second not in data.columns:
            continue
        loser = first if data[first].var() > data[second].var() else second
        data.drop(loser, axis=1, inplace=True)

In [None]:
# Load the cereal table and discard the nominal columns that are not clustered on.
data = pd.read_csv("data/cereal.csv").drop(columns=['name', 'mfr'])
# Encode the binary `type` attribute numerically: 'C' -> 1, anything else -> 0.
data['type'] = data['type'].eq('C').astype('int64')
print(data.head())

# Standardize every column while keeping the original column labels.
scaler = StandardScaler()
X_norm = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
# Drop one column from each pair with |correlation| > 0.8 (mutates X_norm in place).
drop_features(X_norm, threshold=0.8)

# Heatmap of the remaining correlations; the mask hides the diagonal and the
# upper triangle so every pair is shown once.
corr = X_norm.corr()
upper_mask = np.triu(np.ones_like(corr), k=0)
sns.heatmap(corr, mask=upper_mask, annot=True, fmt='.3f', square=True, cmap='Blues')
plt.show()
print(X_norm.head())

In [None]:
# Fix the global NumPy seed so the random centroid initialization is repeatable.
np.random.seed(11)
affiliation, clusters = k_means(X_norm, max_iter=50)
# Attach each row's cluster index to the (unscaled) data table.
data['cluster'] = affiliation

# Report how many rows ended up in each cluster.
cluster_ids, cluster_sizes = np.unique(affiliation, return_counts=True)
for aff, count in zip(cluster_ids, cluster_sizes):
    print(f'cluster {aff}: {count: 3d} items')