In [46]:
# BASIC
import numpy as np
import pandas as pd

# SCIKIT
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import PCA

In [12]:
# Load the raw training set; the file is ';'-separated and the 'id' column
# is used as the row index.
raw_data = pd.read_csv('../mlcourse.ai/data/mlbootcamp5_train.csv',
                       sep=';', index_col='id')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the cell output.
raw_data.info()
raw_data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
age            70000 non-null int64
gender         70000 non-null int64
height         70000 non-null int64
weight         70000 non-null float64
ap_hi          70000 non-null int64
ap_lo          70000 non-null int64
cholesterol    70000 non-null int64
gluc           70000 non-null int64
smoke          70000 non-null int64
alco           70000 non-null int64
active         70000 non-null int64
cardio         70000 non-null int64
dtypes: float64(1), int64(11)
memory usage: 6.9 MB
None


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


**Observations:**
1. Age is in days --> change to years as 'int'
2. Gender = 1 (female) or 2 (male) --> change to 0 and 1, respectively

In [26]:
# Build the cleaned frame directly from `raw_data` in one step so the cell is
# idempotent (re-running it always yields the same `data`).
data = raw_data.assign(
    # age is stored in days; convert to whole years (truncating the fraction)
    age=(raw_data['age'] / 365.25).astype('int64'),
    # gender is coded 1/2 in the raw file; shift it to 0/1
    gender=raw_data['gender'] - 1,
)
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
age            70000 non-null int64
gender         70000 non-null int64
height         70000 non-null int64
weight         70000 non-null float64
ap_hi          70000 non-null int64
ap_lo          70000 non-null int64
cholesterol    70000 non-null int64
gluc           70000 non-null int64
smoke          70000 non-null int64
alco           70000 non-null int64
active         70000 non-null int64
cardio         70000 non-null int64
dtypes: float64(1), int64(11)
memory usage: 6.9 MB
None


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,50,1,168,62.0,110,80,1,1,0,0,1,0
1,55,0,156,85.0,140,90,3,1,0,0,1,1
2,51,0,165,64.0,130,70,3,1,0,0,0,1
3,48,1,169,82.0,150,100,1,1,0,0,1,1
4,47,0,156,56.0,100,60,1,1,0,0,0,0


In [41]:
# Report how many distinct values each of the last six columns takes,
# computing all the counts with a single nunique() call.
unique_counts = data[data.columns[-6:]].nunique()
for col, n_unique in unique_counts.items():
    print(f'{n_unique} unique values in {col}')

3 unique values in cholesterol
3 unique values in gluc
2 unique values in smoke
2 unique values in alco
2 unique values in active
2 unique values in cardio


In [32]:
def encode_features(df, columns=None):
    """One-hot encode the candidate features that have more than 2 unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is encoded and returned.
    columns : iterable of str, optional
        Candidate columns to inspect. When omitted, the last six columns of
        ``df`` are used. That default is positional, so it selects different
        columns if the frame's layout changes (e.g. after a column is removed);
        pass ``columns`` explicitly to avoid that.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every candidate column with more than 2
        unique values is replaced by indicator columns named
        ``<first four characters of the column name>_<value>``. Candidate
        columns with 2 or fewer unique values are left as they are.
    """
    candidates = df.columns[-6:] if columns is None else list(columns)

    new_df = df.copy()
    for col in candidates:
        # Binary (or constant) columns are already usable as-is.
        if df[col].nunique() > 2:
            new_df = pd.get_dummies(new_df, columns=[col], prefix=col[:4])
    return new_df

In [33]:
# Preview the first five rows of the one-hot encoded frame.
encode_features(data).head(n=5)

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,chol_1,chol_2,chol_3,gluc_1,gluc_2,gluc_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,50,1,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0
1,55,0,156,85.0,140,90,0,0,1,1,0,0,1,1,0,0
2,51,0,165,64.0,130,70,0,0,0,1,0,0,1,1,0,0
3,48,1,169,82.0,150,100,0,0,1,1,1,0,0,1,0,0
4,47,0,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0


Pop 'cardio' column from data to use as training labels

In [42]:
# Split the target column off the feature frame. Guarded so the cell can be
# re-run: a bare data.pop('cardio') raises KeyError on the second execution
# because the column is already gone.
if 'cardio' in data.columns:
    cardio = data['cardio'].copy()
    data = data.drop(columns='cardio')

In [47]:
# Hold out 20% of the rows for testing. The seed is fixed so the split (and
# everything computed from it downstream) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data, cardio, test_size=0.2, random_state=42
)

In [48]:
def kmeans(df, n_clusters, random_state=42):
    """Cluster the rows of ``df`` with scikit-learn's KMeans.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Samples to cluster, one row per sample. The values are passed to
        KMeans unchanged (no scaling is applied here).
    n_clusters : int
        Number of clusters to form.
    random_state : int or None, default 42
        Seed for the centroid initialisation. A fixed seed keeps the cluster
        label numbering stable between runs, which matters when a later cell
        selects a cluster by its label. Pass ``None`` for an unseeded run.

    Returns
    -------
    labels : numpy.ndarray
        Cluster label of each row of ``df``, in row order.
    centroids : numpy.ndarray
        Coordinates of the fitted cluster centres.
    """
    km = KMeans(n_clusters=n_clusters, random_state=random_state)
    km.fit(df)
    return km.labels_, km.cluster_centers_
    

In [50]:
# Partition the training rows into 8 k-means clusters.
train_clusters, train_centroids = kmeans(df=X_train, n_clusters=8)

In [57]:
# Show the training rows assigned to cluster label 1 (selected by position).
X_train.iloc[np.flatnonzero(train_clusters == 1)]

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
17260,58,1,169,78.0,130,9011,1,1,1,1,1
47030,50,0,156,65.0,150,9011,2,2,0,0,1
13066,52,0,165,71.0,120,8000,1,1,0,0,1
54286,43,1,174,70.0,130,8099,1,1,0,0,0
4208,55,0,168,78.0,140,8044,3,3,0,0,0
3352,57,1,186,105.0,140,10000,1,1,0,0,1
62921,58,0,165,74.0,120,8200,2,3,0,0,1
34098,49,1,169,86.0,150,10000,2,2,0,1,1
22832,39,1,179,70.0,120,8500,1,1,0,0,1
53083,56,1,176,80.0,140,8099,1,1,0,0,1
