In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.io import arff
from scipy.spatial.distance import cdist
from time import time
import itertools

from matplotlib.colors import ListedColormap
import matplotlib.cm as cm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score, accuracy_score

from sklearn.datasets import load_iris, load_digits, load_breast_cancer

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import Isomap

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline

In [None]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable plotly's offline notebook mode so that `iplot` can embed figures inline.
init_notebook_mode(connected=True)

# Hex colour palette. NOTE(review): not referenced by any cell visible in this notebook.
colors = ['#0D76BF', '#00cc96', '#EF553B']

In [None]:
def get_pd_df(data, label='label'):
    """Convert a dataset object into pandas frames.

    Parameters
    ----------
    data : object
        Must expose ``data`` (feature matrix), ``target`` (target values)
        and ``feature_names`` (one name per feature column).
    label : str
        Column name given to the target.

    Returns
    -------
    tuple
        ``(combined, features, target)`` where ``combined`` is the feature
        frame with the target column appended on the right.
    """
    features = pd.DataFrame(data=data.data, columns=data.feature_names)
    target = pd.DataFrame(data=data.target, columns=[label])
    combined = pd.concat([features, target], axis=1)
    return combined, features, target

def plot_decn_bdry(X, y, model_class, **model_params):
    """Plot the decision regions of a classifier on the first two principal components.

    The features are standardised, the model is trained on all standardised
    features, and a regular mesh in the 2-D PCA plane is mapped back to the
    full feature space (``pca.inverse_transform``) to obtain a prediction for
    every mesh point.

    Parameters
    ----------
    X : array-like
        Feature matrix accepted by ``StandardScaler.fit_transform``.
    y : array-like
        Class labels; also used to colour the scatter points.
    model_class : callable
        Estimator class; instantiated as ``model_class(**model_params)`` and
        required to provide ``fit`` and ``predict``.
    **model_params
        Keyword arguments forwarded to ``model_class``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, with the contour and scatter drawn on the
        current axes.
    """
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    pca = PCA(n_components=2)
    X_t = pca.fit_transform(X)

    model = model_class(**model_params)
    model.fit(X, y)

    # Step size of the mesh
    h = .02

    # Mesh covering the projected data with a 0.5 margin on every side
    x_min, x_max = X_t[:, 0].min() - .5, X_t[:, 0].max() + .5
    y_min, y_max = X_t[:, 1].min() - .5, X_t[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Flatten the mesh to the 2-D (n_points, 2) layout that scikit-learn
    # transformers are documented to accept before mapping it back to feature
    # space; C-order ravel keeps the points aligned with xx.shape.
    mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
    predict_data = pca.inverse_transform(mesh_points)
    Z = model.predict(predict_data).reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X_t[:, 0], X_t[:, 1], c=y, cmap=plt.cm.Spectral)
    return plt

def get_freq_plot(df, col):
    """Draw a bar plot of how often each distinct value of ``df[col]`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarise.
    col : str
        Name of the column whose value frequencies are plotted.
    """
    counts = df[col].value_counts()
    # Build the plotting frame explicitly: the column names produced by
    # value_counts().reset_index() differ between pandas versions, so renaming
    # 'index' / col is not reliable.
    freq_data = pd.DataFrame({'value': counts.index, 'count': counts.values})
    sns.barplot(x="value", y="count", data=freq_data)
    
def set_split(df, label):
    """Split a frame into train/test features and targets (70/30, seed 42).

    Every column other than ``label`` is treated as a feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_names = [name for name in df.columns if name != label]
    parts = train_test_split(
        df[feature_names], df[label], test_size=0.3, random_state=42
    )
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test

def train_test_score(X_train, y_train, X_test, y_test, clf):
    """Fit ``clf`` on the training data and score it on both splits.

    ``clf`` must provide ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` as returned by ``clf.score``.
    """
    clf.fit(X_train, y_train)
    # Score the training split first, then the held-out split.
    splits = ((X_train, y_train), (X_test, y_test))
    return tuple(clf.score(features, target) for features, target in splits)

In [None]:
# Dataset option: scikit-learn's breast-cancer data with target column 'diagnosis'.
# NOTE(review): the following cell rebinds `df`, `label`, `cols_X`, `X` and `y`
# to the wine data, so when both cells run the values set here are replaced.
data = load_breast_cancer()
label = 'diagnosis'
cols_X = data.feature_names
print(list(data.target_names))
# get_pd_df returns (combined frame, feature frame, target frame);
# `y` is therefore a one-column DataFrame here, not a Series.
df, X, y = get_pd_df(data, label=label)
print(df.shape)

In [None]:
# Wine data: the red and white CSV files are stacked into one frame and the
# classification target is whether a row came from the red file.
label = 'is_red_wine'

df_red = pd.read_csv('wine-quality/winequality-red.csv', sep=';').assign(is_red_wine=1)
df_white = pd.read_csv('wine-quality/winequality-white.csv', sep=';').assign(is_red_wine=0)
df = pd.concat([df_red, df_white], axis=0, ignore_index=True)
print(df.shape)
print(df.head())

# Features are every column except the target.
cols_X = df.columns.drop(label).tolist()
X = df[cols_X]
y = df[label]

## 1. Clustering 

In [None]:
def plot_clusters(range_n_clusters, estimator, data=None):
    """Fit a clusterer for each cluster count and draw one silhouette plot per fit.

    Parameters
    ----------
    range_n_clusters : iterable of int
        Cluster counts to evaluate.
    estimator : class
        Clusterer class, instantiated as ``estimator(n_clusters, random_state=10)``.
        It must provide ``fit_predict`` and, once fitted, either ``inertia_``
        (as KMeans does) or ``means_`` (as GaussianMixture does).
    data : array-like, optional
        Samples to cluster. When omitted, the notebook-level ``X`` is used,
        which is what the original implementation always did.

    Returns
    -------
    tuple of list
        ``(ssd, sc)``: for every cluster count, the sum of squared distances
        of the samples to their closest centre/mean, and the average
        silhouette score.
    """
    points = X if data is None else data
    ssd, sc = [], []
    for n_clusters in range_n_clusters:
        # One figure with a single axes per cluster count
        fig, ax1 = plt.subplots(1, 1)
        fig.set_size_inches(10, 5)

        # Silhouette coefficients range over [-1, 1]; the axis is clipped to
        # [-0.1, 1] as in the original plot.
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters + 1) * 10 inserts blank space between the silhouette
        # bands of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(points) + (n_clusters + 1) * 10])

        # Fixed seed of 10 for reproducibility.
        clusterer = estimator(n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(points)

        if hasattr(clusterer, 'inertia_'):
            ssd_n = clusterer.inertia_
        else:
            # Same quantity as KMeans' inertia_: sum of squared distances to
            # the nearest mean, so values are comparable across estimators.
            ssd_n = np.sum(
                np.min(cdist(points, clusterer.means_, 'sqeuclidean'), axis=1)
            )
        ssd.append(ssd_n)

        # Per-sample silhouette values; their mean is the average silhouette
        # score, so the pairwise distances are only computed once.
        sample_silhouette_values = silhouette_samples(points, cluster_labels)
        silhouette_avg = np.mean(sample_silhouette_values)
        sc.append(silhouette_avg)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        y_lower = 10
        for i in range(n_clusters):
            # Silhouette values of the samples assigned to cluster i, sorted
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the band with its cluster number at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 before the next cluster's band
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # Vertical line marking the average silhouette score
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Name the estimator that was actually used instead of assuming KMeans
        fig.suptitle(("Silhouette analysis for %s clustering on sample data "
                      "with n_clusters = %d"
                      % (type(clusterer).__name__, n_clusters)),
                     fontsize=14, fontweight='bold')

    return ssd, sc

In [None]:
# pipe = make_pipeline(StandardScaler(), KMeans(n_clusters=4))
# X_c = pipe.fit_predict(X)

In [None]:
# Cluster counts to evaluate: 2..9 one at a time, then 10..35 in steps of 5.
range_n_clusters = list(range(2, 10)) + list(range(10, 40, 5))
# Draws one silhouette figure per count; `ssd` and `sc` hold one value per count.
ssd, sc = plot_clusters(range_n_clusters, GaussianMixture)

In [None]:
# Elbow plot: the dispersion values returned by plot_clusters against the
# number of clusters.
plt.plot(range_n_clusters, ssd, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Average silhouette score returned by plot_clusters against the number of
# clusters. The title previously repeated the elbow-plot title by mistake.
plt.plot(range_n_clusters, sc, 'bx-')
plt.xlabel('k')
plt.ylabel('SC')
plt.title('Silhouette Score For Each k')
plt.show()

In [None]:
# Compare GaussianMixture covariance structures over the same cluster counts:
# left panel = sum of squared distances to the nearest component mean,
# right panel = average silhouette score.
cv_types = ['spherical', 'tied', 'diag', 'full']
n_components_range = range_n_clusters

fig, (ax_sse, ax_sc) = plt.subplots(1, 2, figsize=(12, 4))

for cv_type in cv_types:
    sc, ssd = [], []
    for n_components in n_components_range:
        # Fixed seed so the curves are reproducible between runs
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              random_state=10)
        cluster_labels = gmm.fit_predict(X)
        sc.append(silhouette_score(X, cluster_labels))
        # Sum of squared distances, matching the axis label below
        ssd.append(np.sum(np.min(cdist(X, gmm.means_, 'sqeuclidean'), axis=1)))

    ax_sc.plot(n_components_range, sc, 'x-', label=cv_type)
    ax_sse.plot(n_components_range, ssd, 'x-', label=cv_type)

# Axis decoration is loop-invariant, so it is applied once after plotting.
ax_sc.legend(loc='upper right')
ax_sc.set_xlabel('number of clusters')
ax_sc.set_ylabel('Silhouette Score')

ax_sse.legend(loc='upper right')
ax_sse.set_xlabel('number of clusters')
ax_sse.set_ylabel('Sum of Squared Error')

fig.tight_layout()
plt.show()

In [None]:
# Reduce down to 2 dimensions and visualize between labels
n_clusters = 3
clusterer = KMeans(n_clusters=n_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X)

In [None]:
# Project the standardised features onto their first two principal components.
# X_t was previously undefined at this point on a fresh kernel (NameError).
X_t = make_pipeline(StandardScaler(), PCA(n_components=2)).fit_transform(X)
pd.DataFrame(X_t, columns=['x', 'y']).shape

In [None]:
def visualize_reduced_to_2d(pipeline, X, y):
    """Fit a 2-component reduction pipeline and scatter-plot the result.

    Parameters
    ----------
    pipeline : estimator
        Object whose ``fit_transform(X)`` returns an array with two columns.
    X : array-like
        Feature matrix passed to ``pipeline.fit_transform``.
    y : array-like
        One value per sample, used to colour the points on a green (0) to
        red (1) colour scale.

    The figure is rendered inline with ``iplot``; nothing is returned.
    """
    X_t = np.asarray(pipeline.fit_transform(X))
    # Take the target by position: this avoids index misalignment when `y`
    # does not carry a default RangeIndex, and removes the dependency on the
    # notebook-level `label` matching the name of `y`.
    target = np.asarray(y).ravel()

    trace = go.Scattergl(
        x=X_t[:, 0],
        y=X_t[:, 1],
        mode='markers',
        marker=dict(
            color=target,
            colorscale=[[0, 'green'], [1, 'red']],
            showscale=True,
            opacity=0.8
        )
    )

    # Plot and embed in ipython notebook!
    iplot([trace], filename='basic-scatter')

In [None]:
# Time the 2-D PCA view: standardise, project onto 2 components, plot.
t0 =time()
pca_pipe = make_pipeline(StandardScaler(), PCA(n_components=2))
visualize_reduced_to_2d(pca_pipe, X, y)
# Elapsed wall-clock seconds for fitting, transforming and plotting.
print(time() - t0)

In [None]:
# Rebuild the 2-D PCA projection here: X_t_y_df only ever existed as a local
# variable inside visualize_reduced_to_2d, so this cell raised NameError on a
# fresh kernel.
X_t = make_pipeline(StandardScaler(), PCA(n_components=2)).fit_transform(X)
X_t_y_df = pd.DataFrame(X_t, columns=['x', 'y'])
X_t_y_df[label] = np.asarray(y).ravel()

# Colour = KMeans cluster, marker symbol = true label.
trace = go.Scattergl(
    x=X_t_y_df['x'],
    y=X_t_y_df['y'],
    mode='markers',
    marker=dict(
        color=cluster_labels,
        symbol=X_t_y_df[label],
        colorscale='Jet',
        showscale=True,
        opacity=0.8
    )
)

data = [trace]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

In [None]:
# Append each row's cluster id to the data frame as a 'cluster' column.
df_cluster = pd.Series(cluster_labels, name='cluster').to_frame()
df_w_cluster = pd.concat((df, df_cluster), axis=1)
df_w_cluster.shape

In [None]:
# Standardised features, side by side with the target and the cluster ids.
scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=cols_X)
df_std_w_cluster = pd.concat((X_std, y, df_cluster), axis=1)

In [None]:
# Per-cluster mean and standard deviation of every standardised column.
df_plot = df_std_w_cluster
df_plot_mean = df_plot.groupby(['cluster']).mean()
df_plot_std = df_plot.groupby(['cluster']).std()
# One-standard-deviation band around each cluster mean. The original referenced
# an undefined `df_w_cluster_std` and offset the raw rows instead of the means.
df_plot_mean_plus_std = df_plot_mean + df_plot_std
df_plot_mean_minus_std = df_plot_mean - df_plot_std

In [None]:
def _cluster_mean_trace(cluster_id):
    """Marker trace of one cluster's mean value for every column of df_plot_mean."""
    return go.Scatter(
        x=df_plot_mean.columns,
        y=df_plot_mean.iloc[cluster_id, :],
        mode='markers',
        name='cluster {}'.format(cluster_id),
        marker=dict(opacity=0.8),
    )


# One trace per cluster, in cluster order.
traces = list(map(_cluster_mean_trace, range(n_clusters)))
data = traces

iplot(data, filename='line-mode')

In [None]:
def _cluster_box(cluster_id):
    """Box-trace spec holding every column's values for one cluster.

    The values are flattened column by column (order='F') and each value is
    paired with its column name, so plotly draws one box per column.
    """
    members = df_plot[df_plot.cluster == cluster_id]
    return {
        'y': members.values.flatten(order='F'),
        'x': np.repeat(df_plot.columns, members.shape[0]),
        'name': 'cluster {}'.format(cluster_id),
        'marker': {'opacity': .9},
        'boxmean': True,
        'boxpoints': False,
        'orientation': 'v',
        "type": "box",
    }


data = [_cluster_box(cluster_id) for cluster_id in range(n_clusters)]
layout = dict(
    yaxis=dict(title='features', zeroline=False, range=[-2, 5]),
    boxmode='group',
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

## 2. Dimensionality Reduction

In [None]:
def recreate_data_w_dim_reducer(X, estimator, n_components):
    """Reduce ``X`` to ``n_components`` dimensions and reconstruct it.

    ``X`` is standardised, reduced with ``estimator(n_components=...)``,
    mapped back with the reducer's ``inverse_transform`` and finally
    un-standardised. The time taken to fit and transform is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features; its column names are reused for the reconstruction.
    estimator : class
        Reducer class providing ``fit_transform``, ``inverse_transform`` and
        a fitted ``components_`` attribute.
    n_components : int
        Number of dimensions to keep.

    Returns
    -------
    dict
        ``{'df': reconstructed DataFrame, 'components': reducer.components_}``.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(X)

    started = time()
    reducer = estimator(n_components=n_components)
    reduced = reducer.fit_transform(scaled)
    print("done in %0.3fs" % (time() - started))

    # Undo the reduction first, then the standardisation.
    restored = scaler.inverse_transform(reducer.inverse_transform(reduced))

    return {
        'df': pd.DataFrame(restored, columns=X.columns),
        'components': reducer.components_,
    }

### 2.1. PCA

In [None]:
# Shape of the feature frame before any reduction.
X.shape

In [None]:
# Reduce X to 2 principal components and map it back to feature space;
# only the reconstructed frame (the 'df' entry of the result) is kept.
X_new = recreate_data_w_dim_reducer(X, PCA, 2)['df']

In [None]:
# Shape of the reconstructed frame, for comparison with X.shape.
X_new.shape

In [None]:
# Box plot of every standardised original feature.
scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=cols_X)
feature_names = X_std.columns.tolist()
data = [go.Box(y=X_std[name], name=name) for name in feature_names]
iplot(data)

In [None]:
# Box plot of every feature of the reconstructed data after standardising it.
scaler = StandardScaler().fit(X_new)
X_new_std = pd.DataFrame(scaler.transform(X_new), columns=cols_X)
reconstructed_names = X_new.columns.tolist()
data = [go.Box(y=X_new_std[name], name=name) for name in reconstructed_names]
iplot(data)

In [None]:
# Grouped box plots comparing, feature by feature, the standardised original
# data with the standardised PCA reconstruction.
# NOTE(review): the original cell iterated over undefined `df1` / `df2` and
# used an undefined `i`; the two frames built in the preceding cells are
# assumed to be the intended inputs. Confirm.
frames_to_compare = {'original': X_std, 'PCA reconstruction': X_new_std}
data = [
    {
        # Values flattened column by column, each paired with its column name
        'y': frame.values.flatten(order='F'),
        'x': np.repeat(frame.columns, frame.shape[0]),
        'name': frame_name,
        'marker': {
            'opacity': .9
        },
        'boxmean': True,
        'boxpoints': False,
        'orientation': 'v',
        "type": "box",
    } for frame_name, frame in frames_to_compare.items()
]
layout = {
    'yaxis': {
        'title': 'features',
        'zeroline': False,
        'range': [-2, 5]
    },
    'boxmode': 'group'

}
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Summary statistics of the original features.
X.describe()

In [None]:
# Summary statistics of the reconstructed features, to compare with X.describe().
X_new.describe()

In [None]:
# proportion of variance explained
scaler = StandardScaler()
X_s = scaler.fit_transform(X)

t0 = time()
pca = PCA()
X_t = pca.fit_transform(X_s)
plt.plot(transformer.explained_variance_ratio_, 'bx-')
# shoulder at 3 or 4 then at 7

In [None]:
# Individual and cumulative explained variance (in percent) per component.
# Uses the `pca` fitted above; `transformer` is not defined at notebook scope.
eig_vals = pca.explained_variance_
tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# One label per component, however many features the dataset has
pc_labels = ['PC %s' % i for i in range(1, len(var_exp) + 1)]

trace1 = dict(
    type='bar',
    x=pc_labels,
    y=var_exp,
    name='Individual'
)

trace2 = dict(
    type='scatter',
    x=pc_labels,
    y=cum_var_exp,
    name='Cumulative'
)

data = [trace1, trace2]

layout=dict(
    title='Explained variance by different principal components',
    yaxis=dict(
        title='Explained variance in percent'
    )
)

fig = dict(data=data, layout=layout)
iplot(fig, filename='selecting-principal-components')

### 2.2. ICA

In [None]:
# FastICA starts from a random unmixing matrix; fix the seed so the fitted
# components are the same on every run.
ica = FastICA(random_state=10)
X_t = ica.fit_transform(X_s)

In [None]:
# Time the 2-D ICA view: standardise, extract 2 independent components, plot.
# The pipeline is named for what it contains (it was `pca_pipe`) and seeded so
# the plot is reproducible.
t0 = time()
ica_pipe = make_pipeline(StandardScaler(), FastICA(n_components=2, random_state=10))
visualize_reduced_to_2d(ica_pipe, X, y)
print(time() - t0)

In [None]:
# Variance of the data along each ICA component.
# `ica` was fitted on the standardised matrix X_s, so the covariance is taken
# from X_s as well. A centred copy is used: the original `X -= ...` modified
# the notebook-level X in place and changed it for every later cell.
n_samples = X_s.shape[0]
X_centered = X_s - np.mean(X_s, axis=0)
cov_matrix = np.dot(X_centered.T, X_centered) / n_samples
for component in ica.components_:
    print(np.dot(component.T, np.dot(cov_matrix, component)))

### 2.3. Random Projection (RP)

In [None]:
# Time the 2-D Gaussian random projection view (seeded with random_state=12).
t0 = time()
rp_pipe = make_pipeline(StandardScaler(), GaussianRandomProjection(n_components=2, random_state=12))
visualize_reduced_to_2d(rp_pipe, X, y)
# Elapsed wall-clock seconds for fitting, transforming and plotting.
print(time() - t0)

### 2.4. Isomap

In [None]:
# Time the 2-D Isomap view: standardise, embed into 2 dimensions, plot.
t0 = time()
pipe = make_pipeline(StandardScaler(), Isomap(n_components=2))
visualize_reduced_to_2d(pipe, X, y)
# Elapsed wall-clock seconds for fitting, transforming and plotting.
print(time() - t0)

## 3. Dim Reduce then Clustering

In [None]:
# Cluster in the 2-D PCA space and time the whole chain.
t0 = time()
X_s = StandardScaler().fit_transform(X)
X_t = PCA(n_components=2).fit_transform(X_s)
# evaluate how good are the clustering? Same clusters as before? Why or why not?
# Same cluster count and seed as the earlier KMeans fit, so that any difference
# comes from the dimensionality reduction rather than from random initialisation.
X_c = KMeans(n_clusters=n_clusters, random_state=10).fit_predict(X_t)
print(time() - t0)

In [None]:
# Scatter of the 2-D projection: colour = cluster found in the reduced space,
# marker symbol = true label.
marker_style = dict(
    color=X_c,
    symbol=y,
    colorscale='Jet',
    showscale=True,
    opacity=0.8,
)
trace = go.Scattergl(x=X_t[:, 0], y=X_t[:, 1], mode='markers', marker=marker_style)

data = [trace]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

## 4. Dim Reduce then NN

In [None]:
# Performance and Run speed reduction etc

In [None]:
# Standardise -> keep 7 principal components -> one-hidden-layer MLP (16 units).
# The MLP's weight initialisation is random; seed it so accuracy is reproducible.
estimator_pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=7),
    MLPClassifier(hidden_layer_sizes=[16,], random_state=42),
)

In [None]:
# Reuse the shared split helper (same 70/30 split, seed 42) instead of repeating
# its body; it reads from `df`, so it does not depend on the notebook-level X,
# which an earlier cell modifies in place.
X_train, X_test, y_train, y_test = set_split(df, label)
estimator_pipeline.fit(X_train, y_train)
y_pred = estimator_pipeline.predict(X_test)

accuracy_score(y_test, y_pred)

In [None]:
# Switch to the red-wine file alone, with `quality` as the target.
# NOTE: this rebinds `df` and `label`, replacing the values set by earlier cells.
df = pd.read_csv('wine-quality/winequality-red.csv', sep=';')
label = 'quality'
print(df.shape)
df.head()

In [None]:
# Binarise quality: scores 3, 4 and 5 become 0, every other score becomes 1.
# The isin([0, 1]) guard makes the cell safe to re-run; without it a second run
# would map the already-binarised 0/1 values to 1 everywhere.
if label == 'quality' and not df[label].isin([0, 1]).all():
    df[label] = (~df[label].isin([3, 4, 5])).astype(int)
get_freq_plot(df, label)

In [None]:
# Split on the current target column (`label`) rather than a hard-coded name,
# so this cell stays consistent with the cells that set and binarise `label`.
X_train, X_test, y_train, y_test = set_split(df, label)
estimator_pipeline.fit(X_train, y_train)
y_pred = estimator_pipeline.predict(X_test)

accuracy_score(y_test, y_pred)

## 5. Dim Reduce then Cluster then NN 