# Data Pre-processing and Visualization

In [7]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import plotly as py
import plotly.subplots as subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

## Loading Data

We will load the featurized superconducting data, and their chemical compositions. We will do a rough determination of whether a compound is cuprate or not based on whether it has both $Cu$ and $O$.

In [8]:
X = pd.read_csv('data/superconductors/train.csv')
composition = pd.read_csv('data/superconductors/unique_m.csv')[['Cu', 'O']]
is_cuprate = pd.DataFrame(data={'is_cuprate': composition['Cu'] * composition['O']}, dtype=float)
is_cuprate = is_cuprate > 0
cuprate_X = X[is_cuprate['is_cuprate']==True]
not_cuprate_X = X[is_cuprate['is_cuprate']==False]

## Clustering Data using KMeans
We cannot in general separate all compounds into classes of superconductors as defined in the literature. Therefore, we use KMeans clustering on the chemical composition to create artificial classes.

In [114]:
scaler = StandardScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(X.drop(["critical_temp"],axis=1)))

n_clusters = 15
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(scaled_X)
clusters = kmeans.predict(scaled_X)
X["Cluster"] = clusters
scaled_X["Cluster"] = clusters
scaled_X = pd.concat([scaled_X, is_cuprate], axis=1, join='inner')

## Data Visualization Using PCA
We compare the clustering results to cuprates and non-cuprates to see how well the clustering follows human convention in this one specific way.

In [81]:
plotX = pd.DataFrame(np.array(scaled_X.sample(5000, random_state=10)))
plotX.columns = scaled_X.columns
pca_2d = PCA(n_components=2)
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX.drop(["Cluster","is_cuprate"], axis=1)))
PCs_2d.columns = ["PC1_2d", "PC2_2d"]
plotX = pd.concat([plotX, PCs_2d], axis=1, join='inner')
clusters = [plotX[plotX["Cluster"] == i] for i in range(n_clusters)]

In [90]:
fig = py.subplots.make_subplots(rows=1, cols=2,
    subplot_titles=("PCA Visualization KMeans", "PCA Visualization Cuprate Categorization"))

for i in range(n_clusters):
    fig.add_trace( go.Scatter(
                        x = clusters[i]["PC1_2d"],
                        y = clusters[i]["PC2_2d"],
                        mode = "markers",
                        name = "Cluster {}".format(i),
                        text = None),
                        row=1, col=1
                )
for cuprate_or_not in ['Is Cuprate', 'Is NOT Cuprate']:
    fig.add_trace( go.Scatter(
                        x = plotX[plotX['is_cuprate'] == (cuprate_or_not=='Is Cuprate')]["PC1_2d"],
                        y = plotX[plotX['is_cuprate'] == (cuprate_or_not=='Is Cuprate')]["PC2_2d"],
                        mode = "markers",
                        name = cuprate_or_not,
                        text = None),
                        row=1, col=2
                )

fig.update_layout(
    xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
    yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
    autosize=False,
    width=1000,
    height=500)
fig.show()

We see that cluster 1 and 2 obtained using KMeans clustering are roughly cuprates. However, this is rather confusing. We therefore need to use a better visualization tool.

## Data Visualization Using t-SNE
Results obtained via PCA is rather confusing. t-SNE is a more sophisticated method. We illustrate that here.

In [93]:
plotX = pd.DataFrame(np.array(scaled_X.sample(5000)))
plotX.columns = scaled_X.columns
#Set our perplexity
perplexity = 50
#T-SNE with two dimensions
tsne_2d = TSNE(n_components=2, perplexity=perplexity)
#This DataFrame contains two dimensions, built by T-SNE
TCs_2d = pd.DataFrame(tsne_2d.fit_transform(plotX.drop(["Cluster","is_cuprate"], axis=1)))
TCs_2d.columns = ["TC1_2d","TC2_2d"]
plotX = pd.concat([plotX, TCs_2d], axis=1, join='inner')
clusters = [plotX[plotX["Cluster"] == i] for i in range(n_clusters)]


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [94]:
fig = py.subplots.make_subplots(rows=1, cols=2,
    subplot_titles=("t_SNE Visualization KMeans", "t_SNE Visualization Cuprate Categorization"))

for i in range(n_clusters):
    fig.add_trace( go.Scatter(
                        x = clusters[i]["TC1_2d"],
                        y = clusters[i]["TC2_2d"],
                        mode = "markers",
                        name = "Cluster {}".format(i),
                        text = None),
                        row=1, col=1
                )
for cuprate_or_not in ['Is Cuprate', 'Is NOT Cuprate']:
    fig.add_trace( go.Scatter(
                        x = plotX[plotX['is_cuprate'] == (cuprate_or_not=='Is Cuprate')]["TC1_2d"],
                        y = plotX[plotX['is_cuprate'] == (cuprate_or_not=='Is Cuprate')]["TC2_2d"],
                        mode = "markers",
                        name = cuprate_or_not,
                        text = None),
                        row=1, col=2
                )

fig.update_layout(
    xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
    yaxis= dict(title= 'TC2',ticklen= 5,zeroline= False),
    autosize=False,
    width=1000,
    height=500)
fig.show()

With t-SNE, it is clear that cuprates correspond to cluster 1, 2 and 11 and there are few mis-categorizations.

## Verify the quality of KMeans Clustering

In [110]:
n_cuprate = plotX[plotX['is_cuprate']==True].shape[0]
n_selected = plotX[plotX['Cluster'].isin([1, 2, 11])].shape[0]
n_correct = plotX.query('is_cuprate==True & Cluster.isin([1, 2, 11])').shape[0]
print('Rate of correcting selecting cuprate: {}, rate of false labeling {}.'.format(n_correct/n_cuprate, (n_selected-n_cuprate)/n_selected))

Rate of correcting selecting cuprate: 0.9845465636437576, rate of false labeling 0.008067769261799113.


## Redo Clustering on Cuprates
Since we will hold cuprates from training, we need to redo clustering on non-cuprate families.

In [12]:
scaler = StandardScaler()


scaled_cuprate_X = pd.DataFrame(scaler.fit_transform(cuprate_X.drop(["critical_temp"],axis=1)))
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(scaled_cuprate_X)
cuprate_clusters = kmeans.predict(scaled_cuprate_X)
cuprate_X["Cluster"] = cuprate_clusters
scaled_cuprate_X["Cluster"] = cuprate_clusters


scaled_X = pd.DataFrame(scaler.fit_transform(not_cuprate_X.drop(["critical_temp"],axis=1)))
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(scaled_X)
clusters = kmeans.predict(scaled_X)
not_cuprate_X["Cluster"] = clusters
scaled_X["Cluster"] = clusters



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [4]:
cuprate_plotX = pd.DataFrame(np.array(scaled_cuprate_X.sample(5000)))
cuprate_plotX.columns = scaled_cuprate_X.columns
#Set our perplexity
perplexity = 50
#T-SNE with two dimensions
tsne_2d = TSNE(n_components=2, perplexity=perplexity)
#This DataFrame contains two dimensions, built by T-SNE
cuprate_TCs_2d = pd.DataFrame(tsne_2d.fit_transform(cuprate_plotX.drop(["Cluster"], axis=1)))
cuprate_TCs_2d.columns = ["TC1_2d","TC2_2d"]
cuprate_plotX = pd.concat([cuprate_plotX, cuprate_TCs_2d], axis=1, join='inner')
cuprate_clusters = [cuprate_plotX[cuprate_plotX["Cluster"] == i] for i in range(n_clusters)]



plotX = pd.DataFrame(np.array(scaled_X.sample(5000)))
plotX.columns = scaled_X.columns
TCs_2d = pd.DataFrame(tsne_2d.fit_transform(plotX.drop(["Cluster"], axis=1)))
TCs_2d.columns = ["TC1_2d","TC2_2d"]
plotX = pd.concat([plotX, TCs_2d], axis=1, join='inner')
clusters = [plotX[plotX["Cluster"] == i] for i in range(n_clusters)]


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [9]:
fig = subplots.make_subplots(rows=1, cols=2,
    subplot_titles=("Non-Cuprates", "Cuprates"))

for i in range(n_clusters):
    fig.add_trace( go.Scatter(
                        x = clusters[i]["TC1_2d"],
                        y = clusters[i]["TC2_2d"],
                        mode = "markers",
                        name = "Cluster {}".format(i),
                        text = None),
                        row=1, col=1
                )
    fig.add_trace( go.Scatter(
                        x = cuprate_clusters[i]["TC1_2d"],
                        y = cuprate_clusters[i]["TC2_2d"],
                        mode = "markers",
                        name = "Cluster {}".format(i),
                        text = None),
                        row=1, col=2
                )

fig.update_layout(
    xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
    yaxis= dict(title= 'TC2',ticklen= 5,zeroline= False),
    autosize=False,
    width=1000,
    height=500)
fig.show()

## Saving Clustering Results

In [10]:
not_cuprate_X.to_csv('data/superconductors/not_cuprate.csv')
cuprate_X.to_csv('data/superconductors/cuprate.csv')