# Mice Protein Expression 

Genetic expression of proteins in the cerebral cortex, for control and Down syndrome mice.


## Goal

Predict the class of the mouse given the activity level of proteins

#### Classes:

- c-CS-s: control mice, stimulated to learn, injected with saline (9 mice)
- c-CS-m: control mice, stimulated to learn, injected with memantine (10 mice)
- c-SC-s: control mice, not stimulated to learn, injected with saline (9 mice)
- c-SC-m: control mice, not stimulated to learn, injected with memantine (10 mice)

- t-CS-s: trisomy mice, stimulated to learn, injected with saline (7 mice)
- t-CS-m: trisomy mice, stimulated to learn, injected with memantine (9 mice)
- t-SC-s: trisomy mice, not stimulated to learn, injected with saline (9 mice)
- t-SC-m: trisomy mice, not stimulated to learn, injected with memantine (9 mice)



[Repository](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline


In [None]:
fname = "./Data_Cortex_Nuclear.xls"

In [None]:
data = pd.read_excel(fname)

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# Heat-map of the pairwise correlations between the measured quantities.
# Only numeric columns are correlated: the frame also carries string columns
# such as "class", and recent pandas versions raise instead of silently
# dropping them inside DataFrame.corr().
fig, ax = plt.subplots(figsize=(10,10))
plt.imshow(data.select_dtypes(include="number").corr())

---

#### Separate categorical data from real valued data

In [None]:
# describe() summarises the numeric columns only, so its columns are the
# real-valued features.
numeric_columns     = data.describe().columns
# Numeric columns that contain no missing value in any row.
# Explicit set methods are used: the `&` / `^` operators on an Index no longer
# perform set operations in current pandas.
numeric_no_miss     = numeric_columns.intersection(data.dropna(axis=1).columns)
# Every remaining column is treated as categorical.
categorical_columns = data.columns.difference(numeric_columns)
categorical_columns

#### Separate complete entries from entries with missing values

In [None]:
# Row labels of the entries that have a value in every column ...
index_comlete = data.dropna().index
# ... and of the entries with at least one missing value.
# Index.difference replaces the `^` operator, which is no longer a set
# operation on an Index in current pandas.
index_missing = data.index.difference(index_comlete)

print("Total entries: {}".format(len(data)))
print("\t - with no missing values: {}".format(len(index_comlete)))
print("\t - with some missing values: {}".format(len(index_missing)))


#### *One-hot-encoding*: transforming categorical data to 0-1

In [None]:
labels = data["class"]
labels.head()

In [None]:
# One-hot encode the class column: one indicator column per distinct class
# value, one row per entry of `data`.
pd_Y = pd.get_dummies(data["class"])
pd_Y

#### Get available classes

In [None]:
# Distinct class names, sorted.
# The notebook later indexes this array with argmax positions taken from the
# one-hot matrix built by pd.get_dummies, whose columns are in sorted order;
# keeping the names in order of first appearance would attach the wrong name
# to a column whenever the two orders differ.
class_set = data["class"].drop_duplicates().sort_values().values
n_class = len(class_set)
class_set

---


# kNN: k-nearest neighbor


The idea is to find, for a given point, its k nearest neighbors, and to assign the point the class that is most common among them.


![](KnnClassification.png)



## Work on KNN

In [None]:
# Restrict everything to the rows without missing values:
# feature matrix, one-hot targets and plain class names.
X       = data.loc[index_comlete, numeric_columns].values
y       = pd_Y.loc[index_comlete].values
y_label = labels.loc[index_comlete].values

##### Normalize

In [None]:
# Divide every column by its own standard deviation so that all features have
# a comparable spread (columns are rescaled only, not centred).
X = X / X.std(axis=0)

#### Display dimension

In [None]:
import holoviews as hv
import holoviews.util
hv.extension('bokeh')

from bokeh.io import show
from bokeh.plotting import figure, gridplot
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Viridis256

##### Map class to color

In [None]:
# Give each class its own colour, spread evenly over the Viridis palette.
colors_dic = {
    name: Viridis256[int(255*position/n_class)]
    for position, name in enumerate(class_set)
}

# Per-row colour, looked up from the row's class.
data["Color"] = [colors_dic[name] for name in data['class']]

##### Tool for popping up information when putting the cursor above a point

In [None]:
# Tooltip fields: the row index of the hovered point and its class name.
tooltip_fields = [("index", "$index"), ("class", "@class")]
hover = HoverTool(tooltips=tooltip_fields)

# Data handed to bokeh: every numeric column plus the colour and class columns.
plot_columns = list(numeric_columns) + ["Color", "class"]
source = ColumnDataSource(data=data[plot_columns])

#### Select dimension you are interested in

In [None]:
d0, d1 = 0, 1

#### Column name

In [None]:
# Names of the two numeric columns selected by position d0 / d1.
col0, col1 = (numeric_columns[position] for position in (d0, d1))
print(col0, col1)

#### Draw a figure

In [None]:
# Square canvas whose axes are labelled with the two selected column names.
f = figure(width=800, height=800,
          x_axis_label=col0,
          y_axis_label=col1,
          title="Visualisation of data")

# One marker per row of `source`; the fill colour is read from its "Color" column.
f.circle(x=col0, y=col1, source=source, fill_color="Color", size=15, alpha=0.6)

# Attach the hover tooltip defined earlier.
f.add_tools(hover)

show(f)

##### Plot components separately

In [None]:
# One panel per class, laid out side by side.
fig, ax = plt.subplots(1, n_class, figsize=(5*n_class, 5))

# The two selected feature columns of the normalised matrix.
x_values, y_values = X[:, d0], X[:, d1]

for panel, name in zip(ax, class_set):
    # Rows belonging to the current class.
    mask = y_label == name

    panel.plot(x_values[mask], y_values[mask], "o", color=colors_dic[name])
    panel.set_title(name)


---

## Use KNN

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

##### Split dataset into train and test

In [None]:
# Hold out 33% of the rows for testing. No random_state is passed, so the
# partition is not reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [None]:
# Decode the one-hot rows back to class names: argmax gives the position of
# the hot column, which is then used as a position in class_set.
# NOTE(review): this assumes class_set lists the classes in the same order as
# the one-hot columns - confirm against how both were built.
y_train_labels = class_set[y_train.argmax(axis=1)]
y_test_labels  = class_set[y_test.argmax(axis=1)]

##### Fit the model

In [None]:
# Neighbour search structure answering "which 5 training points are closest?".
md_nn = NearestNeighbors(n_neighbors=5)

# Only the features are indexed; the targets are looked up afterwards through
# the returned neighbour positions.
md_nn.fit(X_train)

##### Get position of nearest points

Find k nearest train points of a test item 

In [None]:
neighbors = md_nn.kneighbors(X_test, return_distance=False)

In [None]:
neighbors

In [None]:
class_set[y_train[neighbors[0]].argmax(axis=1)]

##### Get the class

In [None]:
# Majority vote: gather the one-hot rows of each test item's neighbours, add
# them up over the neighbour axis to count votes per class, and keep the
# class with the most votes.
y_test_pred_labels = class_set[y_train[neighbors].sum(axis=1).argmax(axis=1)]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line header and draw the confusion matrix as an annotated
    heat-map on a new 10x10 figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; entry (i, j) is written at row i, column j.
    classes : sequence
        Tick labels, used in the given order on both axes.
    normalize : bool
        When True every row is divided by its sum before plotting. A row
        whose sum is zero is left as zeros instead of becoming NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap of the heat-map.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guarded division: a class absent from the true labels has an
        # all-zero row, which a plain division would turn into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots(figsize=(10,10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text to stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so that the axis labels are part of the layout computation.
    plt.tight_layout()


In [None]:
# score
print(accuracy_score(y_test_labels, y_test_pred_labels))

# Pin the row/column order of the matrix to class_set: without `labels`, the
# matrix follows the sorted labels found in the data, which need not match
# the tick labels passed to the plot (and drops classes absent from the split).
cm = confusion_matrix(y_test_labels, y_test_pred_labels, labels=class_set)
plot_confusion_matrix(cm, class_set);

#### Compare to linear regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
def linear_scoring(X_train, X_test, y_train, y_test):
    """
    Accuracy of a linear regression used as a classifier.

    The regression is fitted on the one-hot targets; for each test row the
    column with the largest predicted value is taken as the predicted class
    and compared with the hot column of y_test.
    """
    model = LinearRegression().fit(X_train, y_train)
    predicted = model.predict(X_test).argmax(axis=1)
    expected = y_test.argmax(axis=1)
    return accuracy_score(predicted, expected)

In [None]:
def knn_scoring(X_train, X_test, y_train, y_test, k=5):
    """
    Accuracy of a k-nearest-neighbour majority vote.

    For every test row the k closest training rows are looked up, their
    one-hot targets are summed into per-class vote counts, and the class with
    the most votes is compared with the hot column of y_test.
    """
    finder = NearestNeighbors(n_neighbors=k).fit(X_train)
    nearest = finder.kneighbors(X_test, return_distance=False)
    votes = y_train[nearest].sum(axis=1)
    return accuracy_score(votes.argmax(axis=1), y_test.argmax(axis=1))

### Compare results

Test against linear regression

In [None]:
knn_scoring(X_train, X_test, y_train, y_test, k=1)

In [None]:
linear_scoring(X_train, X_test, y_train, y_test)

---

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [None]:
# Accuracy of the plain kNN vote for k = 1..49 on the current split.
Nlook = list(range(1, 50))
knn_score = [knn_scoring(X_train, X_test, y_train, y_test, k=k) for k in Nlook]

# Class names of the test rows. The argmax of a one-hot row is a column
# position, so it must be decoded through the one-hot column names; indexing
# the `labels` Series with it would instead pick dataframe rows 0..7.
y_test_labels = pd_Y.columns.values[y_test.argmax(axis=1)]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(Nlook, knn_score)
# Horizontal reference line: accuracy of the linear baseline on the same split.
sc0 =  linear_scoring(X_train, X_test, y_train, y_test)
ax.plot(Nlook, np.ones(len(Nlook))*sc0)

#### Testing on a smaller subset

In [None]:
ncols = 10

# Draw `ncols` distinct feature columns. replace=False matters: the default
# samples with replacement and can return the same column several times,
# which would silently shrink the subset.
index = np.random.choice(X.shape[1], ncols, replace=False)
index

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[:, index], y, test_size=0.33)

In [None]:
# Accuracy of the plain kNN vote for k = 1..39 on the reduced feature set.
Nlook = list(range(1, 40))
knn_score = [knn_scoring(X_train, X_test, y_train, y_test, k=k) for k in Nlook]

# Class names of the test rows. The argmax of a one-hot row is a column
# position, so it must be decoded through the one-hot column names; indexing
# the `labels` Series with it would instead pick dataframe rows 0..7.
y_test_labels = pd_Y.columns.values[y_test.argmax(axis=1)]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(Nlook, knn_score)
# Horizontal reference line: accuracy of the linear baseline on the same split.
sc0 =  linear_scoring(X_train, X_test, y_train, y_test)
ax.plot(Nlook, np.ones(len(Nlook))*sc0)

---

### Distance weighting


Instead of tuning $k$, you can weight the neighbors according to the distance.

So you don't need to care how many neighbors you use (but you have to choose the distance function carefully).

In [None]:
ncols = 10

# Draw `ncols` distinct feature columns. replace=False matters: the default
# samples with replacement and can return the same column several times,
# which would silently shrink the subset.
index = np.random.choice(X.shape[1], ncols, replace=False)
index

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[:,index], y, test_size=0.33)

In [None]:
Score = []
Nlook = []

# Loop-invariant quantities, computed once.
# The argmax of a one-hot row is a column position; it is decoded through the
# one-hot column names (indexing the `labels` Series with it would pick
# dataframe rows 0..7 instead of classes).
y_test_idx = y_test.argmax(axis=1)
y_test_labels = pd_Y.columns.values[y_test_idx]

# Width of the Gaussian kernel applied to the distances.
std = 1

for i in range(1, 20):

    md_nn = NearestNeighbors(n_neighbors=i)
    md_nn.fit(X_train)

    dist, neighbors = md_nn.kneighbors(X_test)

    # Gaussian weights, normalised per test point. Subtracting the smallest
    # squared distance of each row before exponentiating leaves the normalised
    # weights unchanged but guarantees that the row sum is at least 1, so the
    # division cannot hit 0/0 when all raw weights underflow.
    sq_dist = (dist / std) ** 2
    Arr = np.exp(-(sq_dist - sq_dist.min(axis=1, keepdims=True)))
    Arr = Arr / Arr.sum(axis=1, keepdims=True)

    # Weighted vote: each neighbour contributes its one-hot row times its weight.
    y_test_pred = (y_train[neighbors] * Arr[:, :, np.newaxis]).sum(axis=1).argmax(axis=1)

    sc = accuracy_score(y_test_pred, y_test_idx)
    Score.append(sc)
    Nlook.append(i)

fig, ax = plt.subplots(figsize=(10, 5))
# Horizontal reference line: accuracy of the linear baseline on the same split.
sc0 =  linear_scoring(X_train, X_test, y_train, y_test)

ax.plot(Nlook, Score)
ax.plot(Nlook, np.ones(len(Nlook))*sc0)

# Not safe

In [None]:
y

In [None]:
# Keep only columns with non nan values
X = data[numeric_no_miss].values
X = X/ X.std(axis=0)
y = pd_Y.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.66)

In [None]:
Score = []
Nlook = []

# One-hot column position -> class name. The argmax of a one-hot row is a
# column position and has to be decoded through this array; indexing the
# `labels` Series with it picks dataframe rows 0..7 instead, which made the
# true and predicted "labels" meaningless and the reported accuracy with them.
class_names = pd_Y.columns.values
y_test_labels = class_names[y_test.argmax(axis=1)]

# Width of the Gaussian kernel. A data-driven alternative is the spread of the
# neighbour distances inside the training set:
#   md_nn.kneighbors(X_train)[0][:, 1:].std()
std = 1

for i in range(2, 20):

    md_nn = NearestNeighbors(n_neighbors=i)
    md_nn.fit(X_train)

    dist, neighbors = md_nn.kneighbors(X_test)

    # Calculate the weights from the distances
    # TODO: try your own
    # Subtracting each row's smallest squared distance keeps the normalised
    # weights unchanged while preventing a 0/0 when all raw weights underflow.
    sq_dist = (dist / std) ** 2
    Arr = np.exp(-(sq_dist - sq_dist.min(axis=1, keepdims=True)))
    Arr = Arr / Arr.sum(axis=1, keepdims=True)

    # Weighted vote: each neighbour contributes its one-hot row times its weight.
    votes = (y_train[neighbors] * Arr[:, :, np.newaxis]).sum(axis=1)
    y_test_pred_labels = class_names[votes.argmax(axis=1)]

    sc = accuracy_score(y_test_labels, y_test_pred_labels)
    Score.append(sc)
    Nlook.append(i)

plt.plot(Nlook, Score, "o")

---

## Regression of KNN

## Try to reconstruct the dataset

We have $1080$ entries, of which $552$ are complete.

We can try to predict the rest

In [None]:
# Shape of the data before and after discarding incomplete rows / columns.
shape_reports = (
    ("Total dataset:", data.shape),
    ("Dataset without entries with missing values:", data.dropna().shape),
    ("Dataset without columns with missing values:", data.T.dropna().T.shape),
)
for caption, shape in shape_reports:
    print(caption)
    print(shape)

In [None]:
# Training material for the reconstruction: the complete rows.
#  - X_train_limited: only the columns that never have a missing value
#    (used to search for neighbours),
#  - X_train: every numeric column (the values to be copied over).
X_train_limited = data.loc[index_comlete, numeric_no_miss].values
X_train = data.loc[index_comlete, numeric_columns].values
# Reuse the one-hot frame built on the whole dataset: re-encoding a subset
# would drop the column of any class that happens to be absent from it.
y_train = pd_Y.loc[index_comlete].values

In [None]:
# Rows to reconstruct: the entries with at least one missing value.
#  - X_test_limited: only the columns that never have a missing value
#    (used to search for neighbours),
#  - X_test: every numeric column, missing values included.
X_test_limited = data.loc[index_missing, numeric_no_miss].values
X_test = data.loc[index_missing, numeric_columns].values
# Reuse the one-hot frame built on the whole dataset: re-encoding a subset
# would drop the column of any class that happens to be absent from it.
y_test = pd_Y.loc[index_missing].values

##### Normalize (look with / without)

In [None]:
# Per-column standard deviations, taken over every row of `data`
# (complete and incomplete entries alike).
std_1 = data[numeric_no_miss].std().values
std_2 = data[numeric_columns].std().values

# Train and test are divided by the same scale so they stay comparable.
X_test_limited  = X_test_limited / std_1
X_train_limited = X_train_limited / std_1

X_test  = X_test / std_2
X_train = X_train / std_2

#### Create the nearest-neighbor model

In [None]:
# Neighbours are searched using only the columns that are always present.
# n_neighbors has to be passed by keyword: current scikit-learn rejects
# positional constructor arguments.
md = NearestNeighbors(n_neighbors=20)
md.fit(X_train_limited)

# For every incomplete row: distances to, and positions of, its 20 closest
# complete rows.
distance, neighbors = md.kneighbors(X_test_limited)

In [None]:
std = 1

In [None]:
# Gaussian weight of each neighbour, using the same kernel exp(-(d/std)^2) as
# the classification cells of this notebook.
# Each row's smallest squared distance is subtracted before exponentiating:
# the normalised weights are unchanged, but the row sum is then at least 1,
# so the normalisation cannot become 0/0 when all raw weights underflow.
sq_dist = (distance / std) ** 2
arr = np.exp(-(sq_dist - sq_dist.min(axis=1, keepdims=True)))
Z   = np.sum(arr, axis=1)
# After this line every row of `arr` sums to 1.
arr = arr / Z[:, np.newaxis]

In [None]:
# Reconstruct every incomplete row as the weighted average of its neighbours'
# complete rows:
#   X_test_pred[n, d] = sum_k arr[n, k] * X_train[neighbors[n, k], d]
# Done in one vectorised step; the previous loop referenced an undefined name
# (X_pred) and reused `y` as a loop variable, overwriting the target matrix.
X_test_pred = np.einsum("nk,nkd->nd", arr, X_train[neighbors])

##### Look at results

In [None]:
# Position (within the incomplete rows) of the entry to inspect.
item = 230

fig, ax = plt.subplots(figsize=(15,5))
# Measured values (with gaps where data is missing) against the reconstruction.
observed, reconstructed = X_test[item], X_test_pred[item]
ax.plot(observed)
ax.plot(reconstructed)

##### Look at distribution difference

In [None]:
# Pick two feature columns to compare (this rebinds the d0 / d1 selected
# earlier in the notebook).
d0 = 7
d1 = 6

# Complete rows first, incomplete rows on top, in the same two dimensions.
plt.plot(X_train.T[d0], X_train.T[d1], "o")
plt.plot(X_test.T[d0], X_test.T[d1], "o")
