# Lemma dilemma

*Creators: Christopher Schmitz & Magnus Struckmann*
## Table of contents

- Notebook summary
- Data visualization
- Data preparation
- Accuracy line chart
- Histogram of the ratio of projected to original pairwise distances

### Notebook summary

Based on the program developed in the lecture (SparseRandomProjections), analyze a database of your choice (but not exactly the same digits data as in the lecture) using random projections.
Study the accuracy (or a score of your choice that makes most sense for your data) as a function of the number of dimensions / features that survived the random projection.

In [None]:
# Import packages
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go
from sklearn.datasets import fetch_olivetti_faces # ten different images of 40 distinct subjects, n = 400, dim = 4096
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Data visualization

In [None]:
# Load the Olivetti faces as a flat feature matrix X and a label vector y.
# With return_X_y=True no dataset object is returned; for the textual dataset
# description call 'print(fetch_olivetti_faces().DESCR)' instead.
X, y = fetch_olivetti_faces(
    shuffle=True,
    return_X_y=True,
)

In [None]:
# Display the first sample as a 64x64 grayscale image (0 -> black, 1 -> white).
example = X[0].reshape((64, 64))
fig = go.Figure(data=go.Heatmap(z=example, colorscale=[[0, 'rgb(0, 0, 0)'], [1.0, 'rgb(255, 255, 255)']]))
# A heatmap draws row 0 of z at the bottom of the plot, which would show the
# face upside down; reversing the y-axis puts the first pixel row on top.
fig.update_layout(width=400, height=400, yaxis=dict(autorange='reversed'))
fig.show()

### Data preparation

In [None]:
# Johnson-Lindenstrauss bound on the number of projected components for a
# distortion of eps=0.1. The sample count is read from the data rather than
# hard-coded, so the bound stays correct if another dataset is loaded.
JL_min_dim = johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=0.1)
print("Johnson and Lindenstrauss k >=", JL_min_dim)

In [None]:
# Squared Euclidean distances between all sample pairs in the original space,
# flattened to one dimension.
pairwise_sq = euclidean_distances(X, squared=True).reshape(-1)

# Boolean mask of the non-zero entries (drops identical sample pairs); it is
# reused later to select the matching entries of the projected distances.
nonzero = pairwise_sq != 0
dists = pairwise_sq[nonzero]

In [None]:
# Baseline: a linear SVM trained on the unprojected features, evaluated on a
# held-out 30 % test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1005
)

# dual=False because: number of features > number of examples
model = LinearSVC(dual=False)
model.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred); pass the labels first.
baseline = metrics.accuracy_score(y_test, model.predict(X_test))

In [None]:
# For each target dimensionality, project the data, train a linear SVM on the
# projected training split and record its test accuracy together with the
# ratio of projected to original squared pairwise distances.
data_collection = []
for n_components in range(25, 525, 25):  # 25, 50, ..., 500 projected components
    rp = SparseRandomProjection(n_components=n_components, random_state=1005)

    # Fit the projection once, on the training split only, and reuse the same
    # fitted projector for the test split and for the full data set. The
    # original code fitted twice per iteration (on X, then again on X_train).
    X_train_rp = rp.fit_transform(X_train)
    X_test_rp = rp.transform(X_test)
    X_rp = rp.transform(X)

    model = LinearSVC(dual=False, random_state=1005)
    model.fit(X_train_rp, y_train)
    # accuracy_score is documented as (y_true, y_pred).
    accuracy = metrics.accuracy_score(y_test, model.predict(X_test_rp))

    # Same pair selection as for the original distances, so both arrays align.
    projected_dists = euclidean_distances(X_rp, squared=True).ravel()[nonzero]
    rates = projected_dists / dists

    step_dict = dict(n_components=n_components,
                     accuracy=accuracy,
                     projected_dists=projected_dists,
                     rates=rates)

    data_collection.append(step_dict)

### Accuracy line chart

In [None]:
# Line chart: test accuracy per projection size, with the unprojected
# baseline drawn as a constant reference line.
fig = go.Figure()

x = [entry['n_components'] for entry in data_collection]
y_rp = [entry['accuracy'] for entry in data_collection]
y_bl = [baseline] * len(data_collection)

series = (
    (y_rp, 'Random projection accuracy'),
    (y_bl, 'Baseline accuracy'),
)
for values, label in series:
    fig.add_trace(go.Scatter(x=x, y=values, mode='lines', name=label))

title_settings = {
    'text': 'Model accuracy',
    'y': 0.9,
    'x': 0.4,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(
    title=title_settings,
    xaxis_title="Number of components in random sparse projection",
    yaxis_title="Accuracy",
)

fig.show()

### Histogram of the ratio of projected to original pairwise distances

In [None]:
fig = go.Figure()

# The axis ranges do not depend on the trace, so set them once instead of on
# every loop iteration. Fixed ranges keep the histograms comparable when the
# slider switches between traces.
fig.update_xaxes(range=[0.5, 1.5])
fig.update_yaxes(range=[0, 2100])

# Add traces, one for each slider step; all start hidden
for step_dict in data_collection:
    fig.add_trace(go.Histogram(x=step_dict['rates'],
                               xbins=dict(size=0.002),
                               visible=False))

# Make the trace at index 5 (the sixth one) visible initially
fig.data[5].visible = True

In [None]:
# Create and add slider
def create_sliders(data_collection, active=5):
    """Build the plotly slider definition that shows one trace at a time.

    One slider step is created per entry of ``data_collection``. Step ``i``
    restyles the figure so that only trace ``i`` is visible, and is labelled
    with that entry's ``'n_components'`` value.

    Args:
        data_collection: Sequence of dicts, each with an ``'n_components'`` key,
            in the same order as the traces of the figure.
        active: Index of the step that is selected initially. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        A one-element list holding the slider dict, suitable for the
        ``sliders`` argument of ``fig.update_layout``.
    """
    n_traces = len(data_collection)

    steps = [
        dict(
            method="restyle",
            # Visibility mask with a single True at this step's own position.
            args=["visible", [position == i for position in range(n_traces)]],
            label=str(step_dict['n_components']),
        )
        for i, step_dict in enumerate(data_collection)
    ]

    sliders = [dict(
        active=active,
        currentvalue={"prefix": "N components: "},
        pad={"t": n_traces},  # top padding is tied to the number of steps
        steps=steps,
    )]
    return sliders

In [None]:
# Attach the slider, label both axes and render the figure.
# The y-axis title previously contained the typo "Distribuation".
fig.update_layout(
    sliders=create_sliders(data_collection),
    xaxis=dict(title='Squared distances rate projected / original'),
    yaxis=dict(title='Distribution of sample pairs'))

fig.show()