In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Lecture 6: Working With Data

Today we will cover the following topics:
- Data exploration
- Recentering and rescaling
- Dimensionality Reduction

## Part 1:  Data Exploration

In [None]:
# Synthetic data set: three Gaussian clusters in four features.
#
# Seed the global NumPy RNG so that "Restart Kernel -> Run All" always
# reproduces the same samples (and therefore the same figures).
np.random.seed(0)

# Number of points drawn for each cluster.
n_per_cluster = 1000

# NOTE: a covariance matrix must be symmetric and positive semi-definite.
# The earlier matrices had cov[0][1] != cov[1][0] and an off-diagonal entry
# (100) far larger than the variances it couples (20 and 10 / 30), so
# np.random.multivariate_normal warned and its output was not well defined.
# The matrices below keep the same variances on the diagonal, keep features
# 0-1 and 1-2 correlated, and keep feature 3 independent.

# Cluster 1: tight spread in feature 2, centred at 200.
mean1 = [7, 6, 200, 1000]
cov1 = [[10,  5,  0,  0],
        [ 5, 20, 10,  0],
        [ 0, 10, 10,  0],
        [ 0,  0,  0, 50]]
data1 = np.random.multivariate_normal(mean1, cov1, n_per_cluster)

# Cluster 2: same centre in features 0, 1 and 3; feature 2 shifted to 150
# with a larger variance.
mean2 = [7, 6, 150, 1000]
cov2 = [[10,  5,  0,  0],
        [ 5, 20, 10,  0],
        [ 0, 10, 30,  0],
        [ 0,  0,  0, 50]]
data2 = np.random.multivariate_normal(mean2, cov2, n_per_cluster)

# Cluster 3: same spread as cluster 2, only the mean of feature 1 moves to 40.
mean3 = [7, 40, 150, 1000]
cov3 = [row[:] for row in cov2]
data3 = np.random.multivariate_normal(mean3, cov3, n_per_cluster)

In [None]:
# Stack the three clusters row-wise into one array: the rows of data1 come
# first, then data2, then data3.
data = np.concatenate((data1, data2, data3), axis=0)

In [None]:
# Features 0 and 1, coloured by the cluster each row came from.
# `data` stacks data1, data2, data3 in that order, so the cluster boundaries
# are derived from the source arrays instead of hard-coding 1000.
end1 = len(data1)
end2 = end1 + len(data2)
plt.plot(data[:end1,0], data[:end1,1],'bo',alpha=0.5,label='data1')
plt.plot(data[end1:end2,0], data[end1:end2,1],'r+',alpha=0.5,label='data2')
plt.plot(data[end2:,0], data[end2:,1],'ms',alpha=0.5,label='data3')
plt.xlabel('feature 0')
plt.ylabel('feature 1')
# Trailing semicolon keeps the notebook from echoing the return value.
plt.legend();

In [None]:
# The same two features without cluster colouring: this is how the data looks
# when the labels are unknown.
plt.plot(data[:,0], data[:,1],'bo')
plt.xlabel('feature 0')
# Trailing semicolon keeps the notebook from echoing the return value.
plt.ylabel('feature 1');

In [None]:
def inspect(data):
    """Draw a scatter-plot matrix of the columns of ``data``.

    The figure is an ``nc`` x ``nc`` grid, where ``nc`` is the number of
    columns. Off-diagonal panel ``(i, j)`` is a scatter plot with column ``i``
    on the x-axis and column ``j`` on the y-axis; diagonal panel ``(i, i)`` is
    a histogram of column ``i``.

    Parameters
    ----------
    data : array-like
        Anything ``np.asarray`` can turn into a 2-D array of shape
        ``(n_samples, nc)``. A 1-D input is treated as a single column.

    Returns
    -------
    None
        The figure is created as a side effect and rendered by the notebook.
    """
    values = np.asarray(data)
    if values.ndim == 1:
        # Treat a flat vector as one feature so the shape unpack below works.
        values = values.reshape(-1, 1)
    _, nc = values.shape
    # squeeze=False keeps `ax` a 2-D array even when nc == 1; without it
    # plt.subplots returns a bare Axes object and ax[i][j] fails.
    fig, ax = plt.subplots(nc, nc, figsize=(10, 10), squeeze=False)
    for i in range(nc):
        for j in range(nc):
            if i != j:
                ax[i][j].scatter(values[:, i], values[:, j])
            else:
                ax[i][j].hist(values[:, i])
    # Stop tick labels of neighbouring panels from overlapping.
    fig.tight_layout()

In [None]:
# Pairwise scatter plots and per-column histograms of the raw stacked data.
inspect(data)

## Part 2: Recentering and Rescaling

In [None]:
# Recentering: subtract each column's mean so every feature is centred on 0.
d2 = data - data.mean(axis=0)

In [None]:
# Same overview for the centered data (column means subtracted).
inspect(d2)

In [None]:
# Rescaling: divide each centered column by its standard deviation so every
# feature has unit spread.
d3 = d2 / d2.std(axis=0)

In [None]:
# Same overview for the centered data divided by its per-column standard deviation.
inspect(d3)

In [None]:
# Alternative rescaling: divide each centered column by its largest absolute
# value so every feature lies in [-1, 1]. Dividing by the plain (signed)
# maximum would only bound the positive side, and the negative tail of the
# centered data could fall below -1.
d4 = d2 / np.max(np.abs(d2), axis=0)

In [None]:
# Same overview for the centered data divided by a per-column maximum.
inspect(d4)

## Part 3: Dimensionality Reduction

In [None]:
# PCA estimator from scikit-learn, created with its default settings.
import sklearn.decomposition
pca = sklearn.decomposition.PCA()

In [None]:
# Fit on the raw stacked data (`data`), not on the rescaled d3 / d4 arrays.
pca.fit(data)

In [None]:
# Share of the total variance attributed to each fitted component.
print(pca.explained_variance_ratio_)

In [None]:
# Project the data onto the fitted components; data_f is what the cells below plot.
data_f = pca.transform(data)

In [None]:
# Same overview for the PCA-transformed data.
inspect(data_f)

In [None]:
# First two columns of the PCA-transformed data, without cluster colouring.
plt.plot(data_f[:,0], data_f[:,1], 'r+')
plt.xlabel('principal component 1')
# Trailing semicolon keeps the notebook from echoing the return value.
plt.ylabel('principal component 2');

In [None]:
# First two columns of the PCA-transformed data, coloured by source cluster.
# Rows of data_f follow the order of `data` (data1, data2, data3), so the
# boundaries are derived from the source arrays instead of hard-coding 1000.
end1 = len(data1)
end2 = end1 + len(data2)
plt.plot(data_f[:end1,0], data_f[:end1,1], 'r+', label='data1')
plt.plot(data_f[end1:end2,0], data_f[end1:end2,1], 'b+', label='data2')
plt.plot(data_f[end2:,0], data_f[end2:,1], 'ms', label='data3')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
# Trailing semicolon keeps the notebook from echoing the return value.
plt.legend();