sauces:
* https://github.com/gallantlab/pyrcca
* https://online.stat.psu.edu/stat505/lesson/13

# pyrcca Example

In [19]:
import numpy as np
import pandas as pd
import rcca

from sklearn.cross_decomposition import CCA as sklearn_cca

np.random.seed(1)

### Simulate data

In [None]:
# manufacture data
n = 50

# Define two latent variables (number of samples x 1)
latvar1 = np.random.randn(n,)
latvar2 = np.random.randn(n,)

# Define independent components for each dataset (number of observations x dataset dimensions)
indep1 = np.random.randn(n, 4)
indep2 = np.random.randn(n, 5)

# Create two datasets, with each dimension composed as a sum of 75% one of the latent variables and 25% independent component
data1 = 0.4*indep1 + 0.6*np.vstack((latvar1, latvar2, latvar1, latvar2)).T
data2 = 0.4*indep2 + 0.6*np.vstack((latvar1, latvar2, latvar1, latvar2, latvar1)).T

# Split each dataset into two halves: training set and test set
train1 = data1[:n//2]
train2 = data2[:n//2]
test1 = data1[n//2:]
test2 = data2[n//2:]


### Run rCCA

In [2]:
# Create a cca object as an instantiation of the CCA object class. 
cca = rcca.CCA(kernelcca = False, reg = 0., numCC = 2)

# Use the train() method to find a CCA mapping between the two training sets.
cca.train([train1, train2])

# Use the validate() method to test how well the CCA mapping generalizes to the test data.
# For each dimension in the test data, correlations between predicted and actual data are computed.
testcorrs = cca.validate([test1, test2])

NameError: name 'train1' is not defined

### Or, Load Example Sales Data

In [3]:
X = pd.read_csv("/Users/don/Documents/scratch-data/sales_x.csv").values
Y = pd.read_csv("/Users/don/Documents/scratch-data/sales_y.csv").values

In [57]:
cca = rcca.CCA(kernelcca = False, reg = 0., numCC = 3)

# Use the train() method to find a CCA mapping between the two training sets.
model = cca.train([X, Y])

Training CCA, kernel = None, regularization = 0.0000, 3 components


In [58]:
model.cancorrs

array([0.92173002, 0.90924216, 0.2944548 ])

In [16]:
model.ws

[array([[-0.00165318, -0.01140646,  0.04058497],
        [-0.00189358,  0.0200943 , -0.01542565],
        [ 0.00260144, -0.01004899, -0.02311065]]),
 array([[ 0.00025634, -0.00938675, -0.01170823],
        [-0.00325668,  0.01343252, -0.02088458],
        [-0.00109514, -0.03808312,  0.02092013],
        [-0.00142048,  0.01010637,  0.0068389 ]])]

# With SKLearn

In [52]:
cca = sklearn_cca(n_components=3)
cca.tol=1e-15

In [53]:
cca.fit(X, Y)
#X_c, Y_c = cca.transform(X, Y)


CCA(copy=True, max_iter=500, n_components=3, scale=True, tol=1e-15)

In [54]:
cca.x_weights_

array([[ 0.7325559 , -0.248067  , -0.63389637],
       [ 0.33909041,  0.94045185,  0.02383294],
       [ 0.59023685, -0.23240714,  0.7730507 ]])

In [55]:
cca.y_weights_

array([[ 0.36757896, -0.50711532,  0.63670697],
       [ 0.13880809,  0.40346077, -0.34840374],
       [ 0.25566713, -0.67936549, -0.68781926],
       [ 0.88331893,  0.34426153, -0.01112401]])

In [56]:
cca.cancorrs

AttributeError: 'CCA' object has no attribute 'cancorrs'