In [1]:
#@title Licensed under the BSD-3 License (the "License"); { display-mode: "form" }
# Copyright 2021 Google LLC.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
#    may be used to endorse or promote products derived from this software without
#    specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

In [2]:
import numpy as np
import pandas as pd
import scipy
from sklearn import decomposition

# PCA

For PCA we require population-level data. We assume `data_matrix` is a Pandas dataframe whose rows correspond to individuals and columns correspond to data points. We simulate this data in this notebook as we don't have access to the real population-level data.

In [3]:
np.random.seed(42)
data_matrix = pd.DataFrame(np.random.normal(size=(10000, 1000)))

In [4]:
def standardize_df(df: pd.DataFrame) -> pd.DataFrame:
  """Standardizes a dataframe (mean=0, var=1)."""
  return (df - df.mean()) / df.std(ddof=0)


def generate_pc(
    data_matrix: pd.DataFrame, num_pc: int, standardize: bool = True
) -> pd.DataFrame:
  """Generates principal components (PCs) of the given data matrix.

  Args:
    data_matrix: The data matrix.
    num_pc: The number of PCs to compute.
    standardize: True to standardize the data matrix before computing PCs.

  Returns:
    A matrix of PCs of the data matrix.
  """
  original_shape = data_matrix.shape
  if standardize:
    data_matrix = standardize_df(data_matrix)
    # Replace NaN values with 0 (this can happen when some col has var=0).
    data_matrix.fillna(0, inplace=True)
    assert data_matrix.shape == original_shape
  pca = decomposition.PCA(num_pc)
  pc_np = pca.fit_transform(data_matrix)
  print('PCA explained variance:', pca.explained_variance_)
  print(
      'PCA explained variance (proportion):',
      pca.explained_variance_ / np.sum(pca.explained_variance_),
  )
  assert pc_np.shape == (original_shape[0], num_pc)
  return pd.DataFrame(pc_np)

In [5]:
pc_dataframe = generate_pc(
    data_matrix,
    num_pc=5)

pc_dataframe.head()

PCA explained variance: [1.63972209 1.63070323 1.62260396 1.61134043 1.590792  ]
PCA explained variance (proportion): [0.20255582 0.20144171 0.2004412  0.19904981 0.19651145]


Unnamed: 0,0,1,2,3,4
0,-2.371899,-0.643403,-0.397528,0.505243,-1.67212
1,-0.389563,-0.316097,-0.054947,-1.539366,-0.998421
2,-0.278895,-1.904815,0.019068,-0.700896,0.973568
3,3.261174,-0.036879,2.362755,-1.733982,0.587677
4,0.172324,0.537071,-0.351281,-1.236673,1.708548


# Spline fitting

In [6]:
def compute_spline_coefficients(
    arr: np.ndarray, knot_position: int
) -> np.ndarray:
  """Gets cubic spline coefficients with a single knot.

  We use a single knot which is padded by 4 (= k + 1) boundaries on each side,
  where k=3 (cubic) is the degree in this case.

  The results are 5 coefficients padded by 4 zeros at the end. We remove the
  last 4 zeros.

  For more details, see https://en.wikipedia.org/wiki/B-spline and
  https://docs.scipy.org/doc/scipy/tutorial/interpolate/smoothing_splines.html#procedural-splrep

  Args:
    arr: The target numpy array for 1D spline fitting.
    knot_position: The position of the single knot.

  Returns:
    A numpy array of 5 cubic spline coefficients.
  """
  num_points = len(arr)
  assert arr.shape == (num_points,)
  assert 0 < knot_position < num_points - 1
  spline = scipy.interpolate.splrep(
      x=np.arange(num_points),
      y=arr,
      k=3,
      task=-1,
      t=[knot_position],
  )
  bspline_coefficients = spline[1]
  assert np.array_equal(bspline_coefficients[5:], np.array([0, 0, 0, 0]))
  return bspline_coefficients[:5]

In [7]:
MAX_NUM_POINTS = 1000
VOLUME_SCALE_FACTOR = 0.001
KNOT_POSITION = 199

`example_curve` variable below should be a 1D numpy array that contains a single curve, such as a spirogram.

Here we use an example curve copied from a UK Biobank example at https://biobank.ctsu.ox.ac.uk/crystal/ukb/examples/eg_spiro_3066.dat

In [8]:
example_curve_txt = '0,0,0,0,3,10,25,54,101,169,258,363,478,589,689,785,879,970,1059,1147,1234,1320,1403,1486,1569,1650,1730,1809,1888,1965,2040,2116,2188,2261,2331,2400,2465,2532,2595,2658,2720,2780,2838,2894,2948,3001,3052,3102,3151,3197,3243,3287,3329,3371,3412,3451,3490,3527,3564,3600,3635,3670,3703,3736,3769,3800,3831,3861,3890,3918,3947,3974,4001,4028,4054,4080,4105,4130,4154,4179,4202,4226,4249,4271,4292,4312,4332,4351,4371,4390,4408,4426,4444,4461,4478,4495,4512,4528,4544,4560,4575,4590,4604,4619,4633,4647,4661,4675,4689,4703,4716,4729,4742,4755,4767,4779,4791,4802,4812,4822,4831,4840,4849,4857,4866,4874,4882,4890,4898,4906,4914,4921,4929,4936,4944,4951,4958,4966,4973,4980,4987,4994,5000,5007,5013,5020,5026,5033,5039,5045,5051,5057,5063,5069,5075,5081,5087,5092,5098,5104,5109,5114,5119,5125,5130,5134,5139,5144,5148,5153,5157,5161,5166,5170,5174,5178,5182,5186,5190,5194,5198,5202,5205,5209,5213,5216,5220,5223,5226,5230,5233,5236,5240,5243,5246,5250,5253,5256,5259,5262,5264,5267,5270,5273,5276,5279,5283,5286,5289,5292,5295,5298,5300,5303,5306,5308,5311,5314,5316,5319,5321,5323,5326,5328,5331,5333,5335,5338,5340,5343,5345,5348,5350,5352,5355,5357,5360,5362,5365,5367,5369,5372,5374,5377,5379,5381,5384,5386,5388,5390,5391,5393,5395,5397,5399,5401,5403,5404,5406,5408,5410,5412,5413,5415,5417,5419,5420,5422,5424,5426,5427,5429,5431,5432,5434,5436,5438,5439,5441,5443,5444,5446,5447,5449,5450,5452,5453,5455,5456,5457,5459,5460,5461,5462,5463,5464,5466,5467,5468,5470,5471,5473,5474,5476,5477,5478,5480,5481,5482,5484,5485,5486,5487,5489,5490,5491,5492,5493,5494,5496,5497,5498,5499,5500,5501,5502,5503,5504,5505,5506,5507,5508,5509,5510,5510,5511,5512,5513,5514,5515,5515,5516,5517,5519,5520,5521,5523,5524,5525,5527,5529,5530,5532,5533,5535,5536,5537,5539,5540,5541,5543,5544,5545,5545,5546,5547,5548,5549,5549,5550,5551,5552,5552,5553,5554,5554,5555,5556,5557,5557,5558,5559,5560,5560,5561,5562,5562,5563,5564,5564,5565,5565,5566,5567,5567,5568,5569,5570,5571,5572,5573,5574,5576,5577,5578,5579,5580,5582,5583,5584,5585,5587,5588,5589,5590,5591,5591,5592,5593,5594,5595,5596,5596,5597,5598,5598,5599,5600,5601,5601,5602,5603,5603,5604,5605,5606,5606,5607,5608,5608,5609,5609,5609,5610,5611,5611,5612,5613,5613,5614,5615,5616,5616,5617,5618,5618,5619,5620,5621,5622,5623,5624,5624,5625,5626,5626,5627,5628,5628,5629,5629,5630,5630,5631,5632,5632,5633,5633,5634,5635,5635,5636,5637,5637,5638,5639,5639,5640,5641,5642,5642,5643,5644,5645,5645,5646,5647,5647,5648,5649,5649,5650,5651,5651,5652,5652,5653,5654,5654,5655,5656,5656,5657,5658,5658,5659,5660,5660,5661,5661,5662,5663,5663,5664,5664,5665,5665,5666,5666,5667,5667,5668,5668,5669,5669,5670,5670,5670,5671,5671,5672,5672,5672,5673,5673,5673,5673,5674,5674,5674,5675,5676,5676,5677,5677,5678,5678,5679,5679,5680,5681,5681,5682,5683,5683,5684,5684,5685,5686,5686,5687,5687,5688,5688,5688,5689,5689,5690,5690,5690,5691,5691,5692,5692,5692,5693,5693,5694,5694,5694,5695,5695,5695,5696,5696,5696,5696,5696,5696,5697,5697,5698,5698,5698,5699,5699,5699,5699,5700,5700,5700,5701,5701,5702,5702,5703,5703,5704,5704,5705,5705,5706,5706,5707,5707,5708,5709,5709,5710,5710,5711,5711,5712,5712,5712,5713,5713,5713,5714,5714,5714,5715,5715,5716,5716,5716,5717,5717,5717,5718,5718,5719,5719,5720,5720,5721,5721,5721,5722,5722,5722,5723,5723,5723,5723,5724,5724,5724,5725,5725,5725,5726,5726,5726,5727,5727,5728,5728,5729,5729,5729,5730,5730,5731,5732,5732,5733,5733,5734,5735,5735,5735,5736,5736,5736,5737,5737,5737,5738,5738,5738,5739,5739,5739,5739,5740,5740,5740,5741,5741,5741,5741,5741,5741,5742,5742,5742,5742,5742,5742,5742,5742,5742,5742,5741,5741,5740,5740,5740,5740,5739,5739,5739,5739,5739,5739,5740,5740,5740,5741,5742,5742,5743,5743,5744,5745,5745,5745,5746,5746,5747,5747,5748,5748,5748,5748,5748,5748,5749,5749,5749,5749,5749,5749,5749,5750,5750,5750,5750,5750,5751,5751,5751,5752,5752,5753,5753,5754,5754,5754,5755,5755,5756,5756,5756,5757,5757,5757,5758,5758,5758,5758,5759,5759,5759,5759,5759,5759,5759,5759,5759,5760,5760,5760,5761,5761,5761,5762,5762,5763,5763,5763,5764,5764,5764,5765,5765,5766,5766,5766,5767,5767,5767,5767,5767,5768,5768,5768,5768,5769,5769,5769,5770,5770,5770,5770,5770,5771,5771,5771,5771,5771,5772,5772,5772,5773,5773,5773,5774,5774,5774,5775,5775,5775,5776,5776,5777,5777,5777,5778,5778,5778,5778,5779,5779,5779,5779,5779,5779,5779,5779,5779,5780,5780,5780,5780,5780,5780,5780,5780,5780,5780,5780,5780,5780,5780,5779,5779,5779,5779,5779,5779,5779,5779,5779,5779,5779,5779,5779,5780,5780,5780,5780,5781,5781,5781,5782,5782,5782,5783,5783,5783,5784,5784,5784,5785,5785,5785,5785,5785,5786,5786,5786,5786,5786,5786,5786,5787,5787,5787,5788,5788,5788,5789,5789,5789,5790,5790,5790,5791,5791,5792,5792,5792,5793,5793,5793,5794,5794,5795,5795,5795,5796,5796,5796,5797,5797,5798,5798,5798,5798,5798,5799,5799,5799,5799,5800,5800,5800,5801,5801,5801,5801,5802,5802,5802,5802,5803,5803,5803,5803,5803,5803,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5804,5803,5804,5804,5804,5804,5804,5805,5805,5805,5805,5806,5806,5806,5806,5806,5806,5806,5806,5806,5806,5807,5807,5807,5807,5808,5808,5809,5809,5809,5810,5810,5810,5811,5811,5812,5812,5813,5813,5813,5814,5814,5815,5815,5815,5815,5816,5816,5816,5816,5817,5817,5817,5817,5817,5817,5817,5818,5818,5818,5818,5818,5818,5818,5819,5819,5819,5819,5819,5819,5819,5819,5819,5819,5820,5820,5820,5820,5820,5820,5820,5820,5820,5819,5820,5820,5820,5820,5820,5820,5820,5820,5821,5821,5821,5821,5821,5821,5821,5821,5821,5821,5821,5821,5821,5821,5821,5821,5820,5820,5820,5819,5819,5818,5818,5818,5817,5817,5817,5816,5816,5816,5816,5815,5815,5815,5816,5816,5816,5817,5817,5818,5819,5819,5820,5821,5822,5823,5823,5824,5825,5826,5827,5827,5828,5828,5829,5829,5829,5830,5830,5831,5831,5831,5831,5831,5832,5831,5832,5832,5832,5832,5832,5832,5832,5833,5833,5833,5833,5833,5833,5833,5834,5834,5834,5834,5834,5835,5835,5835,5835,5835,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5836,5835,5835,5835,5835,5834,5834,5834,5834,5833,5833,5833,5833,5833,5832,5832,5832,5832,5832,5832,5832,5832,5831'
example_curve = (
    np.array(example_curve_txt.split(',')[:MAX_NUM_POINTS], dtype=np.float32)
    * VOLUME_SCALE_FACTOR
)

The following code generates the 5 spline coefficients the this curve.

In [9]:
print(
    compute_spline_coefficients(arr=example_curve, knot_position=KNOT_POSITION)
)

[-0.08101105  5.14773236  5.63775992  5.81692895  5.78074777]
